In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import expit
import json
from tqdm import tqdm
import torch

## Load the model

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_path = "cardiffnlp/tweet-topic-21-multi"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the classifier, then move its weights onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model = model.to(device)

# Lookup from output index to label name, taken from the model config.
class_mapping = model.config.id2label

In [4]:
# Read the raw tweet dump. The context manager guarantees the file handle is
# closed even if JSON parsing fails part-way through.
with open('../../autodl-fs/twitter_huge.json','rb') as f:
    twitter_huge = json.load(f)

In [5]:
# Show the "total_rows" value recorded in the loaded dump.
print(twitter_huge["total_rows"])

734185


In [6]:
# Inspect the first row to see which fields each record carries.
twitter_huge["rows"][0]

{'id': '00466228e947996e9feef13ca1000903',
 'key': '00466228e947996e9feef13ca1000903',
 'value': {'rev': '1-2c989e80f6f20b0fb9c788f18955a5ab'},
 'doc': {'_id': '00466228e947996e9feef13ca1000903',
  '_rev': '1-2c989e80f6f20b0fb9c788f18955a5ab',
  'Author_Id': '35705319',
  'Sentiment_Score': 0.0,
  'Text': "@fayemikah Oestradiol! You're not a Yank!",
  'Language': 'en',
  'Coordinate': [145.053135344, -37.972566514250005],
  'Suburb': 'Cheltenham'}}

## Predict the topic of each tweet

In [13]:
# Classify every tweet with the topic model and keep those that receive at
# least one topic. Each kept tweet is stored as a flat dict in
# `processed_tweets`.

# A label is assigned when its sigmoid score reaches this value.
TOPIC_THRESHOLD = 0.5

processed_tweets = []
for twitter_data in tqdm(twitter_huge["rows"]):
    # Skip rows that have no usable document or lack any required field.
    # Only the lookup errors are caught so that real failures (and Ctrl-C)
    # are not silently swallowed.
    try:
        doc = twitter_data["doc"]
        author_id = doc["Author_Id"]
        tweet_text = doc["Text"]
        sentiment_score = doc["Sentiment_Score"]
        suburb = doc["Suburb"]
        lang = doc["Language"]
    except (KeyError, TypeError):
        continue

    # `truncation=True` is required for `max_length` to take effect;
    # without it over-long inputs are passed to the model untruncated.
    tokens = tokenizer(
        tweet_text,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    ).to(device)

    # Inference only: disable autograd so no graph is built per tweet.
    with torch.no_grad():
        logits = model(**tokens).logits[0]

    # Independent sigmoid per label; several labels may pass the threshold.
    scores = expit(logits.cpu().numpy())
    topics = [
        class_mapping[i]
        for i, score in enumerate(scores)
        if score >= TOPIC_THRESHOLD
    ]

    # Tweets without any predicted topic are dropped.
    if not topics:
        continue

    processed_tweets.append({
        "Author_ID" : author_id,
        "Text" : tweet_text,
        "Sentiment_Score" : sentiment_score,
        "Topics" : topics,
        "Suburb" : suburb,
        "Language" : lang,
    })


100%|██████████| 734185/734185 [1:55:31<00:00, 105.91it/s] 


In [14]:
# Preview the first ten processed records.
processed_tweets[:10]

[{'Author_ID': '35705319',
  'Text': "@fayemikah Oestradiol! You're not a Yank!",
  'Sentiment_Score': 0.0,
  'Topics': ['news_&_social_concern'],
  'Suburb': 'Cheltenham',
  'Language': 'en'},
 {'Author_ID': '1317491197',
  'Text': 'When in Docklands 🤟🏻 https://t.co/9KlSxQcVgi',
  'Sentiment_Score': 0.0,
  'Topics': ['travel_&_adventure'],
  'Suburb': 'Docklands',
  'Language': 'en'},
 {'Author_ID': '18147028',
  'Text': 'Atheldene Drive , St Albans - Road Closed, A single car has collided with a power pole.  Police have… https://t.co/YnDkEnX91D #victraffic',
  'Sentiment_Score': 0.0,
  'Topics': ['news_&_social_concern'],
  'Suburb': 'Cheltenham',
  'Language': 'en'},
 {'Author_ID': '35705319',
  'Text': "@Goose_xt @fayemikah MFer thinks she's Keffals…",
  'Sentiment_Score': 0.0,
  'Topics': ['celebrity_&_pop_culture'],
  'Suburb': 'Cheltenham',
  'Language': 'en'},
 {'Author_ID': '15510386',
  'Text': "@MoarToast That's a great pic btw. 😍",
  'Sentiment_Score': 0.8571428571428571,
 

## Save the processed Twitter data to a JSON file

In [15]:
# Write the list of processed tweet dicts to disk as a single JSON document.
with open("processed_twitter2.json", "w") as out_file:
    json.dump(processed_tweets, out_file)

In [16]:
# Read the saved file back to confirm it round-trips. The context manager
# closes the handle even if parsing raises.
with open('processed_twitter2.json','rb') as f:
    mydata = json.load(f)

In [19]:
# Spot-check one record from the reloaded data.
# NOTE(review): index 165161 looks arbitrary — confirm it has no special meaning.
mydata[165161]

{'Author_ID': '2298704174',
 'Text': '@smudge_green @dezmac_au @AlboMP Come @ me smudge, you have a big mouth but no doubt very little brain 🥥🥥🥥🥥',
 'Sentiment_Score': -0.047619047619047616,
 'Topics': ['news_&_social_concern'],
 'Suburb': 'Cheltenham',
 'Language': 'en'}