In [1]:
import json
import torch
import numpy as np
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [2]:
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Upper bound on the number of tokens kept per document/sentence sample.
max_length = 512

# Name of the pretrained checkpoint whose vocabulary the tokenizer is loaded from.
model_name = "bert-base-uncased"

# Fast BERT tokenizer for that checkpoint, lower-casing its input.
tokenizer = BertTokenizerFast.from_pretrained(
    model_name,
    do_lower_case=True,
)

In [4]:
# Directory of the saved checkpoint to run inference with.
model_path = "results/checkpoint-1200"

# Restore the two-label sequence classifier from that checkpoint.
model = BertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
)

# Wrap the model in a Trainer with default arguments; it is only used for prediction.
test_trainer = Trainer(model=model)

In [5]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to an indexable sequence of
            per-sample values. It must contain the key ``"input_ids"``,
            which determines the dataset length.
        labels: Optional indexable sequence of per-sample labels. ``None``
            or an empty sequence means the dataset is unlabeled.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, one entry per encoding key.

        A ``"labels"`` entry is added only when labels were supplied.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for presence explicitly: truth-testing a multi-element numpy
        # array or tensor raises ValueError. The length check keeps an empty
        # sequence meaning "no labels".
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples, i.e. the number of ``input_ids`` rows."""
        return len(self.encodings["input_ids"])

In [6]:
# Resources for tweet text cleaning: punctuation table, stop words, lemmatizer.
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK packages used below (skipped by NLTK when already up to date).
for corpus_name in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

# Translation table that deletes every character in string.punctuation.
table = str.maketrans(dict.fromkeys(string.punctuation))

# English stop words to drop from each tweet.
stop = stopwords.words('english')

# WordNet-based lemmatizer applied to every remaining word.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/gabe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gabe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/gabe/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
def _clean_tweet_text(text, stop_words):
    """Normalise one raw tweet string into the text that is fed to the tokenizer.

    Steps, in order: lower-case and drop stop words, strip punctuation via
    ``table``, drop digit characters, then lemmatize each remaining word.

    Args:
        text: Raw tweet text.
        stop_words: Container of words to drop (membership-tested).

    Returns:
        The cleaned text with words separated by single spaces.
    """
    # NOTE(review): stop words are removed before punctuation, so a token like
    # "the," is kept. The order is left as found; presumably it mirrors the
    # preprocessing used at training time — confirm before changing.
    text = ' '.join(word for word in text.lower().split() if word not in stop_words)
    text = text.translate(table)  # remove punctuation
    text = ''.join(ch for ch in text if not ch.isdigit())  # remove numbers
    # split() + join collapses any whitespace runs left by the removals above.
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())  # lemmatize


# Set membership is O(1); `stop` itself is left untouched for other cells.
stop_words = set(stop)

# Collected raw tweet objects whose predicted class is 1.
# NOTE(review): class 1 is assumed to be the "bad" label — confirm against training.
maybe_bad_tweets = {"tweets": []}

for i in range(1000):
    timeline_path = "../../data/followers_timeline_{}.json".format(i)
    try:
        followers_timeline_file = open(timeline_path, 'r')
    except OSError:
        # Stop at the first file that cannot be opened (files are read in index order).
        print("File not found")
        break
    print("Working on file {}".format(i))
    # Close the handle as soon as the JSON has been parsed.
    with followers_timeline_file:
        followers_timeline = json.load(followers_timeline_file)

    timelines_by_user = followers_timeline["followers_timeline"]
    for user in timelines_by_user:
        for follower in timelines_by_user[user]:
            # Followers whose entry has no usable "data" list are skipped.
            try:
                tweet_timeline = timelines_by_user[user][follower]["data"]
            except (KeyError, TypeError, IndexError):
                continue
            if not tweet_timeline:
                # Nothing to classify; the tokenizer cannot take an empty batch.
                continue

            try:
                tweet_text_list = [
                    _clean_tweet_text(tweet['text'], stop_words)
                    for tweet in tweet_timeline
                ]
                X_test_tokenized = tokenizer(
                    tweet_text_list,
                    padding=True,
                    truncation=True,
                    max_length=max_length,  # shared setting from the tokenizer cell
                )
                test_dataset = Dataset(X_test_tokenized)
                raw_pred = test_trainer.predict(test_dataset).predictions
            except Exception as err:
                # Best effort: report and move on, but let KeyboardInterrupt /
                # SystemExit propagate so the run can still be stopped.
                print("Skipping follower {}: {!r}".format(follower, err))
                continue

            # Highest-scoring class per tweet.
            y_pred = np.argmax(raw_pred, axis=1)
            for tweet, tweet_text, tweet_pred in zip(tweet_timeline, tweet_text_list, y_pred):
                if tweet_pred == 1:
                    print(tweet_text)
                    maybe_bad_tweets["tweets"].append(tweet)

In [None]:
# Persist the collected tweets as JSON in the current working directory.
with open("maybe_bad_tweets.json", 'w') as outfile:
    outfile.write(json.dumps(maybe_bad_tweets))