# Imports

In [1]:
import sys

# Record which interpreter and module search path produced the outputs below.
for runtime_detail in (sys.version, sys.path):
    print(runtime_detail)

3.10.2 (main, Jan 15 2022, 19:56:27) [GCC 11.1.0]
['/home/zander/everything/projects/ProjectX-2021/tweet-legitmacy-classifier', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/home/zander/.local/lib/python3.10/site-packages', '/usr/lib/python3.10/site-packages']


In [None]:
import numpy as np
import pandas as pd
import regex as re
import string

import torch
import torch.nn as nn

from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)

import random

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)

import matplotlib.pyplot as plt

import warnings
# NOTE(review): this installs a blanket 'ignore' filter, so every warning category is
# hidden for the rest of the session, including ones that would point at real problems
# in the cells below (e.g. assignments made on a slice of another DataFrame).
warnings.filterwarnings('ignore')

### Constants and helpers

In [None]:
# Class names; the position in this list is the class index (forward() looks the
# name up with the argmax of the averaged probabilities).
target_names = ['legitimate','misinformation','irrelevant']
# Truncation length handed to every tokenizer call in BaggedTweetClassifier.forward.
max_length = 96
# English stop-word list from NLTK, consumed by clean_text.
stop = stopwords.words('english')

In [None]:
def clean_text(row, remove_stopwords=True):
    """Normalise one tweet before it is tokenized.

    Steps, in order: lower-case, drop URLs and @mentions, drop non-ASCII
    characters, drop punctuation, drop English stop words, collapse
    whitespace, lemmatize each token.

    Args:
        row: The raw tweet text (a single ``str``).
        remove_stopwords: When true (default) the words in the module-level
            ``stop`` list are removed. The previous implementation passed the
            regex pattern to ``str.replace``, which only matches literal text,
            so stop words were never actually removed. Pass ``False`` to get
            that old behaviour back, e.g. if the checkpoints were fine-tuned
            on text that still contained stop words (TODO confirm against the
            training notebook).

    Returns:
        The cleaned text: lemmatized tokens joined by single spaces, without
        leading or trailing whitespace.
    """
    # Lower case
    row = row.lower()

    # Remove URLs (raw string; the dot after "www" is escaped so it only
    # matches a literal ".")
    row = re.sub(r'http\S+|www\.\S+', '', row)

    # Remove @mentions; \w also covers "_", so handles such as @some_user are
    # removed completely instead of leaving the part after the underscore.
    row = re.sub(r'@\w+', '', row)

    # Remove non-standard characters
    row = row.encode("ascii", "ignore").decode()

    # Remove punctuation
    row = row.translate(str.maketrans('', '', string.punctuation))

    # Remove stop words. This has to be a regex substitution: str.replace
    # would look for the pattern text itself and never find it.
    if remove_stopwords and stop:
        pat = r'\b(?:{})\b'.format('|'.join(re.escape(word) for word in stop))
        row = re.sub(pat, '', row)

    # Remove extraneous whitespace
    row = re.sub(r'\s+', ' ', row).strip()

    # Lemmatization
    wordnet_lemmatizer = WordNetLemmatizer()
    return " ".join(wordnet_lemmatizer.lemmatize(w) for w in nltk.word_tokenize(row))

# Model

In [None]:
def _load_checkpoint(path, device):
    """Load the tokenizer and the sequence classifier stored under ``path``.

    Both are read from local files only; the classifier is built with one
    output per entry of ``target_names`` and moved to ``device``.
    """
    tokenizer = AutoTokenizer.from_pretrained(path, local_files_only=True)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        path, num_labels=len(target_names), local_files_only=True
    ).to(device)
    return tokenizer, classifier


# Use the GPU when one is present; otherwise fall back to the CPU instead of
# failing on the hard-coded "cuda" device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

PATH1 = 'models/first-augmented-miscov19-covid-twitter-bert-v2'
tokenizer1, model1 = _load_checkpoint(PATH1, DEVICE)

PATH2 = 'models/second-augmented-miscov19-covid-twitter-bert-v2'
tokenizer2, model2 = _load_checkpoint(PATH2, DEVICE)

PATH3 = 'models/third-augmented-miscov19-covid-twitter-bert-v2'
tokenizer3, model3 = _load_checkpoint(PATH3, DEVICE)

PATH4 = 'models/fourth-augmented-miscov19-covid-twitter-bert-v2'
tokenizer4, model4 = _load_checkpoint(PATH4, DEVICE)


In [None]:
class BaggedTweetClassifier(nn.Module):
    """Ensemble that averages the class probabilities of four classifiers.

    The tokenizer/model pairs are taken from the module-level
    ``tokenizer1..4`` / ``model1..4`` objects. Each pair scores the same
    cleaned text, and the prediction is the argmax of the mean probability.
    """

    def __init__(self):
        super(BaggedTweetClassifier, self).__init__()
        self.tok1 = tokenizer1
        self.bert1 = model1

        self.tok2 = tokenizer2
        self.bert2 = model2

        self.tok3 = tokenizer3
        self.bert3 = model3

        self.tok4 = tokenizer4
        self.bert4 = model4

    def _ensemble(self):
        """Return the (tokenizer, model) pairs in ensemble order."""
        return (
            (self.tok1, self.bert1),
            (self.tok2, self.bert2),
            (self.tok3, self.bert3),
            (self.tok4, self.bert4),
        )

    def forward(self, x, debug=False):
        """Classify one tweet.

        Args:
            x: Raw tweet text (a single string); it is passed through
                ``clean_text`` before tokenization.
            debug: When true, print a table with every model's probabilities
                and the average, and return the label name.

        Returns:
            The argmax of the averaged probabilities (a 0-dim tensor on the
            models' device), or the matching entry of ``target_names`` when
            ``debug`` is true.
        """
        x = clean_text(x)

        member_probs = []
        # Pure inference: no_grad avoids building an autograd graph for every
        # tweet, which only costs memory and time here.
        with torch.no_grad():
            for tok, bert in self._ensemble():
                # Send the encoded batch to wherever this model lives rather
                # than to a hard-coded "cuda".
                encoded = tok(
                    x,
                    padding=True,
                    truncation=True,
                    max_length=max_length,
                    return_tensors="pt",
                ).to(bert.device)
                # out[0] holds the logits; softmax over dim 1 gives class probabilities.
                member_probs.append(bert(**encoded)[0].softmax(1))

        avg_prob = sum(member_probs) / float(len(member_probs))

        if debug:
            rule = '-------------------------------------------------------------'
            print(rule)
            print('| label        | model1 | model2 | model3 | model4 | avg_pr |')
            for class_idx, label in enumerate(('legitimate', 'misinfo', 'irrelevant')):
                # Row 0 is the only row: forward() scores a single tweet.
                cells = ' | '.join(
                    f'{probs[0][class_idx].item():.4f}'
                    for probs in (*member_probs, avg_prob)
                )
                print(f'| {label:<12} | {cells} |')
            print(rule)
            return target_names[avg_prob.argmax()]

        return avg_prob.argmax()
    

In [None]:
# Build the ensemble; the constructor only picks up the tokenizer1..4 / model1..4
# objects created in the loading cell above, so that cell must have been run first.
model = BaggedTweetClassifier()

### Accuracy and Confusion Matrix on MisCov19 Dataset

In [None]:
raw_df = pd.read_csv('miscov19_p.csv')
# dropna() returns a new frame, so its result has to be kept; calling it as a
# bare statement left rows with missing text/label in place (missing text then
# became the string 'nan'). .copy() detaches the selection from raw_df so the
# column assignments below act on df itself.
df = raw_df[['text', 'label']].dropna().copy()
df['text'] = df['text'].astype(str)
df.tail()

In [None]:
# Clean the tweet text column, overwriting the raw text.
# NOTE(review): BaggedTweetClassifier.forward runs clean_text on its input as well,
# so every row is cleaned a second time when predictions are computed, and
# re-running this cell cleans the column yet again.
df['text'] = df['text'].apply(clean_text)

In [None]:
# Score the tweets one at a time; each 'preds' entry is what forward() returns,
# i.e. the argmax of the averaged class probabilities.
df['preds'] = df['text'].apply(model)

In [None]:
def to_cpu(x):
    """Return ``x`` in host (CPU) memory, using its own ``.cpu()`` method."""
    on_host = x.cpu()
    return on_host

In [None]:
# Bring every prediction back to host memory before it is handed to scikit-learn.
df['preds'] = df['preds'].apply(to_cpu)

In [None]:
# Ground-truth labels and ensemble predictions as parallel Python lists.
y_true, y_pred = df['label'].tolist(), df['preds'].tolist()

In [None]:
# Share of tweets whose ensemble prediction equals the dataset label.
ensemble_accuracy = accuracy_score(y_true, y_pred)
print(f'Ensemble Accuracy: {ensemble_accuracy}')

In [None]:
# Pin the label order to the class indices so the matrix always has one row and
# one column per entry of target_names, even when a class never occurs in either
# list; otherwise the matrix shrinks and no longer lines up with display_labels.
# Assumes the CSV labels use the same 0..len(target_names)-1 encoding that
# forward() produces via argmax — TODO confirm.
confusion_mat = confusion_matrix(y_true, y_pred, labels=list(range(len(target_names))))
disp = ConfusionMatrixDisplay(confusion_mat, display_labels=target_names)
disp.plot(cmap='inferno_r')
plt.show()

### Accuracy and Confusion Matrix on Augmented MisCov19 Dataset

In [None]:
raw_df = pd.read_csv('combined_data.csv')
# dropna() returns a new frame, so its result has to be kept; calling it as a
# bare statement left rows with missing text/label in place (missing text then
# became the string 'nan'). .copy() detaches the selection from raw_df so the
# column assignments below act on df itself.
df = raw_df[['text', 'label']].dropna().copy()
df['text'] = df['text'].astype(str)
df.tail()

In [None]:
# Clean the tweet text column, overwriting the raw text.
# NOTE(review): BaggedTweetClassifier.forward runs clean_text on its input as well,
# so every row is cleaned a second time when predictions are computed, and
# re-running this cell cleans the column yet again.
df['text'] = df['text'].apply(clean_text)

In [None]:
# Score the tweets one at a time; each 'preds' entry is what forward() returns,
# i.e. the argmax of the averaged class probabilities.
df['preds'] = df['text'].apply(model)

In [None]:
# Bring every prediction back to host memory before it is handed to scikit-learn.
df['preds'] = df['preds'].apply(to_cpu)

In [None]:
# Ground-truth labels and ensemble predictions for the augmented set, as parallel lists.
y_true2, y_pred2 = df['label'].tolist(), df['preds'].tolist()

In [None]:
# Share of augmented-set tweets whose ensemble prediction equals the dataset label.
ensemble_accuracy_augmented = accuracy_score(y_true2, y_pred2)
print(f'Ensemble Accuracy (Augmented): {ensemble_accuracy_augmented}')

In [None]:
# Pin the label order to the class indices so the matrix always has one row and
# one column per entry of target_names, even when a class never occurs in either
# list; otherwise the matrix shrinks and no longer lines up with display_labels.
# Assumes the CSV labels use the same 0..len(target_names)-1 encoding that
# forward() produces via argmax — TODO confirm.
confusion_mat2 = confusion_matrix(y_true2, y_pred2, labels=list(range(len(target_names))))
disp2 = ConfusionMatrixDisplay(confusion_mat2, display_labels=target_names)
disp2.plot(cmap='inferno_r')
plt.show()

### Cherry-picked examples

In [None]:
# Example #1
# Sentence about a restaurant, with no COVID content. With debug=True the model
# prints the per-model probability table and returns the label name rather than
# the class index.
text = "DP Dough is the best restaurant in College Town"
print(model(text, debug=True))

In [None]:
# Example #2
# Short vaccine/autism claim; Example #7 repeats it with a "Biden says" prefix.
text2 = "Vaccines cause autism"
print(model(text2, debug=True))

In [None]:
# Example #3
# Efficacy claim with a trailing hashtag; clean_text strips '%', '!' and '#' as
# punctuation, so only the words and digits reach the tokenizers.
text3 = "Vaccines prevent over 90% of Covid infections! #Science"
print(model(text3, debug=True))

In [None]:
# Example #4
# Short forward-looking statement about vaccines.
text4 = "Vaccines will end the pandemic"
print(model(text4, debug=True))

In [None]:
# Example #5
# All-lowercase input with a personal name ('kaitlyn') as the subject —
# presumably a probe for how the ensemble handles an unfamiliar word (TODO confirm intent).
text5 = "scientists say kaitlyn will prevent covid"
print(model(text5, debug=True))

In [None]:
# Example #6
# Same statement as Example #3 with a "Biden says" prefix and without the hashtag.
text6 = "Biden says vaccines prevent over 90% of Covid infections!"
print(model(text6, debug=True))

In [None]:
# Example #7
# Same statement as Example #2 with a "Biden says" prefix.
text7 = "Biden says vaccines cause autism!"
print(model(text7, debug=True))

In [None]:
# Example #8
# Longer statistical sentence; the digits survive clean_text, while ',' and '%'
# are removed as punctuation.
text8 = "In Portugal, with 89% of the total population fully vaccinated, almost 90% of UCI Covid patients are unvaccinated"
print(model(text8, debug=True))

In [None]:
# Example #9
# Statement about a named person having covid rather than about vaccines or treatments.
text9 = "President Trump has covid"
print(model(text9, debug=True))

In [None]:
# Example #10
# Negated counterpart of Example #11. The apostrophe in "don't" is removed by
# clean_text's punctuation step before tokenization.
text10 = "Vaccines don't stop you from getting covid."
print(model(text10, debug=True))

In [None]:
# Example #11
# Affirmative counterpart of Example #10 (subject is "Vaccinations" instead of "Vaccines").
text11 = "Vaccinations stop you from getting covid."
print(model(text11, debug=True))

In [None]:
# Example #12
# First of three "<X> are effective" sentences (Examples #12-#14); subject: masks.
text12 = "Masks are effective"
print(model(text12, debug=True))

In [None]:
# Example #13
# Second of three "<X> are effective" sentences (Examples #12-#14); subject: vaccines.
text13 = "Vaccines are effective"
print(model(text13, debug=True))

In [None]:
# Example #14
# Third of three "<X> are effective" sentences (Examples #12-#14); subject: essential oils.
text14 = "Essential oils are effective"
print(model(text14, debug=True))

In [None]:
# Example #15
# Multi-line input: the triple-quoted literal keeps its line breaks and the
# leading indentation of the continuation lines, all of which is passed to the model.
text15 = '''The Omicron variant of the coronavirus is causing Covid-19 cases to spike, 
            with an average of more than 747,000 new cases a day, according to the latest 
            numbers from Johns Hopkins University'''
print(model(text15, debug=True))

In [None]:
# Example #16
# Multi-line input about an individual's health risk; as in Example #15 the literal
# keeps its line break and the indentation of the continuation line.
text16 = '''Kim Kramer has heart failure, which puts her at higher risk of having a severe case 
            of Covid-19.'''
print(model(text16, debug=True))