In [1]:
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from torch import nn
import re
import joblib
import pickle

In [2]:
# Alternative configuration (Joe Biden / Steve Wozniak / Katy Perry dataset):
# uncomment these three lines and comment out the active block below to use it.
# pathToDataset = "./Datasets/JoeBiden_stevewoz_katyperry_balanced_dataset.csv"
# pathToModel = "app/models/KP_JB_SW_best_bert.pt"
# name = "KP_JB_SW_best_bert"

# Active configuration: the CSV that is read into `df`, the saved state_dict that is
# loaded into the model, and the base name of the pickle file written by joblib.dump.
pathToDataset = "./Datasets/BarackObama_elonmusk_Oprah_dataset.csv"
pathToModel = "app/models/EM_BO_O_bert_best.pt"
name = "EM_BO_O_bert_best"


In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
device  # last expression of the cell: display the selected device

device(type='cuda')

In [4]:
# Load the tweet dataset; the cells below read its 'User' and 'Tweet' columns.
df = pd.read_csv(pathToDataset)
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,User,Tweet
0,11000,11000,elonmusk,@ATeslaInICEland @Tesla Thanks for letting me ...
1,10349,10349,Oprah,@maryjblige great to see you sis Mary' even th...
2,483,483,BarackObama,Republicans love to say right before an electi...
3,5180,5180,elonmusk,Beautiful fireworks in LA tonight
4,9078,9078,BarackObama,"President Obama: “Madam Chairwoman, delegates,..."


In [5]:
# Distinct author handles found in the 'User' column.
authors = df['User'].unique()
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# Map each author handle to an integer class id (its position in `authors`).
# NOTE(review): the ids depend on the order of `authors`; presumably this must match
# the mapping used when the checkpoint was trained — confirm against the training code.
labels = {author: class_id for class_id, author in enumerate(authors)}

# Persist the tokenizer; the context manager guarantees the file is flushed and closed
# even if pickling fails (the bare open(...) call leaked the handle).
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

batch_size = 8
labels

{'elonmusk': 0, 'Oprah': 1, 'BarackObama': 2}

In [6]:
class BertClassifier(nn.Module):
    """Pretrained 'bert-base-cased' encoder topped with dropout, a linear layer and a ReLU.

    The linear layer maps the 768-wide pooled BERT output to one score per entry of the
    module-level ``authors`` array.
    """

    def __init__(self, dropout=0.5):
        """Build the sub-modules.

        Args:
            dropout: Probability passed to ``nn.Dropout``.
        """
        super().__init__()

        # Attribute names are part of the state_dict keys, so they are kept as-is.
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, len(authors))
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return the ReLU-activated class scores.

        Args:
            input_id: Token ids, forwarded to BERT as ``input_ids``.
            mask: Attention mask, forwarded to BERT as ``attention_mask``.

        Returns:
            Output of ``relu(linear(dropout(pooled_output)))``.
        """
        # With return_dict=False BERT returns a tuple; the pooled output is its second item.
        encoder_outputs = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        pooled = encoder_outputs[1]

        # NOTE(review): the ReLU clamps negative scores to zero before any argmax is taken.
        return self.relu(self.linear(self.dropout(pooled)))

In [7]:
def sample(tweet):
    """Predict the author of a single tweet with the module-level ``model``.

    URLs (anything matching ``http\\S+``) are removed, the text is tokenised with the
    module-level ``tokenizer`` (padded/truncated to 512 tokens) and run through the model.

    Args:
        tweet: Raw tweet text.

    Returns:
        Tuple ``(class_id, author)``: the predicted integer class and the matching key of
        the module-level ``labels`` mapping.
    """
    encoded = tokenizer(re.sub(r"http\S+", '', tweet), padding='max_length', max_length=512,
                        truncation=True, return_tensors="pt")

    # Put the inputs on whatever device the model currently lives on; evaluate() moves the
    # model to the GPU when CUDA is available, after which CPU inputs would raise.
    model_device = next(model.parameters()).device
    mask = encoded['attention_mask'].to(model_device)
    input_ids = encoded['input_ids'].to(model_device)

    # Inference only: no autograd bookkeeping, and a single forward pass so the returned
    # id and name always come from the same prediction.
    with torch.no_grad():
        output = model(input_ids, mask).argmax(dim=1).item()

    # Reverse lookup: class id -> author handle.
    id_to_author = {class_id: author for author, class_id in labels.items()}
    return output, id_to_author[output]

In [8]:
model = BertClassifier()
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location="cpu" lets a checkpoint that was saved from a GPU run load on CPU-only
# machines as well; the freshly constructed model has not been moved off the CPU yet.
model.load_state_dict(torch.load(pathToModel, map_location="cpu"))
# Switch to inference mode (disables dropout); as the last expression this also displays the model.
model.eval()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [9]:
class Dataset(torch.utils.data.Dataset):
    """Pairs each tokenised tweet with the integer class id of its author.

    Relies on the module-level ``labels`` mapping and ``tokenizer``.
    """

    def __init__(self, df):
        """Encode every row of ``df`` up front.

        Args:
            df: Frame with a 'User' column (keys of ``labels``) and a 'Tweet' column (text).
        """
        class_ids = []
        for author in df['User']:
            class_ids.append(labels[author])

        encoded_tweets = []
        for tweet in df['Tweet']:
            # Drop URLs before tokenising, then pad/truncate to 512 tokens.
            without_urls = re.sub(r"http\S+", '', tweet)
            encoded_tweets.append(
                tokenizer(without_urls, padding='max_length', max_length=512,
                          truncation=True, return_tensors="pt")
            )

        self.labels = class_ids
        self.texts = encoded_tweets

    def classes(self):
        """Return the list of class ids, one per sample."""
        return self.labels

    def __len__(self):
        """Number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) at ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the tokenised text(s) stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(tokenised_text, label)`` for position ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [10]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def evaluate(model, test_data):
    """Score ``model`` on ``test_data`` and print the overall accuracy.

    The frame is wrapped in ``Dataset``, batched with the module-level ``batch_size`` and
    run through the model without gradient tracking. The model is moved to the GPU when
    CUDA is available. It is put in eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns one score
            per class for each sample.
        test_data: DataFrame with 'User' and 'Tweet' columns.

    Returns:
        Tuple ``(cm, cr)`` — the confusion matrix first, the classification report second.
    """
    # Deliberately not called `labels`, so the module-level label mapping is not shadowed.
    true_labels = []
    predictions = []

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.to(device)

    # Dropout must be off while scoring; remember the current mode so it can be restored.
    was_training = model.training
    model.eval()

    total_acc_test = 0
    try:
        with torch.no_grad():
            for test_input, test_label in test_dataloader:
                test_label = test_label.to(device)
                mask = test_input['attention_mask'].to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                # Compute the predicted classes once per batch and reuse them below.
                predicted = model(input_id, mask).argmax(dim=1)

                # One device-to-host copy per batch instead of one per element.
                true_labels.extend(test_label.cpu().tolist())
                predictions.extend(predicted.cpu().tolist())

                total_acc_test += (predicted == test_label).sum().item()
    finally:
        model.train(was_training)

    cm = confusion_matrix(true_labels, predictions)
    cr = classification_report(true_labels, predictions)

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

    return cm, cr

In [11]:
np.random.seed(112)
# Shuffle once (the shuffle itself is governed by random_state=42), then cut at 80% and 90%:
# first 80% -> train, next 10% -> test, final 10% -> validation.
df_shuffled = df.sample(frac=1, random_state=42)
train_end, test_end = int(.8*len(df)), int(.9*len(df))

# Positional slices give the same three frames as np.split(df_shuffled, [train_end, test_end])
# without passing a DataFrame through np.split, which relies on the deprecated
# DataFrame.swapaxes in recent pandas releases.
df_train = df_shuffled.iloc[:train_end]
df_test = df_shuffled.iloc[train_end:test_end]
df_val = df_shuffled.iloc[test_end:]

print(len(df_train),len(df_val), len(df_test))

32178 4023 4022


In [12]:
# Smoke-test the single-tweet prediction helper on one example sentence.
tweet = "This is an extreme and dangerous path the Court is now taking us on."
sample(tweet)

(0, 'elonmusk')

In [13]:
# Serialise the whole model object (not just its state_dict) to "<name>.pkl".
joblib.dump(model, name + ".pkl")

['EM_BO_O_bert_best.pkl']

In [14]:
# evaluate() returns (confusion_matrix, classification_report) in that order; unpack to
# matching names so `cm` really is the matrix and `cr` really is the report.
cm, cr = evaluate(model, df_test)

Test Accuracy:  0.970


In [15]:
# Show the two evaluation artefacts returned by the evaluate() call above.
print(cr)
print(cm)

[[1373   32   12]
 [  35 1192   20]
 [   4   17 1337]]
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1417
           1       0.96      0.96      0.96      1247
           2       0.98      0.98      0.98      1358

    accuracy                           0.97      4022
   macro avg       0.97      0.97      0.97      4022
weighted avg       0.97      0.97      0.97      4022

