## Import Libraries

In [1]:
!pip install transformers==4.3.3
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
from transformers import AutoModel, AutoTokenizer 
import pickle
from transformers import *
from tqdm import tqdm, trange
from ast import literal_eval

Collecting transformers==4.3.3
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 7.6MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 43.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 47.0MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.3.3


In [4]:
# Read the training CSV into `raw_df`; later cells derive `df` from it.
raw_df = pd.read_csv('train_with_sarcasm.csv')
# Last expression of the cell: show the first rows for a quick visual check.
raw_df.head()

Unnamed: 0.1,Unnamed: 0,content,Pro Trump,Pro Biden,Neutral,one_hot_labels,sarcasm_labels
0,0,True #Trump got to the white working underclas...,0,0,1,[0 0 1],regular
1,1,Uh oh! @Twitter stops me from posting new emai...,1,0,0,[1 0 0],regular
2,2,Gildan Now Available At The Lowest Price Of Th...,0,0,1,[0 0 1],regular
3,3,"So happy for our fellow Americans ,yâ€™all vot...",0,1,0,[0 1 0],regular
4,4,@NZim22 @CollinKelley @Fred_Jackson__ @realDon...,1,0,0,[1 0 0],regular


In [12]:
# Shapes are printed after each stage so dropped rows are visible.
print(raw_df.shape)

# Keep only the modelling columns and discard rows missing any of them.
kept_columns = ["content", "Pro Trump", "Pro Biden", "Neutral", "sarcasm_labels"]
df = raw_df[kept_columns].dropna(subset=kept_columns)
print(df.shape)

# Stance flags become integers; repeated texts keep only their first occurrence.
df = df.astype({"Pro Trump": int, "Pro Biden": int, "Neutral": int})
df = df.drop_duplicates(subset='content', keep='first')
print(df.shape)
df.head()

(1979, 7)
(1979, 5)
(1979, 5)


Unnamed: 0,content,Pro Trump,Pro Biden,Neutral,sarcasm_labels
0,True #Trump got to the white working underclas...,0,0,1,regular
1,Uh oh! @Twitter stops me from posting new emai...,1,0,0,regular
2,Gildan Now Available At The Lowest Price Of Th...,0,0,1,regular
3,"So happy for our fellow Americans ,yâ€™all vot...",0,1,0,regular
4,@NZim22 @CollinKelley @Fred_Jackson__ @realDon...,1,0,0,regular


In [None]:
# Encode the textual sarcasm annotation as a binary column:
# 1 for "sarcasm", 0 for "regular".
#
# Validate first: any other label aborts the cell instead of leaving the
# column partially filled (the previous row-by-row loop stopped at the first
# unexpected label and left every later row at 0).
unexpected_label_mask = ~df["sarcasm_labels"].isin(["sarcasm", "regular"])
if unexpected_label_mask.any():
    # Positional index of the first offending row, as the loop reported it.
    first_bad_position = int(unexpected_label_mask.to_numpy().argmax())
    raise ValueError(f"ERROR - unexpected label at index {first_bad_position}")

# Vectorised assignment; avoids the chained `df["sarcasm"].iloc[i] = 1`
# pattern, which pandas does not guarantee to write through to `df`.
df["sarcasm"] = (df["sarcasm_labels"] == "sarcasm").astype(int)

In [15]:
# Keep the text, the three stance flags and the numeric `sarcasm` column;
# the textual `sarcasm_labels` column is not selected and is therefore dropped.
df = df[["content", "Pro Trump", "Pro Biden", "Neutral", "sarcasm"]]
# Last expression of the cell: display the first ten rows.
df.head(10)

Unnamed: 0,content,Pro Trump,Pro Biden,Neutral,sarcasm
0,True #Trump got to the white working underclas...,0,0,1,0
1,Uh oh! @Twitter stops me from posting new emai...,1,0,0,0
2,Gildan Now Available At The Lowest Price Of Th...,0,0,1,0
3,"So happy for our fellow Americans ,yâ€™all vot...",0,1,0,0
4,@NZim22 @CollinKelley @Fred_Jackson__ @realDon...,1,0,0,0
5,Maroon 5 - Memories (Cover by Donald Trump) ht...,0,1,0,0
6,@Hk18mm Chung has become a hack on #FakeNews #...,0,0,1,0
7,"â€œNo matter how corrupt, greedy, and heartles...",0,0,1,0
8,@RealJamesWoods We found #Biden’s voters... 💀😬...,1,0,0,0
9,"Of course, Republicans who were ultra-concerne...",0,1,0,0


In [16]:
# Ask TensorFlow for the GPU device name and stop unless it is the first GPU.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [17]:
# Pick the torch device: CUDA when torch reports it as available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of CUDA devices reported by torch.
n_gpu = torch.cuda.device_count()
# Last expression of the cell, so the notebook displays the name of CUDA device 0.
# NOTE(review): this call presumably fails on a machine without CUDA even though
# `device` above falls back to CPU — confirm before running on CPU-only hosts.
torch.cuda.get_device_name(0)

'Tesla P100-PCIE-16GB'

## Load and Preprocess Training Data

Dataset will be tokenized then split into training and validation sets. The validation set will be used to monitor training. For testing a separate test set will be loaded for analysis.

In [18]:
# Every comment should be distinct and no cell should be null at this point.
distinct_comment_count = df.content.nunique()
print('Unique comments: ', distinct_comment_count == df.shape[0])
contains_nulls = df.isnull().values.any()
print('Null values: ', contains_nulls)
# To inspect offending rows: df[df.isna().any(axis=1)]

Unique comments:  True
Null values:  False


In [19]:
# Whitespace-separated token count per comment, computed once and reused.
words_per_comment = df.content.str.split().str.len()
print('average sentence length: ', words_per_comment.mean())
print('stdev sentence length: ', words_per_comment.std())

average sentence length:  21.41232945932289
stdev sentence length:  12.918652287556585


In [20]:
# The three stance columns are the prediction targets.
label_cols = ['Pro Trump', 'Pro Biden', 'Neutral']
num_labels = len(label_cols)
# All column names of the working frame (kept for reference).
cols = df.columns
print('Label columns: ', label_cols)

Label columns:  ['Pro Trump', 'Pro Biden', 'Neutral']


In [21]:
# Class balance per label; may indicate a need to downsample or upsample.
positives_per_label = df[label_cols].sum()
negatives_per_label = df[label_cols].eq(0).sum()
print('Count of 1 per label: \n', positives_per_label, '\n')
print('Count of 0 per label: \n', negatives_per_label)

Count of 1 per label: 
 Pro Trump    502
Pro Biden    717
Neutral      760
dtype: int64 

Count of 0 per label: 
 Pro Trump    1477
Pro Biden    1262
Neutral      1219
dtype: int64


In [22]:
# Shuffle the rows and renumber the index.
# A fixed `random_state` (same seed as the train/validation split) makes the
# row order, and therefore the split, reproducible across kernel restarts.
df = df.sample(frac=1, random_state=2020).reset_index(drop=True)

In [23]:
# Store each row's stance flags as one array in a single `one_hot_labels` column.
label_matrix = df[label_cols].values
df['one_hot_labels'] = list(label_matrix)
df.head()

Unnamed: 0,content,Pro Trump,Pro Biden,Neutral,sarcasm,one_hot_labels
0,What is Joe Biden's plan to secure a caguama f...,0,0,1,0,"[0, 0, 1]"
1,@Pismo_B Go Trump\n#MAGA,1,0,0,0,"[1, 0, 0]"
2,@jilevin @JoeBiden is a fraud.,1,0,0,0,"[1, 0, 0]"
3,Congratulations to Mr @JoeBiden for being elec...,0,1,0,0,"[0, 1, 0]"
4,@Jonathan_Cahn The example set by the leader o...,0,0,1,0,"[0, 0, 1]"


In [24]:
# Plain Python lists of targets and texts, in row order, for the tokenizer and split.
labels = df['one_hot_labels'].tolist()
comments = df['content'].tolist()

Load the pretrained tokenizer that corresponds to your choice of model, e.g.:

```
BERT:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) 

XLNet:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=False) 

RoBERTa:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=False)
```


In [25]:
# Maximum number of tokens per encoded comment (shorter inputs are padded,
# longer ones are truncated).
max_length = 100
# Tokenizer matching the `bert-base-uncased` checkpoint loaded later.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` makes explicit the truncation that was previously applied
# implicitly (and produced a warning).
encodings = tokenizer.batch_encode_plus(
    comments,
    max_length=max_length,
    padding='max_length',
    truncation=True,
)
print('tokenizer outputs: ', encodings.keys())

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.







tokenizer outputs:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [None]:
# len(encodings['token_type_ids'][0]), len(encodings['token_type_ids'][100])

In [26]:
# Pull the three tokenizer outputs out of `encodings`:
# encoded sentences, token type ids and attention masks.
input_ids, token_type_ids, attention_masks = (
    encodings[field] for field in ('input_ids', 'token_type_ids', 'attention_mask')
)

In [27]:
# Find rows whose label vector occurs exactly once in the data. A stratified
# split cannot place such a class on both sides, so they are handled separately.
label_keys = df.one_hot_labels.astype(str)
label_counts = label_keys.value_counts()
one_freq = label_counts[label_counts==1].keys()
singleton_rows = df[label_keys.isin(one_freq)]
# Descending order, so popping by position later does not shift remaining indices.
one_freq_idxs = sorted(singleton_rows.index, reverse=True)
print('df label indices with only one instance: ', one_freq_idxs)

df label indices with only one instance:  []


In [28]:
# Gathering single instance inputs to force into the training set after stratified split.
# Each `pop(i)` removes the element from the source list in place, so after this
# cell `input_ids`, `token_type_ids`, `attention_masks` and `labels` no longer
# contain those rows. `one_freq_idxs` is sorted in descending order, so each pop
# leaves the positions of the not-yet-popped (smaller) indices unchanged.
one_freq_input_ids = [input_ids.pop(i) for i in one_freq_idxs]
one_freq_token_types = [token_type_ids.pop(i) for i in one_freq_idxs]
one_freq_attention_masks = [attention_masks.pop(i) for i in one_freq_idxs]
one_freq_labels = [labels.pop(i) for i in one_freq_idxs]

Be sure to handle all classes during validation using "stratify" during train/validation split:

In [29]:
# Stratified 80/20 split. train_test_split returns a (train, validation) pair for
# each input, in the order the inputs are passed.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_token_types, validation_token_types,
 train_masks, validation_masks) = train_test_split(
    input_ids, labels, token_type_ids, attention_masks,
    random_state=2020, test_size=0.20, stratify=labels)

# Rows whose label vector occurs only once always go to the training side.
train_inputs += one_freq_input_ids
train_labels += one_freq_labels
train_masks += one_freq_attention_masks
train_token_types += one_freq_token_types

# The model consumes torch tensors, so convert every list.
train_inputs, train_labels, train_masks, train_token_types = (
    torch.tensor(values)
    for values in (train_inputs, train_labels, train_masks, train_token_types)
)

validation_inputs, validation_labels, validation_masks, validation_token_types = (
    torch.tensor(values)
    for values in (validation_inputs, validation_labels, validation_masks, validation_token_types)
)

In [30]:
# Batch size shared by the training and validation DataLoaders below.
# The original note cites batch sizes of 32, 48, or 128 as recommended for XLNet
# fine-tuning; this notebook uses 8 (smaller batches need less GPU memory).
batch_size = 8

# Wrap the tensors in TensorDataset/DataLoader so the training loop can iterate
# over mini-batches. Each batch unpacks as (input ids, attention masks, labels,
# token type ids), matching the argument order given to TensorDataset.

train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)

# Training batches are drawn through a RandomSampler.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches are drawn through a SequentialSampler.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [31]:
# Persist both DataLoader objects with torch.save. The targets are relative
# paths with no file extension.
torch.save(validation_dataloader,'validation_data_loader')
torch.save(train_dataloader,'train_data_loader')

## Load Model & Set Params

Load the appropriate model below. Each model already contains a single dense layer for classification on top.



```
BERT:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

XLNet:
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=num_labels)

RoBERTa:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)
```



In [33]:
# Load the pretrained encoder with a single linear classification layer on top.
# The head size comes from `num_labels` (derived from `label_cols`) rather than
# a hard-coded 3, so it always matches the label columns.
# Alternatives (see the markdown above): XLNetForSequenceClassification with
# "xlnet-base-cased", RobertaForSequenceClassification with 'roberta-base'.
# To resume from saved weights: model.load_state_dict(torch.load('./models/bert_main'))
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
# Move the model to the device selected earlier, so it lives where the training
# loop sends its batches (`t.to(device)`). `.to()` returns the model, so the
# notebook still displays the architecture as the cell output.
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

Setting custom optimization parameters for the AdamW optimizer https://huggingface.co/transformers/main_classes/optimizer_schedules.html

In [34]:
# Custom optimization parameters: two parameter groups, one with weight decay
# and one without. A scheduler may be added here as well.
param_optimizer = list(model.named_parameters())

# Parameters whose name contains one of these substrings are exempt from weight
# decay: all biases and the LayerNorm scale. The model printout names these
# modules `LayerNorm`, so the former 'gamma'/'beta' entries never matched.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    # The per-group key must be 'weight_decay'. AdamW does not read
    # 'weight_decay_rate', so with the old key every parameter silently used
    # the optimizer default (no decay).
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

In [35]:
# AdamW over the two parameter groups, with bias correction enabled.
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    correct_bias=True,
)
# Default optimization alternative: AdamW(model.parameters(), lr=2e-5)

## Train Model

In [38]:
# Per-step training losses, kept for plotting.
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 8

# Multilabel loss on raw logits. BCEWithLogitsLoss applies the sigmoid
# internally in a numerically stable way, so the former second computation
# (BCELoss on explicit sigmoid outputs, which overwrote this loss) is gone.
# The module is stateless, so it is built once instead of on every step.
loss_func = BCEWithLogitsLoss()

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

  # ---------------------------------------------------------------- Training
  # Training mode enables dropout (as opposed to evaluation mode).
  model.train()

  # Tracking variables
  tr_loss = 0  # running loss for this epoch
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train for one epoch
  for step, batch in enumerate(train_dataloader):
    # Move the batch to the selected device
    batch = tuple(t.to(device) for t in batch)
    # Order matches the TensorDataset: ids, masks, labels, token types
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()

    # Forward pass for multilabel classification: no labels are passed to the
    # model, so the first output is the logits and the loss is computed here.
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]
    # Labels are cast to the logits' dtype because the loss expects floats.
    loss = loss_func(logits.view(-1,num_labels), b_labels.type_as(logits).view(-1,num_labels))
    step_loss = loss.item()
    train_loss_set.append(step_loss)

    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # scheduler.step()

    # Update tracking variables
    tr_loss += step_loss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

  # -------------------------------------------------------------- Validation
  # Evaluation mode disables dropout for deterministic predictions.
  model.eval()

  # Variables to gather full output
  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

  # Predict
  for i, batch in enumerate(validation_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    with torch.no_grad():
      # Forward pass
      outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      b_logit_pred = outs[0]
      pred_label = torch.sigmoid(b_logit_pred)

      b_logit_pred = b_logit_pred.detach().cpu().numpy()
      pred_label = pred_label.to('cpu').numpy()
      b_labels = b_labels.to('cpu').numpy()

    # Keep the input ids on the CPU so the list does not hold GPU memory
    # for every validation batch.
    tokenized_texts.append(b_input_ids.cpu())
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)

  # Flatten the per-batch outputs into per-example lists
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # Threshold the sigmoid outputs, then score against the true labels
  threshold = 0.5
  pred_bools = [pl>threshold for pl in pred_labels]
  true_bools = [tl==1 for tl in true_labels]
  val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
  val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

  print('F1 Validation Accuracy: ', val_f1_accuracy)
  print('Flat Validation Accuracy: ', val_flat_accuracy)

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Train loss: 0.043286610629898734


Epoch:  12%|█▎        | 1/8 [00:22<02:36, 22.36s/it]

F1 Validation Accuracy:  64.11238825031928
Flat Validation Accuracy:  62.62626262626263
Train loss: 0.041777306048180717


Epoch:  25%|██▌       | 2/8 [00:44<02:14, 22.35s/it]

F1 Validation Accuracy:  64.9550706033376
Flat Validation Accuracy:  63.63636363636363
Train loss: 0.01954383764891048


Epoch:  38%|███▊      | 3/8 [01:07<01:51, 22.35s/it]

F1 Validation Accuracy:  63.07692307692307
Flat Validation Accuracy:  61.86868686868687
Train loss: 0.020440884222596357


Epoch:  50%|█████     | 4/8 [01:29<01:29, 22.35s/it]

F1 Validation Accuracy:  63.8676844783715
Flat Validation Accuracy:  63.13131313131313
Train loss: 0.017878868431323254


Epoch:  62%|██████▎   | 5/8 [01:51<01:07, 22.35s/it]

F1 Validation Accuracy:  66.92111959287531
Flat Validation Accuracy:  65.9090909090909
Train loss: 0.014705763044978747


Epoch:  75%|███████▌  | 6/8 [02:14<00:44, 22.36s/it]

F1 Validation Accuracy:  66.66666666666666
Flat Validation Accuracy:  65.65656565656566
Train loss: 0.006847535342335551


Epoch:  88%|████████▊ | 7/8 [02:36<00:22, 22.35s/it]

F1 Validation Accuracy:  65.56962025316456
Flat Validation Accuracy:  64.39393939393939
Train loss: 0.009336705527131945


Epoch: 100%|██████████| 8/8 [02:58<00:00, 22.36s/it]

F1 Validation Accuracy:  63.68286445012787
Flat Validation Accuracy:  62.878787878787875





In [37]:
# Save only the model weights (state_dict), not the full model object.
# NOTE(review): this cell's execution count (37) precedes the training cell's
# (38), so the saved file may predate the last training run — re-run after training.
# NOTE(review): the file name says 'toxic' while the labels here are stance
# labels; looks like a leftover name — confirm before renaming.
torch.save(model.state_dict(), 'bert_model_toxic')

## Load and Preprocess Test Data

In [None]:
# TODO (original author's marker): delete later.
# Alternative input: test_df = pd.read_csv('testset_new.csv')
test_df = pd.read_csv('test_main.csv')
print(test_df.shape)
# `dropna()` returns a new frame; the previous bare call discarded it, so no row
# was ever removed and both prints showed the same shape. Rows are dropped only
# when `content` is missing, since that is the column the tokenizer consumes.
# NOTE(review): widen `subset` if other columns must also be complete.
# The index is renumbered so the frame stays positionally aligned with the
# prediction outputs built from it later.
test_df = test_df.dropna(subset=['content']).reset_index(drop=True)
print(test_df.shape)

# Label columns are created as NaN placeholders, so metrics computed against
# them are not meaningful.
test_label_cols = ['Pro Trump', 'Pro Biden', 'Neutral']
test_df['Pro Trump'] = np.nan
test_df['Pro Biden'] = np.nan
test_df['Neutral'] = np.nan

print(test_label_cols)
test_df.head()

In [None]:
# Remove irrelevant rows/comments, i.e. those with -1 in any label column.
# `.copy()` gives an independent frame, so the column assignment below writes to
# it directly instead of to a slice of the previous frame (SettingWithCopyWarning).
test_df = test_df[~test_df[test_label_cols].eq(-1).any(axis=1)].copy()
# One array of label values per row, mirroring the training frame.
test_df['one_hot_labels'] = list(test_df[test_label_cols].values)
test_df.head()

In [None]:
# Collect the test targets and texts as plain lists, in row order.
test_labels = test_df['one_hot_labels'].tolist()
test_comments = test_df['content'].tolist()
# print(test_comments)

In [None]:
# Encoding input data
%%time
test_encodings = tokenizer.batch_encode_plus(test_comments,max_length=max_length,pad_to_max_length=True)
test_input_ids = test_encodings['input_ids']
test_token_type_ids = test_encodings['token_type_ids']
test_attention_masks = test_encodings['attention_mask']

In [None]:
# Convert each encoded list to a tensor.
test_inputs, test_labels, test_masks, test_token_types = (
    torch.tensor(values)
    for values in (test_input_ids, test_labels, test_attention_masks, test_token_type_ids)
)

# Sequential loader, so predictions come back in the same order as the rows.
test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(
    test_data,
    sampler=test_sampler,
    batch_size=8,
)

# Persist the loader for later reuse.
torch.save(test_dataloader,'test_data_loader')

## Prediction and Metrics

In [None]:
# Test
# Evaluation mode disables dropout, so predictions on the test set are deterministic.
model.eval()

# Per-batch outputs, flattened after the loop
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for i, batch in enumerate(test_dataloader):
  batch = tuple(t.to(device) for t in batch)
  # Order matches the TensorDataset: ids, masks, labels, token types
  b_input_ids, b_input_mask, b_labels, b_token_types = batch
  with torch.no_grad():
    # Forward pass
    outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    b_logit_pred = outs[0]
    pred_label = torch.sigmoid(b_logit_pred)

    b_logit_pred = b_logit_pred.detach().cpu().numpy()
    pred_label = pred_label.to('cpu').numpy()
    b_labels = b_labels.to('cpu').numpy()

  # Store the input ids on the CPU: they are only decoded back to text later,
  # and keeping every batch on the GPU would hold device memory for the whole
  # test set.
  tokenized_texts.append(b_input_ids.cpu())
  logit_preds.append(b_logit_pred)
  true_labels.append(b_labels)
  pred_labels.append(pred_label)

# Flatten the per-batch outputs into per-example lists
tokenized_texts = [item for sublist in tokenized_texts for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]
# Boolean mask of the true labels (True where the label equals 1)
true_bools = [tl==1 for tl in true_labels]

We need to threshold our sigmoid function outputs which range from [0, 1]. Below I use 0.50 as a threshold.

# Metrics


In [None]:
# Boolean predictions after thresholding the sigmoid outputs at 0.50
pred_bools = [pl>0.50 for pl in pred_labels]

# Print and save classification report
print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools,average='micro'))
print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools),'\n')
clf_report = classification_report(true_bools,pred_bools,target_names=test_label_cols)
# The context manager closes the file handle, which the former inline
# `open(...)` never did. The report is still pickled (despite the .txt name),
# so existing readers that use pickle.load keep working.
with open('classification_report.txt','wb') as report_file:
  pickle.dump(clf_report, report_file)
print(clf_report)

Test F1 Accuracy:  0.0
Test Flat Accuracy:  0.04619033965018377 

              precision    recall  f1-score   support

   Pro Trump       0.00      0.00      0.00         0
   Pro Biden       0.00      0.00      0.00         0
     Neutral       0.00      0.00      0.00         0

   micro avg       0.00      0.00      0.00         0
   macro avg       0.00      0.00      0.00         0
weighted avg       0.00      0.00      0.00         0
 samples avg       0.00      0.00      0.00         0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Output Dataframe

In [None]:
# Map each label position to its column name. `enumerate` follows the length of
# `label_cols`, so no hard-coded label count (formerly `range(6)`) is needed.
idx2label = dict(enumerate(label_cols))
print(idx2label)

In [None]:
# For every boolean label vector, list the positions that are True, so that
# idx2label can turn them into label names.
true_label_idxs = [np.where(flags)[0].flatten().tolist() for flags in true_bools]
pred_label_idxs = [np.where(flags)[0].flatten().tolist() for flags in pred_bools]

In [None]:
# Translate index lists into label-name lists via idx2label; an empty index
# list stays an empty list.
true_label_texts = [[idx2label[idx] for idx in idxs] for idxs in true_label_idxs]
pred_label_texts = [[idx2label[idx] for idx in idxs] for idxs in pred_label_idxs]

In [None]:
# Decoding input ids to comment text: each entry of `tokenized_texts` is decoded
# with the tokenizer, skipping special tokens and leaving tokenization spaces
# as they are (clean_up_tokenization_spaces=False).
comment_texts = [tokenizer.decode(text,skip_special_tokens=True,clean_up_tokenization_spaces=False) for text in tokenized_texts]

In [None]:
# Converting lists to df: one row per decoded comment with its true and
# predicted label-name lists.
comparisons_df = pd.DataFrame({'comment_text': comment_texts, 'true_labels': true_label_texts, 'pred_labels':pred_label_texts})
# Written with the default index column included.
comparisons_df.to_csv('comparisons.csv')
# Last expression of the cell: display 20 randomly sampled rows.
comparisons_df.sample(20)

In [None]:
def temp(x):
  """Return the first element of ``x``, or NaN when ``x`` is empty or otherwise falsy."""
  return x[0] if x else np.nan

# Attach the first predicted label of each row as its `sentiment` (NaN when
# nothing was predicted).
# The values are assigned positionally. `comparisons_df` has a fresh 0..n-1
# index, while `test_df` may keep its original index after rows were filtered
# out; assigning the Series directly would align on index labels and could pair
# predictions with the wrong rows (or NaN). Both frames have one row per test
# example in the same order, and a length mismatch raises instead of misaligning.
test_df['sentiment'] = [temp(predicted) for predicted in comparisons_df['pred_labels']]
test_df.head()

In [None]:
# Discard rows without a predicted sentiment; shapes are printed before and after.
print(test_df.shape)
test_df = test_df.dropna(subset=['sentiment'])
print(test_df.shape)

In [None]:
# Group by country and predicted sentiment and show `count` and `sum` for every
# remaining column.
# NOTE(review): `sum` is applied to all other columns, including non-numeric
# ones such as `content` — presumably only the counts are of interest; confirm.
test_df.groupby(['country', 'sentiment']).agg(['count', 'sum'])

In [None]:
# Keep only rows whose predicted sentiment is something other than 'Neutral'.
is_not_neutral = test_df['sentiment'].ne('Neutral')
test_df = test_df[is_not_neutral]
print(test_df.shape)

In [None]:
# Remove the placeholder label columns and the helper column from the frame.
for _obsolete_column in ['Pro Trump', 'Pro Biden', 'Neutral', 'one_hot_labels']:
    del test_df[_obsolete_column]

In [None]:
# Display the first rows of the trimmed frame for a final visual check.
test_df.head()

In [None]:
# Write the frame to CSV (default settings, so the index is included as a column).
test_df.to_csv('tempBeforeSolr.csv')

Tuning the decision threshold may result in trade-offs between precision, flat accuracy, and micro F1 accuracy. You may tune the threshold however you want.

In [None]:
# Maximize micro-F1 by tuning the decision threshold in two passes: a coarse
# pass in steps of 0.1 ('macro_thresholds'), then a fine pass in steps of 0.01
# starting at the best coarse value ('micro_thresholds').

def _score_thresholds(thresholds, probabilities, targets):
  """Score every candidate threshold.

  Args:
    thresholds: iterable of cut-off values applied to `probabilities`.
    probabilities: per-example arrays of sigmoid outputs.
    targets: per-example boolean arrays of true labels.

  Returns:
    Two lists aligned with `thresholds`: micro-averaged F1 scores and flat
    (exact-match) accuracies.
  """
  f1_scores, flat_accuracies = [], []
  for th in thresholds:
    thresholded = [probs > th for probs in probabilities]
    f1_scores.append(f1_score(targets, thresholded, average='micro'))
    flat_accuracies.append(accuracy_score(targets, thresholded))
  return f1_scores, flat_accuracies

# Coarse pass: 0.1, 0.2, ..., 0.9
macro_thresholds = np.array(range(1,10))/10
f1_results, flat_acc_results = _score_thresholds(macro_thresholds, pred_labels, true_bools)
best_macro_th = macro_thresholds[np.argmax(f1_results)] #best macro threshold value

# Fine pass: best coarse value + 0.00, 0.01, ..., 0.09
micro_thresholds = (np.array(range(10))/100)+best_macro_th #calculating micro threshold values
f1_results, flat_acc_results = _score_thresholds(micro_thresholds, pred_labels, true_bools)
best_f1_idx = np.argmax(f1_results) #best threshold value

# Printing and saving classification report
print('Best Threshold: ', micro_thresholds[best_f1_idx])
print('Test F1 Accuracy: ', f1_results[best_f1_idx])
print('Test Flat Accuracy: ', flat_acc_results[best_f1_idx], '\n')

best_pred_bools = [pl>micro_thresholds[best_f1_idx] for pl in pred_labels]
clf_report_optimized = classification_report(true_bools,best_pred_bools, target_names=label_cols)
# The context manager closes the file handle, which the former inline
# `open(...)` never did. The report is still written with pickle, as before.
with open('classification_report_optimized.txt','wb') as report_file:
  pickle.dump(clf_report_optimized, report_file)
print(clf_report_optimized)