In [None]:
# NOTE(review): no cell visible in this notebook imports torchnlp -- confirm this install is still needed.
!pip install torchnlp



In [None]:
# Provides the BertTokenizer and BertModel classes imported in the imports cell.
!pip install pytorch_pretrained_bert



In [None]:
# NOTE(review): no cell visible in this notebook imports datasets -- confirm this install is still needed.
!pip install datasets



1. **IMPORTING PACKAGES**

In [None]:
import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

2. **DATA EXPLORATION**

In [None]:
# Read the tab-separated sample file, then discard the 'title' and 'public_id'
# columns in a single (non-inplace) drop before printing what remains.
raw_frame = pd.read_csv("task_3a_sample_data.csv", sep='\t', encoding='utf-8')
data = raw_frame.drop(columns=["title", "public_id"])
print(data)

                                                 text       our rating
0   Last week Rep. Louie Gohmert told Chris Salced...            FALSE
1   WHATEVER drama plays out when Republicans meet...             TRUE
2   With merchants in Democrat-run cities boarding...            FALSE
3   State Dining Room  4:22 P.M. EST  THE PRESIDEN...  partially false
4   (Natural News) A 2012 study published in the j...            FALSE
5   Latest Breaking News: Martial Law Imminent  Ge...            FALSE
6   Smoking could be considered a distraction unde...            FALSE
7   Gov. Scott Walker said Friday his presidential...  partially false
8   Please Share This Story!                  Dr. ...  partially false
9   We knew that Joe Biden’s vice-presidential pic...            FALSE
10  This article has been updated to include more ...            FALSE
11  “America will never be a socialist country,” P...             TRUE
12  (WISCONSIN) — Trump’s failed COVID-19 response...  partially false
13  Va

The target for our classification model is in the column ‘our rating’. To get a view of the ratings, we can use ‘Counter’ from the collections module:

In [None]:
from collections import Counter

# Tally how many rows carry each value of the 'our rating' column.
rating_counts = Counter(data['our rating'].values)
print(rating_counts)

Counter({'FALSE': 21, 'partially false': 18, 'TRUE': 11})


Next we want to balance our data set so that we have an equal number of ‘FALSE’, ‘partially false’ and ‘TRUE’ examples. We should also randomly shuffle the rows:

In [None]:
# Balance the three rating classes by downsampling each one to the size of the
# smallest class, then shuffle the rows and rebuild a clean 0..N-1 index.
SEED = 24  # same seed as the final shuffle, so the subsampling is reproducible too

data_fake = data[data['our rating'] == 'FALSE']
data_true = data[data['our rating'] == 'TRUE']
data_p_fake = data[data['our rating'] == 'partially false']

# Use the smallest class size instead of assuming 'TRUE' is the rarest label;
# DataFrame.sample raises if asked for more rows than a class contains.
n_per_class = min(len(data_fake), len(data_true), len(data_p_fake))
data_fake = data_fake.sample(n=n_per_class, random_state=SEED)
data_true = data_true.sample(n=n_per_class, random_state=SEED)
data_p_fake = data_p_fake.sample(n=n_per_class, random_state=SEED)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data = pd.concat([data_fake, data_true, data_p_fake])
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)
print(Counter(data['our rating'].values))

Counter({'FALSE': 11, 'TRUE': 11, 'partially false': 11})


In [None]:
# Print the `data` DataFrame to inspect its rows.
print(data)

                                                 text       our rating
0   (Natural News) A 2012 study published in the j...            FALSE
1   U.S. Senator Tammy Baldwin Stands Up for Wisco...             TRUE
2   The deadliest mass shooting in U.S. history wa...             TRUE
3   The panel formed to consider proposed pay rais...             TRUE
4   Trudeau Bought His Mother A Car Using WE Chari...            FALSE
5   With merchants in Democrat-run cities boarding...            FALSE
6   Trump confirms this was a bombing, not an acci...            FALSE
7   This really should be one of the biggest publi...  partially false
8   “America will never be a socialist country,” P...             TRUE
9   Democratic Sen. Kamala Harris of California wa...  partially false
10  Gov. Scott Walker said Friday his presidential...  partially false
11  WASHINGTON, D.C. — Ohio Treasurer Josh Mandel,...             TRUE
12  WHEELING — At least $53 million a week in ille...             TRUE
13  AL

Next we want to format the data such that it can be used as input into our BERT model. We split our data into training and testing sets:

In [None]:
# Split the rows into two disjoint, contiguous parts. Deriving the cut point from
# len(data) (rather than a fixed head(16)/tail(16)) uses every row and prevents
# the train and test sets from overlapping when the frame has fewer than 32 rows.
TRAIN_FRACTION = 0.5
split_at = int(len(data) * TRAIN_FRACTION)
train_data = data.iloc[:split_at]
test_data = data.iloc[split_at:]

We generate a list of dictionaries with ‘text’ and ‘our rating’ keys:

In [None]:
# Build one {'text', 'our rating'} record per row. zip() walks the two columns in
# lockstep; a nested double loop here would pair every text with every rating
# (N*N records) and attach most texts to the wrong label.
# NOTE: this rebinds train_data/test_data from DataFrames to lists, so the cell
# cannot be re-run without first re-running the cell that creates the DataFrames.
train_data = [{'text': text, 'our rating': rating_data}
              for text, rating_data in zip(train_data['text'], train_data['our rating'])]
test_data = [{'text': text, 'our rating': rating_data}
             for text, rating_data in zip(test_data['text'], test_data['our rating'])]
# Preview only the first records instead of dumping every full article.
print(train_data[:2])



Unpack the list of dictionaries into two parallel tuples, one of texts and one of labels:

In [None]:
# Turn each record into a (text, rating) pair, then transpose the pairs into two
# parallel tuples: one holding the texts, one holding the rating labels.
train_pairs = [(record['text'], record['our rating']) for record in train_data]
test_pairs = [(record['text'], record['our rating']) for record in test_data]
train_texts, train_labels = zip(*train_pairs)
test_texts, test_labels = zip(*test_pairs)

Generate tokens and token ids:

In [None]:
MAX_SEQ_LEN = 512  # length every sequence is truncated / padded to


def bert_tokenize(texts, tokenizer, max_len=MAX_SEQ_LEN):
    """Tokenize each text and frame it as ``[CLS] tokens... [SEP]``.

    The tokenized body is cut to ``max_len - 2`` tokens so that both special
    tokens still fit inside ``max_len``. BERT's single-sentence input format
    ends with ``[SEP]``, so it is appended here.

    Args:
        texts: Iterable of raw strings.
        tokenizer: Object exposing ``tokenize(str) -> list of tokens``.
        max_len: Maximum total number of tokens per sequence.

    Returns:
        A list with one list of tokens per input text.
    """
    return [['[CLS]'] + tokenizer.tokenize(text)[:max_len - 2] + ['[SEP]'] for text in texts]


def pad_token_ids(token_lists, tokenizer, max_len=MAX_SEQ_LEN):
    """Convert token lists to vocabulary ids and right-pad/truncate them to ``max_len``.

    Args:
        token_lists: List of token lists, e.g. the output of ``bert_tokenize``.
        tokenizer: Object exposing ``convert_tokens_to_ids(list) -> list of ids``.
        max_len: Final length of every row.

    Returns:
        The integer array produced by ``pad_sequences`` (one row per sequence).
    """
    ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in token_lists]
    return pad_sequences(ids, maxlen=max_len, truncating="post", padding="post", dtype="int")


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_tokens = bert_tokenize(train_texts, tokenizer)
test_tokens = bert_tokenize(test_texts, tokenizer)
train_tokens_ids = pad_token_ids(train_tokens, tokenizer)
test_tokens_ids = pad_token_ids(test_tokens, tokenizer)

Finally, generate a boolean array for our training and testing sets that is True where ‘our rating’ is ‘FALSE’:

In [None]:
# Binary target: True where the rating is exactly 'FALSE'; every other rating
# value maps to False.
train_label_array = np.array(train_labels)
test_label_array = np.array(test_labels)
train_y = train_label_array == 'FALSE'
test_y = test_label_array == 'FALSE'

4. **MODEL BUILDING**<br>
We create our BERT classifier, which contains an ‘initialization’ method and a ‘forward’ method that returns the predicted probability for each input sequence:

In [None]:
class BertBinaryClassifier(nn.Module):
    """Pre-trained BERT encoder followed by a dropout -> linear -> sigmoid head.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
    """

    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        # Projects the 768-wide pooled vector down to a single score.
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return one sigmoid-squashed score per input sequence.

        Args:
            tokens: Token-id tensor passed to BERT.
            masks: Optional attention mask forwarded to BERT as ``attention_mask``.

        Returns:
            Tensor of values in (0, 1) whose last dimension has size 1.
        """
        # The encoder returns a pair; only the second (pooled) element is used.
        encoder_outputs = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        _, pooled_output = encoder_outputs
        return self.sigmoid(self.linear(self.dropout(pooled_output)))

Next we generate training and testing masks:

In [None]:
def build_attention_mask(padded_ids):
    """Return a nested list holding 1.0 for every id greater than zero and 0.0 otherwise."""
    return [[float(token_id > 0) for token_id in row] for row in padded_ids]


train_masks = build_attention_mask(train_tokens_ids)
test_masks = build_attention_mask(test_tokens_ids)
train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

Generate token tensors for training and testing:

In [None]:
# Wrap the padded id arrays as tensors.
train_tokens_tensor = torch.tensor(train_tokens_ids)
test_tokens_tensor = torch.tensor(test_tokens_ids)

# Targets: cast the boolean arrays to float32 and reshape them into one column.
train_y_column = train_y.astype(np.float32).reshape(-1, 1)
test_y_column = test_y.astype(np.float32).reshape(-1, 1)
train_y_tensor = torch.tensor(train_y_column).float()
test_y_tensor = torch.tensor(test_y_column).float()

In [None]:
import torch.utils.data

and finally, prepare our data loaders:

In [None]:
BATCH_SIZE = 3


def make_loader(tokens, masks, labels, sampler_cls, batch_size=BATCH_SIZE):
    """Bundle three aligned tensors into a dataset, a sampler and a DataLoader.

    Args:
        tokens, masks, labels: Tensors wrapped together in a TensorDataset.
        sampler_cls: Sampler class instantiated with the dataset.
        batch_size: Number of samples per batch.

    Returns:
        Tuple ``(dataset, sampler, dataloader)``.
    """
    dataset = torch.utils.data.TensorDataset(tokens, masks, labels)
    sampler = sampler_cls(dataset)
    loader = torch.utils.data.DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Training batches are drawn in random order; test batches keep dataset order.
train_dataset, train_sampler, train_dataloader = make_loader(
    train_tokens_tensor, train_masks_tensor, train_y_tensor, torch.utils.data.RandomSampler)
test_dataset, test_sampler, test_dataloader = make_loader(
    test_tokens_tensor, test_masks_tensor, test_y_tensor, torch.utils.data.SequentialSampler)

5. **FINE TUNING** <br>
We use the Adam optimizer to minimize the Binary Cross Entropy loss and we train with a batch size of 3 for 1 epoch:

In [None]:
EPOCHS = 1
bert_clf = BertBinaryClassifier()
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)
# The loss module holds no per-step state, so build it once outside the loops.
loss_func = nn.BCELoss()
# Number of batches per epoch, taken from the loader itself so the progress
# display is an integer batch count rather than len(records) / BATCH_SIZE.
n_batches = len(train_dataloader)

for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = batch_data
        probas = bert_clf(token_ids, masks)
        batch_loss = loss_func(probas, labels)
        train_loss += batch_loss.item()
        # Clear gradients left over from the previous step before backprop.
        bert_clf.zero_grad()
        batch_loss.backward()
        optimizer.step()
        print('Epoch: ', epoch_num + 1)
        # 1-based batch counter and running mean of the batch losses so far.
        print("\r" + "{0}/{1} loss: {2} ".format(step_num + 1, n_batches, train_loss / (step_num + 1)))

Epoch:  1
0/85.33333333333333 loss: 0.5098482966423035 
Epoch:  1
1/85.33333333333333 loss: 0.5670263767242432 
Epoch:  1
2/85.33333333333333 loss: 0.5190829932689667 
Epoch:  1
3/85.33333333333333 loss: 0.5843791589140892 
Epoch:  1
4/85.33333333333333 loss: 0.5981767475605011 
Epoch:  1
5/85.33333333333333 loss: 0.6094728857278824 
Epoch:  1
6/85.33333333333333 loss: 0.642731807061604 
Epoch:  1
7/85.33333333333333 loss: 0.6114304214715958 
Epoch:  1
8/85.33333333333333 loss: 0.5911671982871162 
Epoch:  1
9/85.33333333333333 loss: 0.6213996052742005 
Epoch:  1
10/85.33333333333333 loss: 0.641489261930639 
Epoch:  1
11/85.33333333333333 loss: 0.6232424055536588 
Epoch:  1
12/85.33333333333333 loss: 0.6065066777742826 
Epoch:  1
13/85.33333333333333 loss: 0.5971454722540719 
Epoch:  1
14/85.33333333333333 loss: 0.580377874771754 
Epoch:  1
15/85.33333333333333 loss: 0.5856477748602629 
Epoch:  1
16/85.33333333333333 loss: 0.5766964432071237 
Epoch:  1
17/85.33333333333333 loss: 0.56430

And we evaluate our model:

In [None]:
bert_clf.eval()
bert_predicted = []
# NOTE: despite the names, `logits` / `all_logits` hold the model's sigmoid
# outputs (probabilities), not raw pre-sigmoid scores.
all_logits = []
# Build the loss once; accumulate it so the computed value is actually reported.
loss_func = nn.BCELoss()
test_loss = 0.0
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):
        token_ids, masks, labels = batch_data
        logits = bert_clf(token_ids, masks)
        loss = loss_func(logits, labels)
        test_loss += loss.item()
        numpy_logits = logits.cpu().detach().numpy()

        # Threshold the single output column at 0.5 to get the boolean prediction.
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])

# Mean of the per-batch losses (guarded against an empty loader).
print('test loss: ', test_loss / max(len(test_dataloader), 1))
print(classification_report(test_y, bert_predicted))

              precision    recall  f1-score   support

       False       0.69      1.00      0.81       176
        True       0.00      0.00      0.00        80

   micro avg       0.69      0.69      0.69       256
   macro avg       0.34      0.50      0.41       256
weighted avg       0.47      0.69      0.56       256



  'precision', 'predicted', average, warn_for)
