In [None]:
!pip install torch torchvision transformers
#load pretrained bert base model
from transformers import BertModel
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import os
import re
import math
from sklearn.metrics import f1_score,  precision_score, recall_score
import numpy as np 
import time
import xgboost


# Load the pretrained 'bert-base-uncased' encoder weights.
# NOTE(review): neither `bert_model` nor `tokenizer` is referenced by the other
# cells visible in this notebook (the dataset and the classifier each call
# from_pretrained themselves) -- confirm whether these globals are still needed.
bert_model = BertModel.from_pretrained('bert-base-uncased')
#load BERT's WordPiece tokenisation model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
class SSTDataset(Dataset):
    """Dataset yielding BERT inputs plus numeric metadata for each tweet.

    Each item is built from one row of a pickled pandas DataFrame that must
    contain the columns 'text', 'reply_text' and every name listed in
    ``_FEATURE_COLUMNS``. A 'label' column is optional: when present the
    label is returned as the last element of the item.
    """

    # Numeric metadata columns returned (in this order) next to the token ids.
    _FEATURE_COLUMNS = ['followers_count', 'friends_count', 'listed_count', 'verified',
                        'is_reply', 'favourites_count', 'retweet_count', 'favorite_count',
                        'reply_avg_sent']

    def __init__(self, filename, maxlen):
        """Load the pickled frame and prepare the tokenizer.

        Args:
            filename: path of the pickled DataFrame.
            maxlen: exact length (in tokens, including [CLS]/[SEP]) every
                returned sequence is padded or truncated to.
        """
        # Store the contents of the file in a pandas dataframe.
        # NOTE: unpickling can execute arbitrary code -- only load trusted files.
        self.df = pd.read_pickle(filename)

        # Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask, other_features[, label])``.

        ``index`` is positional (0 .. len-1), which is what DataLoader
        samplers produce, so rows are fetched with ``iloc`` and the frame
        does not need a default RangeIndex.
        """
        row = self.df.iloc[index]
        has_label = 'label' in self.df.columns

        # Explicit float conversion: a row slice of mixed-dtype columns is an
        # object Series, which torch.tensor cannot consume directly.
        other_features = torch.tensor(row[self._FEATURE_COLUMNS].to_numpy(dtype='float64'))

        # Tweet and reply are tokenized as one whitespace-joined sentence.
        tokens = self.tokenizer.tokenize(row['text'] + ' ' + row['reply_text'])
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        if len(tokens) < self.maxlen:
            # Pad up to maxlen
            tokens = tokens + ['[PAD]'] * (self.maxlen - len(tokens))
        else:
            # Truncate to maxlen while keeping a closing [SEP]
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']

        # Indices of the tokens in the BERT vocabulary, as a tensor
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(tokens_ids)

        # Attention mask: 1 for real tokens, 0 for padding. Relies on [PAD]
        # being vocabulary id 0.
        attn_mask = (tokens_ids_tensor != 0).long()

        if has_label:
            label = self.df['label'].iloc[index]
            return tokens_ids_tensor, attn_mask, other_features, label
        return tokens_ids_tensor, attn_mask, other_features

In [None]:
from torch.utils.data import DataLoader

# Settings shared by the three loaders. No shuffling is requested, so each
# loader walks its dataset in row order.
loader_options = dict(batch_size = 32, num_workers = 2)

# One dataset per split (training, development, test).
# maxlen is the maximum length a sentence can have; longer sentences are
# truncated to that size.
train_set = SSTDataset(filename = './train_df.pkl', maxlen = 256)
dev_set = SSTDataset(filename = './dev_df.pkl', maxlen = 256)
test_set = SSTDataset(filename = './test_df.pkl', maxlen = 256)

# One dataloader per split.
train_loader = DataLoader(train_set, **loader_options)
dev_loader = DataLoader(dev_set, **loader_options)
test_loader = DataLoader(test_set, **loader_options)

print("Done preprocessing training and development data.")

Done preprocessing training and development data.


In [None]:

class RumorDetector(nn.Module):
    """BERT encoder followed by a single linear unit for binary classification."""

    def __init__(self):
        super(RumorDetector, self).__init__()
        # Instantiating BERT model object
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

        # Classification layer
        # input dimension is 768 because [CLS] embedding has a dimension of 768
        # output dimension is 1 because we're working with a binary classification problem
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks, other_features=None):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contribution of PAD tokens
            -other_features : per-example metadata tensor; accepted for interface
             compatibility with the data loaders but not used by this model
        Returns:
            Tensor of shape [B, 1] holding one raw (pre-sigmoid) logit per sequence
        '''

        # Feeding the input to BERT model to obtain contextualized representations
        outputs = self.bert_layer(seq, attention_mask = attn_masks)
        cont_reps = outputs.last_hidden_state

        # Representation of the [CLS] head (the first token of every sequence)
        cls_rep = cont_reps[:, 0]

        # Feeding cls_rep to the classifier layer
        logits = self.cls_layer(cls_rep)

        return logits

In [None]:
# Index of the CUDA device used by this cell and by the train/evaluate/predict calls.
gpu = 0 #gpu ID

# Build the classifier (its constructor loads the pretrained BERT weights).
net = RumorDetector()
# Moves the parameters to the selected GPU; as the last expression of the cell
# its return value is also what the notebook displays.
net.cuda(gpu) #Enable gpu support for the model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RumorDetector(
  (bert_layer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [None]:
# Binary cross-entropy that takes raw logits, matching the single-logit output of `net`.
criterion = nn.BCEWithLogitsLoss()
# Adam over all parameters of `net` (BERT encoder and classification layer alike).
opti = optim.Adam(net.parameters(), lr = 2e-5)

In [None]:

def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):
    """Fine-tune ``net`` and checkpoint it whenever the development F1 improves.

    Args:
        net: model called as ``net(seq, attn_masks, other_features)``.
        criterion: loss applied to ``(logits.squeeze(-1), labels.float())``.
        opti: optimizer stepping the parameters of ``net``.
        train_loader: iterable of ``(seq, attn_masks, other_features, labels)`` batches.
        dev_loader: loader handed to ``evaluate`` after every epoch.
        max_eps: number of epochs to run.
        gpu: CUDA device index the batches are moved to.

    Side effects:
        Prints progress every 10 iterations and after each epoch, and writes
        ``sstcls_<epoch>.dat`` (the model state dict) to the working directory
        for every epoch whose development F1 beats the best seen so far.
    """
    best_f1 = 0
    st = time.time()
    for ep in range(max_eps):

        net.train()
        for it, (seq, attn_masks, other_features, labels) in enumerate(train_loader):
            # Clear gradients
            opti.zero_grad()
            # Converting these to cuda tensors
            seq, attn_masks, other_features, labels = seq.cuda(gpu), attn_masks.cuda(gpu), other_features.cuda(gpu), labels.cuda(gpu)
            # Obtaining the logits from the model
            logits = net(seq, attn_masks, other_features)
            # Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())
            # Backpropagating the gradients
            loss.backward()
            # Optimization step
            opti.step()

            if it % 10 == 0:
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, (time.time()-st)))
                # Restart the stopwatch so each report covers the last 10 iterations
                st = time.time()

        dev_acc, dev_loss, f1, p_score, r_score = evaluate(net, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}, Recall: {}, Precision: {} and F1 Score: {}".format(ep, dev_acc, dev_loss, r_score, p_score, f1))
        # Model selection is driven by F1, so the message reports F1 as well.
        if f1 > best_f1:
            print("Best development F1 improved from {} to {}, saving model...".format(best_f1, f1))
            best_f1 = f1
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(ep))

In [None]:
def get_accuracy_from_logits(logits, labels):
    """Return the fraction of examples whose thresholded prediction equals its label.

    Logits are passed through a sigmoid and thresholded at 0.5; the result is
    a 0-dim float tensor on the same device as the inputs.
    """
    probabilities = torch.sigmoid(logits.unsqueeze(-1))
    hard_preds = probabilities.gt(0.5).long()
    # Drop every singleton dimension so predictions line up with `labels`
    hard_preds = hard_preds.squeeze()
    hits = hard_preds.eq(labels)
    return hits.float().mean()

def evaluate(net, criterion, dataloader, gpu):
    """Score ``net`` on a labelled dataloader.

    Args:
        net: model called as ``net(seq, attn_masks, other_features)``.
        criterion: loss applied to ``(logits.squeeze(-1), labels.float())``.
        dataloader: iterable of ``(seq, attn_masks, other_features, labels)`` batches.
        gpu: CUDA device index the batches are moved to.

    Returns:
        ``(mean_acc, mean_loss, f1, precision, recall)`` where accuracy and
        loss are averaged over batches and the other three metrics are
        computed over all examples at a 0.5 probability threshold.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    net.eval()

    mean_acc, mean_loss = 0, 0
    count = 0
    targets = []
    predictions = []

    with torch.no_grad():
        for seq, attn_masks, other_features, labels in dataloader:
            seq, attn_masks, other_features, labels = seq.cuda(gpu), attn_masks.cuda(gpu), other_features.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks, other_features)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            mean_acc += get_accuracy_from_logits(logits, labels)
            count += 1
            # Threshold once, on the tensor: avoids pushing size-1 arrays
            # through math.exp element by element (slow, and it overflows
            # for very negative logits).
            batch_preds = (torch.sigmoid(logits) > 0.5).long().reshape(-1)
            predictions.append(batch_preds.cpu().numpy())
            targets.append(labels.cpu().numpy())

    if count == 0:
        raise ValueError("evaluate() received a dataloader that yielded no batches")

    predictions = np.concatenate(predictions)
    targets = np.concatenate(targets)
    f1 = f1_score(targets, predictions)
    p_score = precision_score(targets, predictions)
    r_score = recall_score(targets, predictions)
    return mean_acc / count, mean_loss / count, f1, p_score, r_score

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of a scalar as a Python float.

    ``x`` may be a plain number or a size-1 array/tensor (anything exposing
    ``.item()``). The negative branch is evaluated as e^x / (1 + e^x) so that
    large negative inputs underflow to 0.0 instead of raising OverflowError.
    """
    # Unwrap size-1 numpy arrays / tensors explicitly; implicit array->scalar
    # conversion is deprecated in recent NumPy releases.
    value = float(x.item() if hasattr(x, 'item') else x)
    if value >= 0:
        return 1 / (1 + math.exp(-value))
    exp_value = math.exp(value)
    return exp_value / (1 + exp_value)

def predict(model, dataloader, gpu):
    """Return the predicted positive-class probability of every example.

    Args:
        model: model called as ``model(seq, attn_masks, other_features)``
            and returning one logit per example.
        dataloader: iterable of batches whose first three elements are
            ``(seq, attn_masks, other_features)``; any further elements
            (e.g. labels) are ignored.
        gpu: CUDA device index the batches are moved to.

    Returns:
        A flat list of Python floats, in dataloader order.
    """
    model.eval()

    preds = []
    with torch.no_grad():
        for batch in dataloader:
            seq, attn_masks, other_features = (t.cuda(gpu) for t in batch[:3])
            logits = model(seq, attn_masks, other_features)
            # torch.sigmoid is vectorised and saturates to 0/1 instead of
            # overflowing on extreme logits.
            preds.extend(torch.sigmoid(logits).reshape(-1).cpu().tolist())

    return preds

def extract_embedding(model, dataloader, gpu):
    """Return the model's positive-class probability for every labelled example.

    Despite the name, the value produced per example is a single scalar:
    the sigmoid of the classifier logit. Later cells store it in an 'embed'
    column and use it as one extra feature.

    Args:
        model: model called as ``model(seq, attn_masks, other_features)``
            and returning one logit per example.
        dataloader: iterable of batches whose first three elements are
            ``(seq, attn_masks, other_features)``; trailing elements such as
            the label are ignored.
        gpu: CUDA device index the batches are moved to.

    Returns:
        A flat list of Python floats, in dataloader order. Callers that align
        this list with a DataFrame rely on the loader not shuffling.
    """
    model.eval()

    preds = []
    with torch.no_grad():
        for batch in dataloader:
            seq, attn_masks, other_features = (t.cuda(gpu) for t in batch[:3])
            logits = model(seq, attn_masks, other_features)
            # Vectorised sigmoid: no per-element math.exp, no overflow on
            # extreme logits.
            preds.extend(torch.sigmoid(logits).reshape(-1).cpu().tolist())

    return preds

In [None]:
num_epoch = 5

# Remove checkpoints left over from earlier runs so that only files written
# by the training call below remain in the working directory.
stale_checkpoints = [name for name in os.listdir('.') if name.startswith('sstcls')]
for stale in stale_checkpoints:
    os.remove(stale)

#fine-tune the model
train(net, criterion, opti, train_loader, dev_loader, num_epoch, gpu)

Iteration 0 of epoch 0 complete. Loss: 0.6569644212722778; Accuracy: 0.65625; Time taken (s): 3.5441129207611084
Iteration 10 of epoch 0 complete. Loss: 0.4777846038341522; Accuracy: 0.78125; Time taken (s): 25.867339611053467
Iteration 20 of epoch 0 complete. Loss: 0.22273902595043182; Accuracy: 0.9375; Time taken (s): 25.87035298347473
Iteration 30 of epoch 0 complete. Loss: 0.36775755882263184; Accuracy: 0.75; Time taken (s): 25.81198763847351
Iteration 40 of epoch 0 complete. Loss: 0.3795575499534607; Accuracy: 0.875; Time taken (s): 25.786431074142456
Epoch 0 complete! Development Accuracy: 0.810735285282135; Development Loss: 0.36984142310479107, Recall: 0.5826086956521739, Precision: 0.5583333333333333 and F1 Score: 0.5702127659574469
Best development accuracy improved from 0 to 0.810735285282135, saving model...
Iteration 0 of epoch 1 complete. Loss: 0.3734000325202942; Accuracy: 0.875; Time taken (s): 41.2684109210968
Iteration 10 of epoch 1 complete. Loss: 0.2897026836872101;

In [None]:

# Find the most recent checkpoint written by train(). Because train() only
# saves when the development F1 improves, the highest epoch number is also
# the best-scoring model.
# \d+ accepts multi-digit epochs, the dot is escaped, and files that merely
# start with 'sstcls' but do not follow the naming scheme are skipped instead
# of crashing on a failed match.
checkpoint_pattern = re.compile(r'sstcls_(\d+)\.dat')
saved_epochs = [int(found.group(1))
                for found in map(checkpoint_pattern.fullmatch, os.listdir('.'))
                if found]
if not saved_epochs:
    raise FileNotFoundError("No 'sstcls_<epoch>.dat' checkpoint found in the working directory")
latest_ep = max(saved_epochs)
net.load_state_dict(torch.load(f'./sstcls_{latest_ep}.dat'))

# Predict probabilities for the test split and write the 0/1 decisions
# (threshold 0.5) as the submission file.
p = predict(net, test_loader, gpu)
df = pd.DataFrame([1 if x>0.5 else 0 for x in p], columns =['Predicted'])
df.to_csv('test_Submission.csv', index_label='Id')


In [None]:
# Attach the fine-tuned model's per-example output to each split as an extra
# 'embed' column. The values come back in loader order, so this relies on the
# loaders iterating the pickles row by row without shuffling.
train_embed = extract_embedding(net, train_loader, gpu)
train_df = pd.read_pickle('./train_df.pkl').assign(embed=train_embed)

dev_embed = extract_embedding(net, dev_loader, gpu)
dev_df = pd.read_pickle('./dev_df.pkl').assign(embed=dev_embed)

In [None]:
# Preview the feature matrix used by the XGBoost cells. The model-output
# column is named 'embed' (that is the name assigned when it was created).
train_df[['followers_count', 'friends_count', 'listed_count', 'verified','is_reply','favourites_count','retweet_count','favorite_count', 'reply_avg_sent','embed']]

Unnamed: 0,followers_count,friends_count,listed_count,verified,is_reply,favourites_count,retweet_count,favorite_count,reply_avg_sent,train_embed
0,-0.329117,-0.046457,-0.376736,0,1,0.239585,-0.349946,-0.263323,-0.368300,0.000924
1,0.065073,-0.066977,0.276132,1,0,-0.398676,0.158194,-0.154328,-1.730077,0.901331
2,-0.329092,-0.066042,-0.376736,0,0,0.075617,-0.347184,-0.253414,-0.489189,0.001571
3,2.081211,-0.064422,3.574830,1,0,-0.383882,-0.018550,-0.206348,-0.501423,0.019478
4,-0.328566,-0.007887,-0.376546,0,0,0.008665,-0.347184,-0.248460,-0.035306,0.003655
...,...,...,...,...,...,...,...,...,...,...
1561,-0.329089,-0.070369,-0.376736,0,1,1.228920,-0.349946,-0.263323,0.045623,0.001561
1562,-0.311690,0.740538,-0.336669,1,0,0.739320,-0.140062,-0.124602,-1.097507,0.994374
1563,1.800456,-0.072477,1.689071,1,0,-0.404951,-0.048928,0.113206,-0.295895,0.968023
1564,0.778721,-0.053339,1.217261,1,0,-0.390956,0.138863,-0.107261,-1.445863,0.014335


In [None]:
#setting grid of selected parameters for iteration
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored exhaustively by the search below.
# gamma and the two regularisation terms share the same candidate values.
penalty_values = [0,0.1,0.4,0.8,1.6]
params = dict(
    gamma = list(penalty_values),
    learning_rate = [0.01, 0.03, 0.1, 0.15, 0.5],
    max_depth = [5,7,10],
    n_estimators = [50,100,150],
    reg_alpha = list(penalty_values),
    reg_lambda = list(penalty_values),
)

# Tabular metadata plus the BERT model output stored in 'embed'.
grid_feature_columns = ['followers_count', 'friends_count', 'listed_count', 'verified','is_reply','favourites_count','retweet_count','favorite_count', 'reply_avg_sent','embed']

xgb_model = XGBClassifier()

model = GridSearchCV(estimator = xgb_model, param_grid = params)
model.fit(train_df[grid_feature_columns], train_df.label)
model.best_params_

In [None]:
import xgboost as xgb

# Hand-picked hyper-parameters for the final classifier.
clf_settings = {
    'max_depth': 200,
    'n_estimators': 400,
    'subsample': 1,
    'learning_rate': 0.07,
    'reg_lambda': 0.1,
    'reg_alpha': 0.1,
    'gamma': 1,
}
clf = xgb.XGBClassifier(**clf_settings)

# Same feature set as the grid search: tabular metadata plus the 'embed' column.
clf_feature_columns = ['followers_count', 'friends_count', 'listed_count', 'verified','is_reply','favourites_count','retweet_count','favorite_count', 'reply_avg_sent','embed']
clf.fit(train_df[clf_feature_columns], train_df.label)


XGBClassifier(gamma=1, learning_rate=0.07, max_depth=200, n_estimators=400,
              reg_alpha=0.1, reg_lambda=0.1)

In [None]:

# Score the fitted XGBoost classifier on the development split.
predictions = clf.predict(dev_df[['followers_count', 'friends_count', 'listed_count', 'verified','is_reply','favourites_count','retweet_count','favorite_count', 'reply_avg_sent','embed']])
# The score is computed on dev_df, so label it as the development set.
print ("Development set f1_score :", np.round(f1_score(dev_df.label, predictions),5))

Training set f1_score : 0.88496
