In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so that files stored under
# /content/drive/MyDrive can be read from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cp /content/drive/MyDrive/sstcls_4.dat /content/sstcls_4.dat

In [3]:
!pip install torch torchvision transformers
from pandas import DataFrame
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import defaultdict,Counter
from transformers import BertModel
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import os
import re
import math
from sklearn.metrics import f1_score,  precision_score, recall_score
import numpy as np 
import time
from torch.utils.data import DataLoader


Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 47.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 62.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 68.5 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for



In [4]:
class SSTDataset(Dataset):
    """Tweet/reply dataset yielding BERT-ready tensors plus numeric metadata.

    Every item is built from one row of a pickled pandas DataFrame. The frame
    must provide the string columns ``text`` and ``reply_text`` and every
    column named in ``FEATURE_COLUMNS``. A ``label`` column is optional; when
    it exists, the label is returned as a fourth element of each item.
    """

    # Numeric columns returned, in this order, as the ``other_features`` tensor.
    FEATURE_COLUMNS = [
        'followers_count', 'friends_count', 'listed_count', 'verified',
        'is_reply', 'favourites_count', 'retweet_count', 'favorite_count',
        'reply_avg_sent',
    ]

    def __init__(self, filename, maxlen):
        """Load the data frame and the tokenizer.

        Args:
            filename: Path of a pickle file readable by ``pd.read_pickle``.
            maxlen: Exact number of tokens (including [CLS] and [SEP]) every
                returned sequence is padded or truncated to.
        """
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        self.df = pd.read_pickle(filename)

        # Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        """Return the number of rows in the underlying data frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Build the model inputs for the row at position ``index``.

        Args:
            index: Zero-based row position (what a DataLoader sampler yields).

        Returns:
            ``(token_ids, attention_mask, other_features)`` and, if the frame
            has a ``label`` column, the row's label as a fourth element.
            ``token_ids`` and ``attention_mask`` are 1-D tensors of length
            ``maxlen``; ``other_features`` is a float32 tensor with one value
            per entry of ``FEATURE_COLUMNS``.
        """
        # Positional lookup: DataLoader indices run over range(len(self)), so
        # they must not be interpreted as index labels (the frame's index may
        # be non-contiguous, e.g. after filtering or a train/test split).
        row = self.df.iloc[index]

        # Explicit float conversion: the selected columns mix bool, int and
        # float values, which pandas stores as an object-dtype Series.
        other_features = torch.tensor(
            row[self.FEATURE_COLUMNS].to_numpy(dtype='float32')
        )

        # Tokenize tweet and reply as one sequence wrapped in [CLS] ... [SEP]
        tokens = self.tokenizer.tokenize(row['text'] + ' ' + row['reply_text'])
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        if len(tokens) < self.maxlen:
            # Pad short sequences up to maxlen
            tokens = tokens + ['[PAD]'] * (self.maxlen - len(tokens))
        else:
            # Prune long sequences to maxlen, keeping a closing [SEP]
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']

        # Map the tokens to their indices in the BERT vocabulary
        tokens_ids_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))

        # Attention mask: 1 for real tokens, 0 for padding.
        # [PAD] has id 0 in the bert-base-uncased vocabulary.
        attn_mask = (tokens_ids_tensor != 0).long()

        if 'label' in self.df.columns:
            return tokens_ids_tensor, attn_mask, other_features, row['label']
        return tokens_ids_tensor, attn_mask, other_features

In [5]:
gpu = 0 # CUDA device index; passed to net.cuda() and to predict() below

class RumorDetector(nn.Module):
    """Binary classifier: BERT [CLS] embedding + numeric features -> one logit.

    The [CLS] representation produced by ``bert-base-uncased`` is concatenated
    with a vector of extra per-example features and passed through a single
    linear layer. The output is a raw logit (no sigmoid is applied here).
    """

    def __init__(self, n_other_features=9):
        """Build the encoder and the classification head.

        Args:
            n_other_features: Width of the ``other_features`` tensor given to
                ``forward``. The default of 9 yields a 768 + 9 = 777 wide
                classifier input, so state dicts saved with the previously
                hard-coded ``nn.Linear(777, 1)`` still load.
        """
        super(RumorDetector, self).__init__()
        # Instantiating BERT model object
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

        # Classification layer.
        # Input: the [CLS] embedding (hidden_size wide, 768 for bert-base)
        # followed by the extra numeric features.
        # Output: 1 value, because this is a binary classification problem.
        hidden_size = self.bert_layer.config.hidden_size
        self.cls_layer = nn.Linear(hidden_size + n_other_features, 1)

    def forward(self, seq, attn_masks, other_features):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contribution of PAD tokens
            -other_features : Tensor of shape [B, n_other_features] with the extra numeric features of each example
        Returns:
            Tensor of shape [B, 1] containing one logit per example
        '''

        # Feeding the input to BERT model to obtain contextualized representations
        outputs = self.bert_layer(seq, attention_mask = attn_masks)

        # Representation of the [CLS] head (the first token of every sequence)
        cls_embedding = outputs.last_hidden_state[:, 0]

        # Append the numeric features; .float() gives the linear layer float32
        # input whatever dtype other_features arrived in.
        # NOTE(review): the features are concatenated as-is (no scaling is
        # applied here) — confirm they are normalised upstream if needed.
        cls_rep = torch.cat([cls_embedding, other_features], 1).float()

        # Feeding cls_rep to the classifier layer
        return self.cls_layer(cls_rep)
# Build the model (this loads the pretrained bert-base-uncased weights).
net = RumorDetector()
# Move all parameters to the selected GPU; as the last expression of the cell,
# the returned module is also what gets displayed as the cell output.
net.cuda(gpu) #Enable gpu support for the model

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RumorDetector(
  (bert_layer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [9]:
def predict(model, dataloader, gpu):
    """Return the predicted positive-class probability of every example.

    Args:
        model: Module called as ``model(seq, attn_masks, other_features)`` and
            expected to return one logit per example (shape [B, 1] or [B]).
        dataloader: Iterable of batches whose first three elements are
            ``seq``, ``attn_masks`` and ``other_features``. Further elements
            (e.g. labels) are ignored.
        gpu: CUDA device index (or a device string / ``torch.device``) the
            batches are moved to. ``None`` keeps the computation on the CPU.

    Returns:
        A flat list of Python floats in [0, 1], in dataloader order.
    """
    if gpu is None:
        device = torch.device('cpu')
    elif isinstance(gpu, int):
        device = torch.device('cuda', gpu)
    else:
        device = torch.device(gpu)

    model.eval()

    preds = []
    with torch.no_grad():
        for batch in dataloader:
            # Only the first three tensors are model inputs; slicing lets
            # loaders that also yield labels be scored without changes.
            seq, attn_masks, other_features = (t.to(device) for t in batch[:3])
            logits = model(seq, attn_masks, other_features)
            # Sigmoid of the whole batch at once, in float64. torch.sigmoid
            # does not overflow for large negative logits, and no size-1
            # arrays are converted to scalars one by one.
            probs = torch.sigmoid(logits.double()).reshape(-1)
            preds.extend(probs.cpu().tolist())

    return preds

def sigmoid(x):
  """Return the logistic function 1 / (1 + e^-x) of a scalar as a float.

  Args:
      x: A number, or a size-1 array/tensor (anything exposing ``.item()``).

  Returns:
      A Python float in [0, 1].
  """
  # Unwrap size-1 arrays explicitly instead of relying on implicit
  # array-to-scalar conversion inside math.exp.
  if hasattr(x, 'item'):
    x = x.item()
  if x >= 0:
    return 1 / (1 + math.exp(-x))
  # For negative x, exp(-x) can exceed the float range and raise
  # OverflowError; exp(x) stays in (0, 1), so use the equivalent form.
  exp_x = math.exp(x)
  return exp_x / (1 + exp_x)

In [10]:
# Score the COVID tweets with the newest checkpoint and write a submission CSV.
# NOTE(review): no visible cell creates or copies './covid_df.pkl' — confirm the
# file is present in the working directory before running this cell.
covid_set = SSTDataset(filename = './covid_df.pkl', maxlen = 512)

# Inference dataloader; shuffling is left off (the DataLoader default), so the
# predictions come back in the same order as the rows of the data frame.
covid_loader = DataLoader(covid_set, batch_size = 16, num_workers = 2)

# Find the checkpoint with the highest epoch number. The pattern accepts
# multi-digit epochs and an escaped ".dat" suffix, and files that merely start
# with "sstcls" but do not follow the naming scheme are skipped, not fatal.
checkpoints = []
for name in os.listdir('.'):
    found = re.fullmatch(r'sstcls_([0-9]+)\.dat', name)
    if found:
        checkpoints.append((int(found.group(1)), name))
if not checkpoints:
    raise FileNotFoundError("no checkpoint named 'sstcls_<epoch>.dat' in the current directory")
latest_ep, latest_name = max(checkpoints)

# Load the tensors on the CPU first; load_state_dict then copies them into the
# parameters of `net` on whatever device the model already lives on.
net.load_state_dict(torch.load(os.path.join('.', latest_name), map_location='cpu'))

# Probabilities -> hard 0/1 labels with a 0.5 threshold
p = predict(net, covid_loader, gpu)
df = pd.DataFrame([1 if x>0.5 else 0 for x in p], columns =['Predicted'])
df.to_csv('test_Submission.csv', index_label='Id')


In [32]:
# Re-read the raw COVID frame and attach the thresholded predictions
# (probability above 0.5 -> 1, otherwise 0) as a new column.
covid_pickle = './covid_df.pkl'
covid_df = pd.read_pickle(covid_pickle)
covid_df['predict_label'] = [int(prob > 0.5) for prob in p]

In [33]:
# Persist the frame including its new 'predict_label' column under a separate
# file name, leaving the input pickle untouched.
covid_df.to_pickle('./covid_df_with_label.pkl')