In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as functional
import matplotlib.pyplot as plt
from transformers import BertForSequenceClassification, AdamW, BertConfig
import gc
from transformers import BertModel
from sklearn.metrics import roc_auc_score,f1_score
import time
import datetime
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Dataset location. The directory is assembled from its components so the
# notebook also works on POSIX systems, where a literal backslash is part of
# the file name rather than a path separator.
data_path = os.path.join('Q1', 'Data')
train_data_path = os.path.join(data_path, 'Train.csv')
val_data_path = os.path.join(data_path, 'Val.csv')

In [3]:
# Read both labelled CSV splits into DataFrames.
train, val = (pd.read_csv(csv_path) for csv_path in (train_data_path, val_data_path))

In [4]:
# Encode the string labels as integers (real -> 1, fake -> 0).
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run: without
# them a second execution would map the already-encoded integers to NaN.
label_to_id = {"real": 1, "fake": 0, 1: 1, 0: 0}
for split_name, frame in (("train", train), ("val", val)):
    encoded = frame["label"].map(label_to_id)
    # Fail loudly instead of letting unknown labels become NaN silently.
    if encoded.isna().any():
        raise ValueError("Unexpected label value(s) in the {} split".format(split_name))
    frame["label"] = encoded.astype("int64")

In [5]:
# Stack both splits into one frame with a fresh 0..n-1 index, then discard
# the `id` column. The trailing expression displays the result.
merged = pd.concat([train, val], axis=0, ignore_index=True)
data = merged.drop(columns=["id"])
data

Unnamed: 0,tweet,label
0,The CDC currently reports 99031 deaths. In gen...,1
1,States reported 1121 deaths a small rise from ...,1
2,Politically Correct Woman (Almost) Uses Pandem...,0
3,#IndiaFightsCorona: We have 1524 #COVID testin...,1
4,Populous states can generate large case counts...,1
...,...,...
8555,Donald Trump wrongly claimed that New Zealand ...,0
8556,Current understanding is #COVID19 spreads most...,1
8557,Nothing screams “I am sat around doing fuck al...,0
8558,Birx says COVID-19 outbreak not under control ...,0


In [6]:
from transformers import BertTokenizer

# Load the tokenizer matching the 'bert-base-uncased' checkpoint.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Loading BERT tokenizer...


In [7]:
# Pull the text and label columns out of the frame as arrays.
tweets = data["tweet"].values
labels = data["label"].values

In [8]:
import re
from string import punctuation
def preprocess(data):
    """Normalise raw tweets for tokenisation.

    Each tweet is lower-cased, its whitespace is collapsed, and hashtags,
    URLs, @mentions and ASCII punctuation are removed (in that order).

    Args:
        data: 1-D sequence of tweet strings (e.g. a NumPy object array).

    Returns:
        A new NumPy object array of the same length holding the cleaned
        strings. Every non-empty result ends with a single trailing space.
        The input is left untouched, so a DataFrame column that shares
        memory with ``data`` is not modified behind the caller's back.
    """
    # Patterns are compiled once instead of being re-parsed for every tweet.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        r'[!*,]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_pattern = re.compile(r'@[\w\-]+')
    hashtag_pattern = re.compile(r'#[\w\-]+')
    # re.escape is required here: unescaped, the `\]` inside
    # string.punctuation is read as an escaped bracket, so a backslash would
    # never be part of the class (and `[` triggers a nested-set warning).
    punctuation_pattern = re.compile('[{}]+'.format(re.escape(punctuation)))
    space_pattern = re.compile(r'\s+')

    cleaned = np.empty(len(data), dtype=object)
    for position, raw_text in enumerate(data):
        # Lower-case and rebuild the text as "word word ... " (trailing space).
        text = ''.join(word + ' ' for word in raw_text.lower().split())
        # remove url and hashtag
        text = hashtag_pattern.sub('', text)
        text = url_pattern.sub('', text)
        text = mention_pattern.sub('', text)
        # remove punctuation
        text = punctuation_pattern.sub('', text)
        # The removals above can leave runs of spaces behind.
        text = space_pattern.sub(' ', text)
        cleaned[position] = text
    return cleaned
# Clean every tweet; the returned array replaces `tweets` for all later cells.
tweets = preprocess(tweets)
print(tweets)

['the cdc currently reports 99031 deaths in general the discrepancies in death counts between different sources are small and explicable the death toll stands at roughly 100000 people today '
 'states reported 1121 deaths a small rise from last tuesday southern states reported 640 of those deaths '
 'politically correct woman almost uses pandemic as excuse not to reuse plastic bag '
 ...
 'nothing screams “i am sat around doing fuck all during lockdown” quite like confident assumption that other people are sat around doing fuck all during lockdown '
 'birx says covid19 outbreak not under control because ‘people are on the move’ '
 'another 4422 new coronavirus cases have been confirmed in the uk the highest daily number since 8 may its up from 4322 new cases reported on friday and the overall total nationwide now stands at 385936 read the latest here ']


In [9]:
# Show the first tweet in three forms: raw text, word-piece tokens, and ids.
first_tweet = tweets[0]
first_tokens = tokenizer.tokenize(first_tweet)

print(' Original: ', first_tweet)
print('Tokenized: ', first_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(first_tokens))

 Original:  the cdc currently reports 99031 deaths in general the discrepancies in death counts between different sources are small and explicable the death toll stands at roughly 100000 people today 
Tokenized:  ['the', 'cdc', 'currently', 'reports', '99', '##0', '##31', 'deaths', 'in', 'general', 'the', 'disc', '##re', '##pan', '##cies', 'in', 'death', 'counts', 'between', 'different', 'sources', 'are', 'small', 'and', 'ex', '##pl', '##ica', '##ble', 'the', 'death', 'toll', 'stands', 'at', 'roughly', '1000', '##00', 'people', 'today']
Token IDs:  [1996, 26629, 2747, 4311, 5585, 2692, 21486, 6677, 1999, 2236, 1996, 5860, 2890, 9739, 9243, 1999, 2331, 9294, 2090, 2367, 4216, 2024, 2235, 1998, 4654, 24759, 5555, 3468, 1996, 2331, 9565, 4832, 2012, 5560, 6694, 8889, 2111, 2651]


In [10]:
# Length statistics of the cleaned tweets.
# NOTE: len(tweet) counts characters, not BERT tokens, so these numbers are
# only a rough guide for choosing the tokenizer's max_length.
# Lengths are computed once instead of once per threshold.
tweet_lengths = [len(tweet) for tweet in tweets]
ind = [100,200,300,400,500,512]
for i in ind:
  count = sum(1 for length in tweet_lengths if length > i)
  print("Count of sentence length over {} is: ".format(i), count)
max_len = max(tweet_lengths, default=0)
print('Max sentence length: ', max_len)

Count of sentence length over 100 is:  5667
Count of sentence length over 200 is:  2327
Count of sentence length over 300 is:  59
Count of sentence length over 400 is:  20
Count of sentence length over 500 is:  13
Count of sentence length over 512 is:  10
Max sentence length:  8672


In [11]:
# Tokenise every tweet to a fixed length of 512 ids and build attention masks.
input_ids = []
attention_masks = []
for tweet in tweets:
    encoded_dict = tokenizer.encode_plus(
                        tweet,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,           # Pad & truncate all sentences.
                        padding = 'max_length',     # Replaces the deprecated pad_to_max_length.
                        truncation = True,          # Explicit; avoids the truncation warning.
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])
# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# Read the labels straight from the frame: the name `labels` is reused for
# per-batch tensors by the training cells, so converting that variable would
# give wrong results if this cell were re-run after training.
labels = torch.tensor(data.label.values)

# Print sentence 0, now as a list of IDs.
print('Original: ', tweets[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  the cdc currently reports 99031 deaths in general the discrepancies in death counts between different sources are small and explicable the death toll stands at roughly 100000 people today 
Token IDs: tensor([  101,  1996, 26629,  2747,  4311,  5585,  2692, 21486,  6677,  1999,
         2236,  1996,  5860,  2890,  9739,  9243,  1999,  2331,  9294,  2090,
         2367,  4216,  2024,  2235,  1998,  4654, 24759,  5555,  3468,  1996,
         2331,  9565,  4832,  2012,  5560,  6694,  8889,  2111,  2651,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
         

In [12]:
from torch.utils.data import TensorDataset, random_split

# Bundle ids, masks and labels so they are indexed together.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 90% of the rows go to training, the remainder to validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# A seeded generator keeps the random split reproducible.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=split_generator,
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

7,704 training samples
  856 validation samples


In [13]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

# Training batches are reshuffled each epoch; validation keeps a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [14]:
def format_time(elapsed):
    """Return a duration given in seconds as a ``timedelta`` string.

    The value is rounded to the nearest whole second before formatting
    (e.g. ``3661.4`` becomes ``'1:01:01'``).
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

In [15]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Pretrained BERT encoder with a 2-class classification head; attention maps
# and hidden states are not returned.
bert_classifier_options = dict(
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    **bert_classifier_options,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Hyper-parameters for fine-tuning the plain BERT classifier.
epochs = 4
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)



In [None]:
import copy
import random
import numpy as np

# Fine-tune `model` for `epochs` epochs and evaluate it on the validation
# loader after every epoch, keeping a snapshot of the best epoch.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# No earlier cell defines `device`, yet the loops below need it; create it
# here and keep the model and every batch on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

training_stats = []
total_t0 = time.time()
best_accuracy = 0
for epoch_i in range(0, epochs):
    #Training
    print("")
    print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0
    model.train()
    for step, batch in enumerate(train_dataloader):

        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        labels = batch[2].to(device)

        model.zero_grad()
        # With `labels` supplied, the first two outputs are read as (loss, logits).
        out = model(input_ids, token_type_ids=None, attention_mask=input_mask, labels=labels)
        loss = out[0]
        logits = out[1]

        total_train_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        pred = torch.argmax(logits, dim = 1)
        total_train_accuracy +=  torch.sum(pred == labels).item()

    # Accuracy is a count of correct samples, so it is divided by the number
    # of samples. The loss is one scalar per batch, so it is averaged over
    # the number of batches.
    avg_train_accuracy = total_train_accuracy / len(train_dataloader.dataset)
    avg_train_loss = total_train_loss / len(train_dataloader)
    print("  Accuracy: {}".format(avg_train_accuracy))
    print("  Training loss: {}".format(avg_train_loss))


    # Validation
    print("")
    print("Validation...")
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    y_true = []
    y_pred = []
    y_score = []

    for batch in validation_dataloader:
        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        labels = batch[2].to(device)

        with torch.no_grad():
            out = model(input_ids, token_type_ids=None, attention_mask=input_mask,labels=labels)
            loss = out[0]
            logits = out[1]

        total_eval_loss += loss.item()
        pred = torch.argmax(logits, dim = 1)
        total_eval_accuracy += torch.sum(pred == labels).item()
        y_true.append(labels.flatten())
        y_pred.append(pred.flatten())
        # Class-1 probability: ROC AUC needs a continuous score, not hard labels.
        y_score.append(torch.softmax(logits, dim = 1)[:, 1])

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader.dataset)
    print("  Accuracy: {}".format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print("  Validation loss: {}".format(avg_val_loss))
    # Measured from the start of the epoch, so it covers training + validation.
    training_time = format_time(time.time() - t0)
    print()

    y_true = torch.cat(y_true).tolist()
    y_pred = torch.cat(y_pred).tolist()
    y_score = torch.cat(y_score).tolist()
    print("This epoch took: {:}".format(training_time))
    print('roc_auc score: ', roc_auc_score(y_true,y_score))
    print('F1 score:',f1_score(y_true, y_pred))
    print()

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Train Accur.': avg_train_accuracy,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
        }
    )
    print()

    if avg_val_accuracy > best_accuracy:
        best_accuracy = avg_val_accuracy
        # Deep copy: a plain `best_model = model` is only an alias and would
        # follow the weights through every later epoch.
        best_model = copy.deepcopy(model)

print()
print("="*10)
print("Summary")
print("Total time {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Epoch 1 / 4
Training...


In [None]:
# torch.save needs a file path; os.getcwd() alone is a directory and cannot
# be opened for writing. The file name follows the later "lstm.pt" cells.
PATH1 = os.path.join(os.getcwd(), "bert.pt")
torch.save(model, PATH1)

In [None]:
class BertLstmClassifier(nn.Module):
    """BERT encoder followed by a bidirectional LSTM and a 2-way linear head.

    Args:
        model_tune: object exposing a ``.bert`` attribute. The encoder is
            reused by reference, not copied, so training this classifier
            also updates the encoder held by ``model_tune``.
        num_layers: number of stacked LSTM layers (default 1).
    """
    def __init__(self, model_tune, num_layers = 1):
        super().__init__()
        self.bert = model_tune.bert
        # NOTE(review): 768 is hard-coded; presumably the encoder's hidden size.
        self.lstm = nn.LSTM(input_size = 768,
                            hidden_size = 768,
                            num_layers = num_layers,
                            batch_first = True,
                            bidirectional = True)
        self.classifier = nn.Linear(768 * 2, 2)
        # Not applied in forward(); kept so callers can turn logits into
        # probabilities with `model.softmax(logits)`.
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return raw class logits of shape (batch, 2).

        Logits rather than probabilities are returned because the training
        cells feed this output to nn.CrossEntropyLoss, which normalises its
        input itself; a softmax here would be applied twice. argmax-based
        predictions are unaffected.
        """
        bert_output = self.bert(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
        out, _ = self.lstm(bert_output[0])
        # NOTE(review): index 1 selects the LSTM output at the second sequence
        # position, not the first or last one -- confirm this is intended.
        return self.classifier(out[:, 1, :])

In [None]:
# Initializing model
# `device` is read by the training and evaluation cells below but is not
# defined by any earlier cell; define it here and fall back to the CPU when
# CUDA is unavailable instead of hard-coding .cuda().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): `the_best_model` is not assigned in any cell visible here; the
# first training loop produces `best_model` -- confirm which object is meant.
model4 = BertLstmClassifier(the_best_model).to(device)
# set parameters
epochs = 6
learning_rate = 5e-5
optimizer = AdamW(model4.parameters(), lr = learning_rate)
criterion = nn.CrossEntropyLoss()

In [None]:
import copy
import random
import numpy as np

# Train `model4` for `epochs` epochs and evaluate it on the validation loader
# after every epoch, keeping a snapshot of the best epoch.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Send batches to wherever the model's parameters already live.
device = next(model4.parameters()).device

training_stats = []
total_t0 = time.time()
best_accuracy = 0
for epoch_i in range(0, epochs):
    #Training
    print("")
    print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0
    model4.train()
    for step, batch in enumerate(train_dataloader):

        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        labels = batch[2].to(device)

        model4.zero_grad()
        out = model4(input_ids = input_ids, attention_mask = input_mask, token_type_ids = None)
        loss = criterion(out, labels)
        total_train_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model4.parameters(), 1.0)
        optimizer.step()

        pred = torch.argmax(out, dim = 1)
        total_train_accuracy +=  torch.sum(pred == labels).item()

    # Accuracy is a count of correct samples (divide by samples); the loss is
    # one scalar per batch (divide by the number of batches).
    avg_train_accuracy = total_train_accuracy / len(train_dataloader.dataset)
    avg_train_loss = total_train_loss / len(train_dataloader)
    print("  Accuracy: {}".format(avg_train_accuracy))
    print("  Training loss: {}".format(avg_train_loss))

    # Validation
    print("")
    print("Validation...")
    model4.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    y_true = []
    y_pred = []
    y_score = []

    for batch in validation_dataloader:
        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        labels = batch[2].to(device)

        with torch.no_grad():
            out = model4(input_ids = input_ids, attention_mask = input_mask, token_type_ids = None)
            loss = criterion(out, labels)
        total_eval_loss += loss.item()
        pred = torch.argmax(out, dim = 1)
        total_eval_accuracy += torch.sum(pred == labels).item()
        y_true.append(labels.flatten())
        y_pred.append(pred.flatten())
        # Class-1 margin as a continuous ROC score. It ranks samples like the
        # class-1 probability whether `out` holds probabilities or logits.
        y_score.append(out[:, 1] - out[:, 0])

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader.dataset)
    print("  Accuracy: {}".format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print("  Validation loss: {}".format(avg_val_loss))
    # Measured from the start of the epoch, so it covers training + validation.
    training_time = format_time(time.time() - t0)
    print("  This epoch took: {:}".format(training_time))
    print()
    y_true = torch.cat(y_true).tolist()
    y_pred = torch.cat(y_pred).tolist()
    y_score = torch.cat(y_score).tolist()
    print('  roc_auc score: ', roc_auc_score(y_true,y_score))
    print('  F1 score:',f1_score(y_true, y_pred))


    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Train Accur.': avg_train_accuracy,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
        }
    )

    if avg_val_accuracy > best_accuracy:
        best_accuracy = avg_val_accuracy
        # Deep copy: a plain `best_model = model4` is only an alias and would
        # follow the weights through every later epoch.
        best_model = copy.deepcopy(model4)

print("===")
print("Summary")
print("Total time {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
print('best acc:',best_accuracy)



In [None]:
# Pickle the whole model4 object (not just a state_dict) into the working directory.
# NOTE(review): this stores `model4` as it is after the final epoch, not
# `best_model` -- confirm that is the intended checkpoint.
PATH4 = os.path.join(os.getcwd(), "lstm.pt")
torch.save(model4, PATH4)

In [None]:
class BertLstmClassifier(nn.Module):
    """BERT encoder followed by a bidirectional LSTM and a 2-way linear head.

    Args:
        model_tune: object exposing a ``.bert`` attribute. The encoder is
            reused by reference, not copied, so changes made through this
            classifier also affect the encoder held by ``model_tune``.
        num_layers: number of stacked LSTM layers (default 2).
    """
    def __init__(self, model_tune, num_layers = 2):
        super().__init__()
        self.bert = model_tune.bert
        # NOTE(review): 768 is hard-coded; presumably the encoder's hidden size.
        self.lstm = nn.LSTM(input_size = 768,
                            hidden_size = 768,
                            num_layers = num_layers,
                            batch_first = True,
                            bidirectional = True)
        self.classifier = nn.Linear(768 * 2, 2)
        # Not applied in forward(); kept so callers can turn logits into
        # probabilities with `model.softmax(logits)`.
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return raw class logits of shape (batch, 2).

        Logits rather than probabilities are returned because the training
        cells feed this output to nn.CrossEntropyLoss, which normalises its
        input itself; a softmax here would be applied twice. argmax-based
        predictions are unaffected.
        """
        bert_output = self.bert(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
        out, _ = self.lstm(bert_output[0])
        # NOTE(review): index 1 selects the LSTM output at the second sequence
        # position, not the first or last one -- confirm this is intended.
        return self.classifier(out[:, 1, :])

In [None]:
# `device` is read by the training and evaluation cells below but is not
# defined by any earlier cell; define it here and fall back to the CPU when
# CUDA is unavailable instead of hard-coding .cuda().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): `the_best_model` is not assigned in any cell visible here --
# confirm which object is meant.
model5 = BertLstmClassifier(the_best_model).to(device)
# Freeze the encoder so only the LSTM and the linear head are trained. The
# encoder object is shared with `the_best_model`, so it is frozen there too.
for param in model5.bert.parameters():
    param.requires_grad = False
# set parameters
epochs = 10
learning_rate = 5e-5
optimizer = AdamW(model5.parameters(), lr = learning_rate)
criterion = nn.CrossEntropyLoss()

In [None]:
import copy
import random
import numpy as np

# Train `model5` for `epochs` epochs and evaluate it on the validation loader
# after every epoch, keeping a snapshot of the best epoch.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Send batches to wherever the model's parameters already live.
device = next(model5.parameters()).device

training_stats = []
total_t0 = time.time()
best_accuracy = 0
for epoch_i in range(0, epochs):
    #Training
    print("")
    print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0
    model5.train()
    for step, batch in enumerate(train_dataloader):

        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        labels = batch[2].to(device)

        model5.zero_grad()
        out = model5(input_ids = input_ids, attention_mask = input_mask, token_type_ids = None)
        loss = criterion(out, labels)
        total_train_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model5.parameters(), 1.0)
        optimizer.step()

        pred = torch.argmax(out, dim = 1)
        total_train_accuracy +=  torch.sum(pred == labels).item()

    # Accuracy is a count of correct samples (divide by samples); the loss is
    # one scalar per batch (divide by the number of batches).
    avg_train_accuracy = total_train_accuracy / len(train_dataloader.dataset)
    avg_train_loss = total_train_loss / len(train_dataloader)
    print("  Accuracy: {}".format(avg_train_accuracy))
    print("  Training loss: {}".format(avg_train_loss))

    # Validation
    print("")
    print("Validation...")
    model5.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    y_true = []
    y_pred = []
    y_score = []

    for batch in validation_dataloader:
        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        labels = batch[2].to(device)

        with torch.no_grad():
            out = model5(input_ids = input_ids, attention_mask = input_mask, token_type_ids = None)
            loss = criterion(out, labels)
        total_eval_loss += loss.item()
        pred = torch.argmax(out, dim = 1)
        total_eval_accuracy += torch.sum(pred == labels).item()
        y_true.append(labels.flatten())
        y_pred.append(pred.flatten())
        # Class-1 margin as a continuous ROC score. It ranks samples like the
        # class-1 probability whether `out` holds probabilities or logits.
        y_score.append(out[:, 1] - out[:, 0])

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader.dataset)
    print("  Accuracy: {}".format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print("  Validation loss: {}".format(avg_val_loss))
    # Measured from the start of the epoch, so it covers training + validation.
    training_time = format_time(time.time() - t0)
    print("  This epoch took: {:}".format(training_time))
    print()
    y_true = torch.cat(y_true).tolist()
    y_pred = torch.cat(y_pred).tolist()
    y_score = torch.cat(y_score).tolist()
    print('  roc_auc score: ', roc_auc_score(y_true,y_score))
    print('  F1 score:',f1_score(y_true, y_pred))


    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Train Accur.': avg_train_accuracy,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
        }
    )

    if avg_val_accuracy > best_accuracy:
        best_accuracy = avg_val_accuracy
        # Deep copy: a plain `best_model = model5` is only an alias and would
        # follow the weights through every later epoch.
        best_model = copy.deepcopy(model5)

print("===")
print("Summary")
print("Total time {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
print('best acc:',best_accuracy)

In [None]:
# Pickle the whole model5 object (not just a state_dict) into the working directory.
# NOTE(review): this stores `model5` as it is after the final epoch, not
# `best_model` -- confirm that is the intended checkpoint.
PATH5 = os.path.join(os.getcwd(), "lstm2.pt")
torch.save(model5, PATH5)

In [None]:
# Load a pickled model onto the CPU; this overwrites both `PATH1` and `best_model`.
# NOTE(review): the file name contains literal single quotes ('lstm2.pt') and
# the path is an absolute Drive location, while the save cell writes
# "lstm2.pt" to the working directory -- confirm the path.
# NOTE(review): torch.load unpickles arbitrary objects; only load trusted files.
PATH1 = "/content/drive/Shareddrives/CS682 Project/'lstm2.pt'"
best_model = torch.load(PATH1,map_location='cpu')

In [None]:
from collections import Counter

# Count how often each token id occurs in the validation tweets that the
# model assigns to class 1.
model = best_model
#model.to(device)
model.eval()
# Keep the batches on the same device as the model's parameters.
model_device = next(model.parameters()).device
# Counter is a dict subclass, so later `.items()` calls keep working.
word_dict = Counter()
for batch_index, batch in enumerate(validation_dataloader):
        # The TensorDataset built earlier yields (input_ids, attention_mask,
        # labels); there is no token_type_ids tensor in the batch.
        input_ids = batch[0].to(model_device)
        attention_mask = batch[1].to(model_device)
        labels = batch[2].to(model_device)
        # forward
        with torch.no_grad():
            res = model(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = None)
        # prediction
        # Accept either a bare score tensor or a tuple-like output whose first
        # element holds the scores.
        output = res if torch.is_tensor(res) else res[0]
        pred = torch.argmax(output, dim = 1)
        # Count each class-1 tweet once, skipping id 0 (the padding value).
        # The earlier nested loops re-counted every tweet in the batch once
        # per class-1 prediction.
        for row in input_ids[pred == 1]:
            word_dict.update(row[row != 0].tolist())

In [None]:
# (token id, count) pairs ordered from most to least frequent.
dict2 = sorted(
    word_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Ids of the (up to) 100 most frequent tokens. Slicing avoids the IndexError
# the fixed range(100) raised when fewer than 100 distinct ids were counted.
word_list = [token_id for token_id, _ in dict2[:100]]

In [None]:
# Map the selected ids back to their vocabulary strings for inspection.
tokens = tokenizer.convert_ids_to_tokens(word_list)
print(tokens)

In [None]:
#draw word cloud
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)
# Build the word-cloud input: every token is repeated as often as it was
# counted, each occurrence preceded by one space.
# NOTE(review): the 14 most frequent entries are skipped; the reason for 14
# is not visible here -- confirm.
pieces = []
for token_id, frequency in dict2[14:]:
    # Look the token up once per id instead of once per occurrence.
    word = tokenizer.convert_ids_to_tokens(token_id)
    pieces.append((" " + word) * frequency)
# A single join replaces the repeated `+=`, which is quadratic in the text length.
all_text = "".join(pieces)

In [None]:
# Configure the cloud, then lay it out from the repeated-token text.
wordcloud = WordCloud(
    collocations=False,
    width=800,
    height=300,
    stopwords=stopwords,
    background_color='white',
    min_font_size=10,
)
wordcloud.generate(all_text)

In [None]:
import matplotlib.pyplot as plt

# Render the word cloud on a borderless 8x3 inch figure.
fig, ax = plt.subplots(figsize=(8, 3), facecolor=None)
ax.imshow(wordcloud)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

In [None]:
# Average each row of the word-embedding matrix over axis 1, giving one scalar per row.
# NOTE(review): `the_final_model` is only assigned by a torch.load cell further
# down, so this cell fails when the cells are run top to bottom.
weights = torch.mean(the_final_model.bert.embeddings.word_embeddings.weight,axis = 1)
weights

In [None]:
# Rank rows by their mean embedding value and keep the indices of the top 200.
# The sorted values get their own name: binding them to `sorted` shadowed the
# builtin and broke any later (or re-run) cell that calls sorted(...).
sorted_weights, indices = torch.sort(weights,descending = True)
target_list = indices[0:200].tolist()

In [None]:
# Turn the 200 selected indices into vocabulary strings; overwrites the earlier `tokens`.
tokens = tokenizer.convert_ids_to_tokens(target_list)
print(tokens)

In [None]:
# NOTE(review): repeats the assignment made two cells earlier; likely redundant.
target_list = indices[0:200].tolist()

In [None]:
import string
from wordcloud import WordCloud, STOPWORDS

# Drop stop words and single punctuation characters from `tokens`.
# The `string` module was never imported (only `punctuation` was), so
# `string.punctuation` raised NameError. The set now has its own name
# instead of rebinding `string`.
stopwords = set(STOPWORDS)
punctuation_marks = set(string.punctuation)
u = stopwords.union(punctuation_marks)
L = [i for i in tokens if i not in u]

In [None]:
class BertLstmClassifier(nn.Module):
    """BERT encoder followed by a bidirectional LSTM and a 2-way linear head.

    Args:
        model_tune: object exposing a ``.bert`` attribute. The encoder is
            reused by reference, not copied, so changes made through this
            classifier also affect the encoder held by ``model_tune``.
        num_layers: number of stacked LSTM layers (default 2).
    """
    def __init__(self, model_tune, num_layers = 2):
        super().__init__()
        self.bert = model_tune.bert
        # NOTE(review): 768 is hard-coded; presumably the encoder's hidden size.
        self.lstm = nn.LSTM(input_size = 768,
                            hidden_size = 768,
                            num_layers = num_layers,
                            batch_first = True,
                            bidirectional = True)
        self.classifier = nn.Linear(768 * 2, 2)
        # Not applied in forward(); kept so callers can turn logits into
        # probabilities with `model.softmax(logits)`.
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return raw class logits of shape (batch, 2).

        Logits rather than probabilities are returned because the training
        cells feed this output to nn.CrossEntropyLoss, which normalises its
        input itself; a softmax here would be applied twice. argmax-based
        predictions are unaffected.
        """
        bert_output = self.bert(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
        out, _ = self.lstm(bert_output[0])
        # NOTE(review): index 1 selects the LSTM output at the second sequence
        # position, not the first or last one -- confirm this is intended.
        return self.classifier(out[:, 1, :])

In [None]:
# Load the pickled model onto the CPU as `the_final_model`.
# NOTE(review): the file name contains literal single quotes ('lstm2.pt') and
# the path is an absolute Drive location, while the save cell writes
# "lstm2.pt" to the working directory -- confirm the path.
# NOTE(review): torch.load unpickles arbitrary objects; only load trusted files.
PATH1 = "/content/drive/Shareddrives/CS682 Project/'lstm2.pt'"
the_final_model = torch.load(PATH1,map_location='cpu')

In [None]:

# Hard-coded list of 100 (token id, count) pairs in descending count order.
# NOTE(review): presumably a pasted snapshot of the top of `dict2` from an
# earlier run -- confirm; it is not recomputed by this notebook.
# It contains 101 and 102, the ids that open and close the encoded example
# printed earlier, so removing "these ids" also removes those markers.
dic = [(1996, 8249),
 (101, 7440),
 (102, 7440),
 (1997, 5349),
 (1999, 4686),
 (2000, 4605),
 (1037, 3765),
 (1998, 3358),
 (2522, 2539),
 (17258, 2406),
 (2003, 2378),
 (2005, 2195),
 (16147, 2080),
 (3572, 2016),
 (2024, 1975),
 (2008, 1649),
 (21887, 1480),
 (2013, 1431),
 (2057, 1407),
 (2015, 1348),
 (2006, 1338),
 (2047, 1275),
 (23350, 1271),
 (2007, 1205),
 (2031, 1204),
 (2004, 1117),
 (2023, 1101),
 (1521, 941),
 (2012, 924),
 (2038, 923),
 (2011, 919),
 (6677, 883),
 (2009, 868),
 (5852, 832),
 (100, 813),
 (2256, 806),
 (2045, 805),
 (2111, 794),
 (2025, 769),
 (2022, 727),
 (2062, 724),
 (2651, 706),
 (2193, 677),
 (4484, 664),
 (2163, 648),
 (2064, 640),
 (2042, 639),
 (2988, 630),
 (2102, 609),
 (2561, 581),
 (2019, 580),
 (2030, 576),
 (2001, 574),
 (2085, 553),
 (2035, 539),
 (1055, 534),
 (2040, 515),
 (2902, 507),
 (2740, 498),
 (2110, 493),
 (23713, 493),
 (1016, 485),
 (2017, 485),
 (2065, 484),
 (5604, 478),
 (2039, 476),
 (2020, 471),
 (1015, 462),
 (2097, 456),
 (2084, 454),
 (2634, 447),
 (2053, 430),
 (2021, 409),
 (2058, 400),
 (5022, 400),
 (2487, 387),
 (2027, 373),
 (2692, 372),
 (2115, 372),
 (2037, 371),
 (2951, 370),
 (2029, 362),
 (2575, 362),
 (3207, 361),
 (2055, 355),
 (2080, 354),
 (3189, 354),
 (6090, 353),
 (3231, 352),
 (4609, 347),
 (2683, 346),
 (7712, 340),
 (2420, 339),
 (1017, 334),
 (2553, 333),
 (2154, 332),
 (2549, 321),
 (17404, 321),
 (2050, 318),
 (2069, 318)]

In [None]:
# Hand-picked words to ablate; overwrites the `word_list` built from `dict2` earlier.
word_list =['cases', 'corona', 'news', 'deaths', 'tests','today','confirmed','reported','states','total']

In [None]:
# Vocabulary ids for the hand-picked words.
# NOTE(review): the next cell reassigns `ids_list` from `dic`, so this value
# is only used if that cell is skipped.
ids_list = tokenizer.convert_tokens_to_ids(word_list)
print(ids_list)

In [None]:
# Keep only the token id from every (token id, count) pair in `dic`.
ids_list = [token_id for token_id, _count in dic]

In [None]:
# Ablation study: for every validation tweet that contains at least one id
# from `ids_list`, compare accuracy on the original ids with accuracy after
# those ids are overwritten with 0.
# `device` is not defined by any earlier cell; create it here and fall back
# to the CPU when CUDA is unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = the_final_model.to(device)
print("Validation...")
model.eval()
total_eval_accuracy = 0
total_eval_accuracy1 = 0
total_num = 0
ablate_ids = torch.tensor(ids_list, dtype=torch.long, device=device)
for batch in validation_dataloader:
    input_ids = batch[0].to(device)
    input_mask = batch[1].to(device)
    labels = batch[2].to(device)

    # hit[b, t] is True where token t of tweet b is one of the ablated ids.
    hit = (input_ids.unsqueeze(-1) == ablate_ids).any(dim=-1)
    # Only tweets containing at least one such id take part in the comparison.
    selected = hit.any(dim=1)
    if not selected.any():
      continue

    input_list = input_ids[selected]
    label_list = labels[selected]
    mask_list = input_mask[selected]
    # masked_fill returns a new tensor, so `input_list` keeps the original
    # ids. The earlier code zeroed the rows in place after appending views of
    # them, so both forward passes saw the ablated text.
    # NOTE(review): the attention mask is left unchanged, so the zeroed
    # positions are still attended to -- confirm this is intended.
    input_list_r = input_list.masked_fill(hit[selected], 0)

    with torch.no_grad():
      out = model(input_ids = input_list, attention_mask = mask_list, token_type_ids = None)
      out1 = model(input_ids = input_list_r, attention_mask = mask_list, token_type_ids = None)
    pred = torch.argmax(out, dim = 1)
    total_eval_accuracy += torch.sum(pred == label_list).item()

    pred1 = torch.argmax(out1, dim = 1)
    total_eval_accuracy1 += torch.sum(pred1 == label_list).item()

    total_num += input_list.size(0)

# Guard against the case where no tweet contained any of the ids.
if total_num == 0:
    avg_val_accuracy = float("nan")
    avg_val_accuracy1 = float("nan")
else:
    avg_val_accuracy = total_eval_accuracy / total_num
    avg_val_accuracy1 = total_eval_accuracy1 / total_num
print("  Accuracy: {}".format(avg_val_accuracy))
print("  Removing Accuracy: {}".format(avg_val_accuracy1))