In [1]:
from __future__ import print_function, division
import os
import torch
import random
from torchvision.transforms import ToTensor, ToPILImage
import zipfile
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import RandomSampler, Sampler, Subset
from torchvision import transforms, utils
import torch.nn as nn
from tqdm import tqdm
from typing import Iterator, List, Callable, Tuple
from functools import partial
from math import *
from IPython.display import HTML
import pandas as pd
from sklearn.model_selection import train_test_split
from datetime import datetime
import time
import seaborn as sns
import random

from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, classification_report, roc_curve, auc, precision_recall_curve

from torch.optim.lr_scheduler import StepLR
from transformers import BatchEncoding

from matplotlib import rc, cm
rc('animation', html='jshtml')

import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import matplotlib.animation as animation
%matplotlib notebook

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pd.read_csv('./train_data.csv')
val_df = pd.read_csv('./val_data.csv')
test_df = pd.read_csv('./test_data.csv')

In [3]:
train_df['content'] = train_df['title'] + ' ' + train_df['text']
val_df['content'] = val_df['title'] + ' ' + val_df['text']
test_df['content'] = test_df['title'] + ' ' + test_df['text']

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [5]:
# torch.serialization.add_safe_globals([BatchEncoding])

if not os.path.exists("./bert_train_encodings.pt"):
    train_encodings = tokenizer(train_df['content'].tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
    val_encodings = tokenizer(val_df['content'].tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
    test_encodings = tokenizer(test_df['content'].tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
    torch.save(train_encodings, "./bert_train_encodings.pt")
    torch.save(val_encodings, "./bert_val_encodings.pt")
    torch.save(test_encodings, "./bert_test_encodings.pt")

else:
    train_encodings = torch.load("./bert_train_encodings.pt", weights_only= False)
    val_encodings = torch.load("./bert_val_encodings.pt", weights_only= False)
    test_encodings = torch.load("./bert_test_encodings.pt", weights_only= False)

In [6]:
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        #item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} # input_ids, attention_mask are the most important
        item = {}
        item['input_ids'] =  torch.tensor(self.encodings["input_ids"][idx])
        item['attention_mask'] = torch.tensor(self.encodings["attention_mask"][idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = FakeNewsDataset(train_encodings, train_df['label'].tolist())
val_dataset = FakeNewsDataset(val_encodings, val_df['label'].tolist())
test_dataset = FakeNewsDataset(test_encodings, test_df['label'].tolist())

random_indices_train = random.sample(range(len(train_dataset)), 4000) # 32000
random_indices_val = random.sample(range(len(val_dataset)), 500) # 4000

train_subset = Subset(train_dataset, random_indices_train)
val_subset = Subset(val_dataset, random_indices_val)
# test_subset = Subset(test_dataset, range(500))

train_loader = DataLoader(train_subset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [7]:
from transformers import BertForSequenceClassification
sample_indices = random.sample(range(len(test_dataset)), 10)
samples = [test_dataset[i] for i in sample_indices]
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.load_state_dict(torch.load('best_model_init_num_epochs-5lr-1e-05step_size-Nonegamma-None.pth', map_location=device))
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from captum.attr import IntegratedGradients, visualization

def compute_bert_outputs(model_bert, embedding_output, attention_mask):
    extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

    extended_attention_mask = extended_attention_mask.to(dtype=next(model_bert.parameters()).dtype)
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

    encoder_outputs = model_bert.encoder(embedding_output,
                                         extended_attention_mask)
    sequence_output = encoder_outputs[0]
    pooled_output = model_bert.pooler(sequence_output)
    outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]
    return outputs  


class BertModelWrapper(nn.Module):
    
    def __init__(self, model):
        super(BertModelWrapper, self).__init__()
        self.model = model
        
    def forward(self, embeddings, attention_mask=None):
        if attention_mask is None:
            attention_mask = torch.ones(embeddings.shape[0], embeddings.shape[1]).to(embeddings)
        outputs = compute_bert_outputs(self.model.bert, embeddings, attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.model.dropout(pooled_output)
        logits = self.model.classifier(pooled_output)
        return torch.softmax(logits, dim=1)[:, 1].unsqueeze(1)

def add_attributions_to_visualizer(attributions, tokens, pred, pred_ind, label, delta, vis_data_records):
    attributions_sum = attributions[0].sum(dim=2).squeeze(0)
    
    attributions_norm = attributions_sum / torch.norm(attributions_sum)
    
    attributions_list = attributions_norm.cpu().detach().numpy().tolist()
    
    vis_data_records.append(visualization.VisualizationDataRecord(
        attributions_list,
        pred,
        pred_ind,
        label,
        "label",
        attributions_norm.sum().item(),
        tokens[:len(attributions_list)],
        delta))
    
    
bert_model_wrapper = BertModelWrapper(model)
ig = IntegratedGradients(bert_model_wrapper)
bert_model_wrapper.eval()
vis_data_records_ig = []
for i, sample in enumerate(samples):
    if i == 6:
        input_ids = sample['input_ids'].to(device).unsqueeze(0)
        attention_mask = sample['attention_mask'].to(device).unsqueeze(0)
        true_label = sample['labels'].item()
        original_text = test_df.iloc[sample_indices[i]]['content']

        bert_model_wrapper.zero_grad()
        input_embedding = bert_model_wrapper.model.bert.embeddings(input_ids)
        pred = bert_model_wrapper(input_embedding, attention_mask).item()
        pred_ind = round(pred)
        attributions_ig, delta = ig.attribute((input_embedding, attention_mask), n_steps=50, return_convergence_delta=True)
        print(f"prediction: {pred_ind} ({pred}), label: {true_label}, delta: {delta}")

        tokens = tokenizer.convert_ids_to_tokens(input_ids[0].numpy().tolist())    
        add_attributions_to_visualizer(attributions_ig, tokens, pred, pred_ind, true_label, delta, vis_data_records_ig)
visualization.visualize_text(vis_data_records_ig)

prediction: 0 (0.0011901400284841657), label: 0, delta: tensor([0.4477])


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,0 (0.00),label,1.65,"[CLS] russia confirms putin - trump talk on joint cyber unit moscow - russia confirmed on monday that donald trump and vladimir putin had discussed forming a joint russian - u . s . group on cyber security , an idea that has provoked up ##ro ##ar in washington , but said it was only a tentative proposal . trump said on twitter early on sunday the two leaders discussed forming “ an imp ##ene ##tra ##ble cyber security unit ” when they met at the hamburg g ##20 summit . the idea was greeted with inc ##red ##uli ##ty by some senior republicans who said moscow could not be trusted - and the u . s . president later in the day t ##wee ##ted that he did not think it could happen . “ the heads of state did talk about such a possibility , ” k ##rem ##lin spokesman dmitry pe ##sko ##v told a conference call with reporters on monday . “ nothing was promised to each other , ” he added . “ what is positive , they stated their readiness to work in this direction . ” the conversation had been “ about the possibility of forming such a group ” , he said . “ whether it will be created or not , time will show . ” sv ##et ##lana lukas ##h , a russian official who was at the hamburg summit , told a news conference earlier on monday putin and trump ’ s discussion of cyber security had taken up 40 minutes of their meeting , which lasted more than two hours . “ president putin proposed creating a working group , ” she said . “ this does not mean it should start working immediately , virtually tomorrow . ” she added : “ the main thing is , this matter was discussed , the united states is ready to consider cooperation in this sphere , and then we will see . “ maybe this will be a working group , maybe this will be cooperation on the floor of the united nations . but in any case , our two countries will need to discuss these questions . this is namely what the presidents agreed upon . ” she said of the landmark talks between the two men in hamburg : “ nobody , except the participants of that meeting , knows how that proposal was formulated and how president trump reacted . ” some media reports may have prematurely assumed that the creation of a joint commission on cyber security was already decided , she said . “ that was a proposal . probably , he ( trump ) is not ready yet for this specific initiative at this stage , ” lukas ##h said . trump ’ s administration has been dogg ##ed by investigations into allegations of russian interference in last year ’ s u . s . election and ties with his campaign . pe ##sko ##v also told reporters on monday the k ##rem [SEP]"
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,0 (0.00),label,1.65,"[CLS] russia confirms putin - trump talk on joint cyber unit moscow - russia confirmed on monday that donald trump and vladimir putin had discussed forming a joint russian - u . s . group on cyber security , an idea that has provoked up ##ro ##ar in washington , but said it was only a tentative proposal . trump said on twitter early on sunday the two leaders discussed forming “ an imp ##ene ##tra ##ble cyber security unit ” when they met at the hamburg g ##20 summit . the idea was greeted with inc ##red ##uli ##ty by some senior republicans who said moscow could not be trusted - and the u . s . president later in the day t ##wee ##ted that he did not think it could happen . “ the heads of state did talk about such a possibility , ” k ##rem ##lin spokesman dmitry pe ##sko ##v told a conference call with reporters on monday . “ nothing was promised to each other , ” he added . “ what is positive , they stated their readiness to work in this direction . ” the conversation had been “ about the possibility of forming such a group ” , he said . “ whether it will be created or not , time will show . ” sv ##et ##lana lukas ##h , a russian official who was at the hamburg summit , told a news conference earlier on monday putin and trump ’ s discussion of cyber security had taken up 40 minutes of their meeting , which lasted more than two hours . “ president putin proposed creating a working group , ” she said . “ this does not mean it should start working immediately , virtually tomorrow . ” she added : “ the main thing is , this matter was discussed , the united states is ready to consider cooperation in this sphere , and then we will see . “ maybe this will be a working group , maybe this will be cooperation on the floor of the united nations . but in any case , our two countries will need to discuss these questions . this is namely what the presidents agreed upon . ” she said of the landmark talks between the two men in hamburg : “ nobody , except the participants of that meeting , knows how that proposal was formulated and how president trump reacted . ” some media reports may have prematurely assumed that the creation of a joint commission on cyber security was already decided , she said . “ that was a proposal . probably , he ( trump ) is not ready yet for this specific initiative at this stage , ” lukas ##h said . trump ’ s administration has been dogg ##ed by investigations into allegations of russian interference in last year ’ s u . s . election and ties with his campaign . pe ##sko ##v also told reporters on monday the k ##rem [SEP]"
,,,,
