In [7]:
import pandas as pd
import os
train_data=pd.read_csv('propaganda_train.tsv',sep='\t')
test_data=pd.read_csv('propaganda_val.tsv',sep='\t')
print(train_data)
print(train_data)
training_df=pd.DataFrame(train_data,columns=["tagged_in_context","label"])
testing_df=pd.DataFrame(test_data,columns=["tagged_in_context","label"])

print(training_df)
labellist=sorted(list(set(training_df['label'].unique()).union(set(testing_df['label'].unique()))))

labels={label:i for i,label in enumerate(labellist)}
labels
reverse_index={value:key for (key,value)in labels.items()}
training_df.head()

                      label                                  tagged_in_context
0            not_propaganda         No, <BOS> he <EOS> will not be confirmed. 
1            not_propaganda  This declassification effort <BOS> won’t make ...
2               flag_waving  The Obama administration misled the <BOS> Amer...
3            not_propaganda  “It looks like we’re capturing the demise of t...
4            not_propaganda           <BOS> Location: Westerville, Ohio <EOS> 
...                     ...                                                ...
2409         not_propaganda  <BOS> We support and appreciate <EOS> your bus...
2410         not_propaganda  International Atomic Energy Agency (IAEA) Dire...
2411         not_propaganda  What has been done: there has been work on for...
2412         not_propaganda  This is <BOS> the law of gradualness not the g...
2413  name_calling,labeling  In it, Jews are described as: “arrogant,” “jea...

[2414 rows x 2 columns]
                      label

Unnamed: 0,tagged_in_context,label
0,"No, <BOS> he <EOS> will not be confirmed.",not_propaganda
1,This declassification effort <BOS> won’t make ...,not_propaganda
2,The Obama administration misled the <BOS> Amer...,flag_waving
3,“It looks like we’re capturing the demise of t...,not_propaganda
4,"<BOS> Location: Westerville, Ohio <EOS>",not_propaganda


In [8]:
import torch
import numpy as np
from transformers import BertTokenizer
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')

class Dataset(torch.utils.data.Dataset):

    def __init__(self,df,column='tagged_in_context'):
        self.labels=[labels[label] for label in df['label']]
        self.texts=[tokenizer(text.lower(),padding='max_length',max_length=512,truncation=True,return_tensors="pt") for text in df[column]]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self,idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self,idx):
        return self.texts[idx]

    def __getitem__(self,idx):
        batch_texts=self.get_batch_texts(idx)
        batch_y=self.get_batch_labels(idx)

        return batch_texts,batch_y


train_data=Dataset(training_df)
test_data=Dataset(testing_df)
train_data[0]
train_data.texts[0]
my_input = train_data.texts[0].input_ids
my_input
my_tokens=tokenizer.convert_ids_to_tokens(my_input[0])
my_tokens

['[CLS]',
 'no',
 ',',
 '<',
 'bo',
 '##s',
 '>',
 'he',
 '<',
 'e',
 '##os',
 '>',
 'will',
 'not',
 'be',
 'confirmed',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '

In [9]:
import torch
use_cuda=torch.cuda.is_available()
if use_cuda:
  print("GPU acceleration enabled")
else:
  print("GPU acceleration NOT enabled.  If using Colab, have you changed the runtype type and selected GPU as the hardware accelerator?")
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()
# get number of GPUs available
torch.cuda.device_count() # returns 1 in my case



GPU acceleration NOT enabled.  If using Colab, have you changed the runtype type and selected GPU as the hardware accelerator?
Using device: cpu



0

In [10]:
def prepare_inputs(input1,label,device):
  label=label.to(device)
  mask=input1['attention_mask'].to(device)
  input_id=input1['input_ids'].squeeze(1).to(device)
  return (input_id,mask,label)

In [11]:
from transformers import BertModel
train_dataloader=torch.utils.data.DataLoader(train_data,batch_size=2,shuffle=True)
bert=BertModel.from_pretrained('bert-base-uncased')
for train_input,train_label in train_dataloader:
    input_id,mask,label=prepare_inputs(train_input,train_label,device)
    output=bert(input_ids=input_id,attention_mask=mask,return_dict=False)
    break

print(input_id,mask,label)
print(len(output))
output[1]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[ 101, 2009, 2001,  ...,    0,    0,    0],
        [ 101, 2002, 3728,  ...,    0,    0,    0]]) tensor([[[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]]]) tensor([7, 7], dtype=torch.int32)
2


tensor([[-0.7184, -0.4232, -0.9672,  ..., -0.8875, -0.6072,  0.5026],
        [-0.9450, -0.6086, -0.9665,  ..., -0.8290, -0.6635,  0.8852]],
       grad_fn=<TanhBackward0>)

In [12]:
#now we need to put a simple classification layer on top of BERT

from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self,dropout=0.5,num_classes=2):

        super(BertClassifier,self).__init__()

        self.bert=BertModel.from_pretrained('bert-base-uncased')
        self.dropout=nn.Dropout(dropout)
        self.linear=nn.Linear(768,num_classes)
        self.relu=nn.ReLU()

    def forward(self,input_id,mask):

        last_hidden_layer,pooled_output = self.bert(input_ids=input_id,attention_mask=mask,return_dict=False)
        dropout_output=self.dropout(pooled_output)
        linear_output=self.linear(dropout_output)
        final_layer=self.relu(linear_output)

        return final_layer

In [13]:
#we now need a training loop

from torch.optim import Adam
from tqdm import tqdm



def train(model, train_data,val_data,learning_rate,epochs):

    train_dataloader=torch.utils.data.DataLoader(train_data,batch_size=2,shuffle=True)
    val_dataloader=torch.utils.data.DataLoader(test_data,batch_size=2)

    use_cuda=torch.cuda.is_available()
    device=torch.device("cuda" if use_cuda else "cpu")

    criterion=nn.CrossEntropyLoss()
    optimizer=Adam(model.parameters(),lr=learning_rate)

    if use_cuda:
        model=model.cuda()
        criterion=criterion.cuda()

    for epoch_num in range(epochs):
        total_acc_train=0
        total_loss_train=0
        model.train()
        for train_input,train_label in tqdm(train_dataloader):

            input_id,mask, train_label=prepare_inputs(train_input,train_label,device)

            output=model(input_id,mask)

            batch_loss=criterion(output,train_label.long())
            total_loss_train +=batch_loss.item()

            acc=(output.argmax(dim=1)==train_label).sum().item()
            total_acc_train+=acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val=0
        total_loss_val=0
        model.eval()
        with torch.no_grad():
            for val_input,val_label in val_dataloader:

                input_id,mask, val_label=prepare_inputs(val_input,val_label,device)

                output=model(input_id,mask)

                batch_loss=criterion(output,val_label.long())

                total_loss_val+=batch_loss.item()

                acc=(output.argmax(dim=1)==val_label).sum().item()
                total_acc_val+=acc

        print(f'Epochs: {epoch_num+1} | Train Loss: {total_loss_train / len(train_data):.3f} | Train Accuracy: {total_acc_train/len(train_data):.3f}')
        print(f'Val loss: {total_loss_val/len(val_data):.3f} | Val Accuracy: {total_acc_val / len(val_data):.3f}')


In [14]:
EPOCHS=1
model=BertClassifier(num_classes=len(labels.keys()))
LR=1e-6

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
train(model,train_data,test_data,LR,EPOCHS)

 30%|██▉       | 362/1207 [34:50<1:21:20,  5.78s/it]


KeyboardInterrupt: 

In [None]:
output_dir="bert-base-uncased-bookclassifier"
torch.save(model,output_dir)

In [None]:
input_dir="bert-base-uncased-bookclassifier"
complete_model=torch.load(input_dir)

In [None]:
batchsize=2
def evaluate(model,test_dataset):
    model.eval()
    test_dataloader=torch.utils.data.DataLoader(test_dataset,batch_size=batchsize)

    use_cuda=torch.cuda.is_available()
    device=torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model=model.cuda()

    total_acc_test=0
    with torch.no_grad():
        count=0
        predictions=[]
        for test_input,test_label in tqdm(test_dataloader):
            count+=batchsize
            test_label=test_label.to(device)
            mask=test_input['attention_mask'].to(device)
            input_id=test_input['input_ids'].squeeze(1).to(device)
            output=model(input_id,mask)
            #print(output.argmax(dim=1),test_label)
            predictions.append(output.argmax(dim=1))  #save the prediction for further analysis
            acc=(output.argmax(dim=1)==test_label).sum().item()

            total_acc_test+=acc
            if count%100==0:
                print(f'Accuracy so far = {total_acc_test/count: .3f}')

    print(f'Test accuracy: {total_acc_test/len(test_dataset): .3f}')
    return predictions

In [None]:
predictions=evaluate(model, test_data)

In [None]:
flattened=[]
for batch in predictions:
    for pred in batch:
        flattened.append(reverse_index[pred.item()])
testing_df['prediction']=flattened
testing_df.head(50)

In [None]:
all_labels=testing_df['label']
all_predictions=testing_df['prediction']

In [None]:
tp={}
fp={}
fn={}
tn={}

for label1,pred1 in zip(all_labels,all_predictions):
    for label in labels.keys():
        if label1==label:
            if pred1==label:
                tp[label]=tp.get(label,0)+1
            else:
                fn[label]=fn.get(label,0)+1

        else:
            if pred1==label:
                fp[label]=fp.get(label,0)+1
            else:
                tn[label]=tn.get(label,0)+1



precision={label:value/(value+fp.get(label,0)) for label,value in tp.items()}
recall={label:value/(value+fn.get(label,0)) for label,value in tp.items()}
f1={label:(2*value*recall.get(label,0))/(value+recall.get(label,0)) for label,value in precision.items()}

In [None]:
precision
recall
f1

In [None]:
testing_df.loc[47, "text"]
len(testing_df.loc[47, "text"])
len(testing_df.loc[1, "text"])
texts = testing_df["text"]
lengths = [len(text) for text in texts]
print(np.mean(lengths))