<a href="https://colab.research.google.com/github/BabeRush/NLP_final_project/blob/main/Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training

## Notebook preparation

### Configurations

In [None]:
# Report the machine's total RAM and whether it passes the 20 GB "high-RAM" threshold
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9   # bytes -> gigabytes (decimal)
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Anything from 20 GB upwards is reported as a high-RAM runtime
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
# Pick the compute device: 'cuda' when PyTorch can see a GPU, otherwise 'cpu'
import torch

if torch.cuda.is_available():
  DEVICE = 'cuda'
else:
  DEVICE = 'cpu'

# Last expression of the cell, so the chosen device is shown as the cell output
DEVICE

'cuda'

In [None]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

### Imports & Installations

In [None]:
! pip install transformers
!pip install import-ipynb



In [None]:
# IMPORTS
# General
import warnings
import random
import sys

# Math & Data organization
import numpy as np
import scipy as sp
import pandas as pd
import re
import import_ipynb

# Deep Learning
import transformers
from transformers import AutoModel, BertTokenizerFast , BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AdamW
from sklearn.metrics import classification_report

# Visualization
import seaborn as sns
from tqdm import tqdm
from tabulate import tabulate
import matplotlib.pyplot as plt

# Working with files
from google.colab import files
import pickle
import shutil

## Creating the datasets

In [None]:
# OPTIONAL - delete the cloned folder so that the clone cell starts from a clean state.
# ignore_errors=True turns this into a no-op when the folder does not exist yet
# (e.g. on a fresh runtime), so "Restart & Run All" no longer stops here.
shutil.rmtree('/content/NLP_final_project', ignore_errors=True)

In [None]:
# Clone the datasets
# Description: ************TODO-LINK******************
!git clone https://github.com/BabeRush/NLP_final_project.git
shutil.move('/content/NLP_final_project/datasets_util.ipynb', '/content/')

Cloning into 'NLP_final_project'...
remote: Enumerating objects: 54, done.[K
remote: Counting objects: 100% (42/42), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 54 (delta 21), reused 4 (delta 0), pack-reused 12[K
Receiving objects: 100% (54/54), 41.42 MiB | 15.97 MiB/s, done.
Resolving deltas: 100% (22/22), done.


'/content/datasets_util.ipynb'

In [None]:
# Two CSV files: True.csv is loaded as df_real, Fake.csv as df_fake
df_real = pd.read_csv('/content/NLP_final_project/True.csv', encoding='utf-8')
df_fake = pd.read_csv('/content/NLP_final_project/Fake.csv', encoding='utf-8')

# The three LIAR splits are tab-separated files without a header row
liar_read_options = dict(sep='\t', encoding='utf-8', header=None)
liar_train = pd.read_csv('/content/NLP_final_project/train.tsv', **liar_read_options)
liar_test = pd.read_csv('/content/NLP_final_project/test.tsv', **liar_read_options)
liar_valid = pd.read_csv('/content/NLP_final_project/valid.tsv', **liar_read_options)

# Because the files carry no header, name the 14 columns explicitly (same names for every split)
liar_columns = [
    'id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info',
    'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts',
    'mostly_true_counts', 'pants_on_fire_counts', 'context',
]
for liar_split in (liar_train, liar_valid, liar_test):
    liar_split.columns = list(liar_columns)

### Filter missing values (statistics below)

In [None]:
# Row counts before filtering, kept so the number of removed rows can be reported
temp_real = len(df_real)
temp_fake = len(df_fake)
temp_li_train = len(liar_train)
temp_li_val = len(liar_valid)
temp_li_test = len(liar_test)

# Drop every row that has a missing value in ANY column.
# NOTE(review): for the LIAR splits this removes roughly a third of the rows (see the
# printed statistics). If only some columns are used downstream, consider
# dropna(subset=[...]) restricted to those columns - confirm against datasets_util.
df_real = df_real.dropna()
df_fake = df_fake.dropna()
liar_train = liar_train.dropna()
liar_valid = liar_valid.dropna()
liar_test = liar_test.dropna()

# Number of rows removed from each dataset
g1 = temp_real - len(df_real)
g2 = temp_fake - len(df_fake)
g3 = temp_li_train - len(liar_train)
g4 = temp_li_val - len(liar_valid)
g5 = temp_li_test - len(liar_test)

print(f'Number of df_real sentences: {len(df_real)}, removed {g1}({(g1/temp_real)*100:.1f}%) instances')
# Fixed: the df_fake percentage was computed from g1 (the df_real count) instead of g2
print(f'Number of df_fake sentences: {len(df_fake)}, removed {g2}({(g2/temp_fake)*100:.1f}%) instances')
print(f'Number of liar_train sentences: {len(liar_train)}, removed {g3}({(g3/temp_li_train)*100:.1f}%) instances')
print(f'Number of liar_valid sentences: {len(liar_valid)}, removed {g4}({(g4/temp_li_val)*100:.1f}%) instances')
print(f'Number of liar_test sentences: {len(liar_test)} removed {g5}({(g5/temp_li_test)*100:.1f}%) instances')

Number of df_real sentences: 21417, removed 0(0.0%) instances
Number of df_fake sentences: 23481, removed 0(0.0%) instances
Number of liar_train sentences: 6724, removed 3516(34.3%) instances
Number of liar_valid sentences: 861, removed 423(32.9%) instances
Number of liar_test sentences: 853 removed 414(32.7%) instances


## Datasets combinations

### Initial datasets

In [None]:
# print('Understanding \"df_real\" and \"df_fake\" columns:')
# df_real.head()

In [None]:
# print('Understanding \"LIAR\" columns:')
# liar_train.head()

### First dataset (True.csv / Fake.csv)

In [None]:
from datasets_util import prepare_dataset_1, prepare_dataset_2

# Preparing the first DS from df_real / df_fake.
# Both variants share the same field switches and differ only in `short_markers`.
_ds1_fields = dict(text=True, title=False, subject=False)

# No markers:
df_no_markers = prepare_dataset_1(df_real, df_fake, short_markers=False, **_ds1_fields)
# Short markers:
df_short_markers = prepare_dataset_1(df_real, df_fake, short_markers=True, **_ds1_fields)


importing Jupyter notebook from datasets_util.ipynb


In [None]:
# Preview the first five rows of the prepared first dataset
df_no_markers.head(5)

Unnamed: 0,text_no_markers,label
0,There s a petition on Change.org going around ...,0
1,LONDON (Reuters) - Prime Minister Theresa May ...,1
2,Hollywood actor James Woods tweeted about Trum...,0
3,(Reuters) - A Turkish-Iranian gold trader on T...,1
4,"WASHINGTON/MARRAKESH, Morocco (Reuters) - Pres...",1


### Second dataset (LIAR)

In [None]:
# Only no marker option
# Build the second dataset from the three LIAR splits; statement and speaker are
# switched on, subject and context are switched off.
df_no_markers_LIAR = prepare_dataset_2(
    liar_train,
    liar_valid,
    liar_test,
    subject=False,
    statement=True,
    speaker=True,
    context=False,
)

In [None]:
# Preview the first five rows of the prepared LIAR dataset
df_no_markers_LIAR.head(5)

Unnamed: 0,text_no_markers,label
0,According to the nonpartisan Congressional Bud...,0
1,99.97 percent of the kids live in poverty that...,0
2,Our men and women of the Guard have been force...,0
3,Says people in Africa literally walk two and t...,0
4,Ronald Reagan banned ownership of fully automa...,0


In [None]:
# Stack both prepared datasets into one frame; ignore_index=True renumbers the rows
frames_to_combine = [df_no_markers, df_no_markers_LIAR]
comb_df_no_markers = pd.concat(frames_to_combine, ignore_index=True)

### Prepare dataloaders

In [None]:
# Constant parameters
RANDOM_SEED = 42    # random_state passed to the dataset splits
MAX_LENGTH = 256    # token length each text is padded / truncated to
BATCH_SIZE = 32     # number of samples per DataLoader batch

# Candidate dataset sizes for sub-sampling experiments
MINI_SAMPLE, SMALL_SAMPLE, MEDIUM_SAMPLE, LARGE_SAMPLE = 500, 5_000, 15_000, 30_000

In [None]:
# def reduce_instances(df, sample_size, seed):
#   return df.sample(n=sample_size, random_state=seed)

# temp_shape_no = comb_df_no_markers.shape
# # NOTE the sample size
# combined_df_no_markers = reduce_instances(comb_df_no_markers, LARGE_SAMPLE, RANDOM_SEED)
# combined_df_no_markers.shape

In [None]:
# Use the full combined dataset (no sub-sampling); the split cell reads `combined_df_no_markers`
combined_df_no_markers = comb_df_no_markers

In [None]:
# Stratified 60 / 20 / 20 split into train, validation and test sets
all_texts = combined_df_no_markers['text_no_markers']
all_labels = combined_df_no_markers['label']

# Step 1: keep 60% for training, hold out 40% as a temporary pool
(train_text_no_markers, temp_text_no_markers,
 train_labels_no_markers, temp_labels_no_markers) = train_test_split(
    all_texts, all_labels,
    test_size=0.4,
    random_state=RANDOM_SEED,
    stratify=all_labels,
)

# Step 2: cut the temporary pool in half -> validation and test
(val_text_no_markers, test_text_no_markers,
 val_labels_no_markers, test_labels_no_markers) = train_test_split(
    temp_text_no_markers, temp_labels_no_markers,
    test_size=0.5,
    random_state=RANDOM_SEED,
    stratify=temp_labels_no_markers,
)

In [None]:
# Pretrained BERT encoder and its matching tokenizer
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
batch_size = BATCH_SIZE


def _encode_split(texts, labels):
    """Tokenize one data split and convert it to tensors.

    Args:
        texts: pandas Series of raw strings.
        labels: pandas Series of integer class labels aligned with ``texts``.

    Returns:
        Tuple ``(tokens, seq, mask, y)``: the tokenizer output, the input-id tensor,
        the attention-mask tensor and the label tensor.
    """
    tokens = tokenizer(
        texts.tolist(),
        max_length=MAX_LENGTH,
        padding='max_length',    # replaces the deprecated pad_to_max_length=True
        truncation=True,
    )
    seq = torch.tensor(tokens['input_ids'])
    mask = torch.tensor(tokens['attention_mask'])
    y = torch.tensor(labels.tolist())
    return tokens, seq, mask, y


# Tokenize and encode the train, validation and test sets
tokens_train, train_seq, train_mask, train_y = _encode_split(train_text_no_markers, train_labels_no_markers)
tokens_val, val_seq, val_mask, val_y = _encode_split(val_text_no_markers, val_labels_no_markers)
tokens_test, test_seq, test_mask, test_y = _encode_split(test_text_no_markers, test_labels_no_markers)

# Training loader: shuffled; the seeded generator makes the shuffle order reproducible
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data, generator=torch.Generator().manual_seed(RANDOM_SEED))
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation loader: fixed sequential order
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [None]:
# Freeze the BERT encoder: with requires_grad switched off, no gradients are
# computed for its weights
for frozen_param in bert.parameters():
    frozen_param.requires_grad_(False)

In [None]:
class BERT_Arch(nn.Module):
    """Classification head on top of a BERT encoder.

    The encoder's pooled output is passed through Linear -> ReLU -> Dropout -> Linear
    and a log-softmax, so the module returns per-class log-probabilities
    (the input format expected by ``nn.NLLLoss``).

    Args:
        bert: encoder module; it is called as ``bert(sent_id, attention_mask=mask)``
            and must return a mapping with a ``'pooler_output'`` entry.
        num_classes: number of output classes.
        hidden_size: width of the encoder's pooled output.
        dropout_prob: dropout probability applied after the ReLU.
        fc_size: width of the intermediate fully connected layer (default 512,
            the value that used to be hard-coded).
    """

    def __init__(self, bert, num_classes=2, hidden_size=768, dropout_prob=0.1, fc_size=512):
        super().__init__()
        # Attribute names are kept unchanged so previously saved state_dicts still load
        self.bert = bert
        self.dropout = nn.Dropout(dropout_prob)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(hidden_size, fc_size)
        self.fc2 = nn.Linear(fc_size, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-probabilities with one row per input sequence.

        Args:
            sent_id: token ids, forwarded to the encoder as its first argument.
            mask: attention mask, forwarded to the encoder as ``attention_mask``.
        """
        cls_hs = self.bert(sent_id, attention_mask=mask)['pooler_output']
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)

# Wrap the BERT encoder in the classification head
# NOTE(review): the model is not moved to DEVICE here, so it stays on the CPU - confirm this is intended
model = BERT_Arch(bert)

# Training setup: optimizer, loss function and number of epochs
learning_rate = 1e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)
cross_entropy = nn.NLLLoss()   # takes log-probabilities, which BERT_Arch produces with LogSoftmax
epochs = 2

### Training process

In [None]:
# Defining training and evaluation functions
def train(model, train_dataloader):
  """Run one training epoch and return the average batch loss.

  Uses the notebook-level ``optimizer`` and ``cross_entropy`` objects.

  Args:
    model: module called as ``model(sent_id, mask)``.
    train_dataloader: iterable of ``(sent_id, mask, labels)`` batches that supports ``len()``.

  Returns:
    The mean of the per-batch losses over the epoch.
  """
  model.train()

  # Batches follow the model: they are moved to whichever device holds its parameters
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  num_batches = len(train_dataloader)
  total_loss = 0.0

  for step, batch in enumerate(train_dataloader):
    if step % 25 == 0 and step != 0:                            # progress update after every 25 batches
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

    sent_id, mask, labels = [t.to(device) for t in batch]       # actually push the batch to the model's device
    model.zero_grad()                                           # clear previously calculated gradients
    preds = model(sent_id, mask)                                # forward pass
    loss = cross_entropy(preds, labels)                         # loss between predictions and labels
    total_loss += loss.item()
    loss.backward()                                             # backward pass
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)     # clip gradient norm to 1.0 against exploding gradients
    optimizer.step()                                            # update parameters

  return total_loss / num_batches                               # average training loss of the epoch

def evaluate(model, val_dataloader):
  """Compute the average loss of ``model`` over ``val_dataloader`` without updating it.

  Uses the notebook-level ``cross_entropy`` loss.

  Args:
    model: module called as ``model(sent_id, mask)``.
    val_dataloader: iterable of ``(sent_id, mask, labels)`` batches that supports ``len()``.

  Returns:
    The mean of the per-batch losses.
  """
  print("\nEvaluating...")
  model.eval()                                    # deactivate dropout layers

  # Batches follow the model: they are moved to whichever device holds its parameters
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  num_batches = len(val_dataloader)
  total_loss = 0.0

  with torch.no_grad():                           # no gradients are needed for evaluation
    for step, batch in enumerate(val_dataloader):
      if step % 25 == 0 and step != 0:            # progress update every 25 batches
        print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

      sent_id, mask, labels = [t.to(device) for t in batch]
      preds = model(sent_id, mask)
      total_loss += cross_entropy(preds, labels).item()

  return total_loss / num_batches                 # average validation loss of the epoch

In [None]:
# Train for `epochs` epochs, tracking the loss of each epoch
best_valid_loss = float('inf')
train_losses_no = []                 # training loss of each epoch
valid_losses_no = []                 # validation loss of each epoch

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    train_loss = train(model, train_dataloader)
    valid_loss = evaluate(model, val_dataloader)
    train_losses_no.append(train_loss)
    valid_losses_no.append(valid_loss)

    # Keep the weights of the epoch with the lowest validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'no_markers_model_weights.pt')

    # Pickle the whole model after every epoch. The `with` block closes the file
    # handle; the former immediate pickle.load of the same file was dropped because
    # its result was discarded.
    # NOTE(review): this file always holds the LATEST epoch, while the .pt file holds
    # the BEST one - confirm which of the two should be evaluated.
    with open('no_markers_model_weights.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 2
  Batch    25  of  1,001.
  Batch    50  of  1,001.
  Batch    75  of  1,001.
  Batch   100  of  1,001.
  Batch   125  of  1,001.
  Batch   150  of  1,001.
  Batch   175  of  1,001.
  Batch   200  of  1,001.
  Batch   225  of  1,001.
  Batch   250  of  1,001.
  Batch   275  of  1,001.
  Batch   300  of  1,001.
  Batch   325  of  1,001.
  Batch   350  of  1,001.
  Batch   375  of  1,001.
  Batch   400  of  1,001.
  Batch   425  of  1,001.
  Batch   450  of  1,001.
  Batch   475  of  1,001.
  Batch   500  of  1,001.
  Batch   525  of  1,001.
  Batch   550  of  1,001.
  Batch   575  of  1,001.
  Batch   600  of  1,001.
  Batch   625  of  1,001.
  Batch   650  of  1,001.
  Batch   675  of  1,001.
  Batch   700  of  1,001.
  Batch   725  of  1,001.
  Batch   750  of  1,001.
  Batch   775  of  1,001.
  Batch   800  of  1,001.
  Batch   825  of  1,001.
  Batch   850  of  1,001.
  Batch   875  of  1,001.
  Batch   900  of  1,001.
  Batch   925  of  1,001.
  Batch   950  of  1,001

### Saving the model as a pkl file - OPTIONAL

In [None]:
# Mount Google Drive at /content/drive so the pickled model can be copied there (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the pickled model into the Drive folder; the returned destination path is the cell output
shutil.copy(
    '/content/no_markers_model_weights.pkl',
    '/content/drive/MyDrive/NLP-FINAL/All-s_256-l',
)

'/content/drive/MyDrive/NLP-FINAL/All-s_256-l/no_markers_model_weights.pkl'

### Evaluation

In [None]:
# Load the pickled model back from Google Drive
# NOTE: unpickling can execute code stored in the file - only load pickles you created yourself
file_path = '/content/drive/MyDrive/NLP-FINAL/All-s_256-l/no_markers_model_weights.pkl'
with open(file_path, mode='rb') as pickled_model:
    model = pickle.load(pickled_model)

In [None]:
# Index that divides the test tensors into two halves
midpoint = len(test_seq) // 2

# Split each tensor with plain slicing, which is native to torch tensors.
# np.split is a NumPy routine; relying on it to handle torch tensors is fragile.
test_seq_1, test_seq_2 = test_seq[:midpoint], test_seq[midpoint:]
test_mask_1, test_mask_2 = test_mask[:midpoint], test_mask[midpoint:]
test_y_1, test_y_2 = test_y[:midpoint], test_y[midpoint:]

In [None]:
# Predict the first half of the test set in mini-batches of BATCH_SIZE instead of one
# giant forward pass, so memory use is bounded by a single batch.
model.eval()                                          # make sure dropout is disabled
model_device = next(model.parameters()).device        # inputs follow the model's device
with torch.no_grad():
  batch_log_probs = [
      model(seq_batch.to(model_device), mask_batch.to(model_device)).cpu()
      for seq_batch, mask_batch in zip(test_seq_1.split(BATCH_SIZE), test_mask_1.split(BATCH_SIZE))
  ]
preds = torch.cat(batch_log_probs).numpy()

# Predicted class = index of the highest log-probability
preds = np.argmax(preds, axis = 1)

In [None]:
# Classification report for the first half of the test set (true labels vs. predictions)
print(classification_report(test_y_1, preds))

In [None]:
# Predict the second half of the test set in mini-batches of BATCH_SIZE instead of one
# giant forward pass, so memory use is bounded by a single batch.
model.eval()                                          # make sure dropout is disabled
model_device = next(model.parameters()).device        # inputs follow the model's device
with torch.no_grad():
  batch_log_probs = [
      model(seq_batch.to(model_device), mask_batch.to(model_device)).cpu()
      for seq_batch, mask_batch in zip(test_seq_2.split(BATCH_SIZE), test_mask_2.split(BATCH_SIZE))
  ]
preds = torch.cat(batch_log_probs).numpy()

# Predicted class = index of the highest log-probability
preds = np.argmax(preds, axis = 1)

In [None]:
# Classification report for the second half of the test set (true labels vs. predictions)
print(classification_report(test_y_2, preds))

In [None]:
# Predict the complete test set in mini-batches of BATCH_SIZE instead of one giant
# forward pass, so memory use is bounded by a single batch.
model.eval()                                          # make sure dropout is disabled
model_device = next(model.parameters()).device        # inputs follow the model's device
with torch.no_grad():
  batch_log_probs = [
      model(seq_batch.to(model_device), mask_batch.to(model_device)).cpu()
      for seq_batch, mask_batch in zip(test_seq.split(BATCH_SIZE), test_mask.split(BATCH_SIZE))
  ]
preds = torch.cat(batch_log_probs).numpy()

# Predicted class = index of the highest log-probability
preds = np.argmax(preds, axis = 1)

In [None]:
# Classification report for the complete test set (true labels vs. predictions)
print(classification_report(test_y, preds))