<a href="https://colab.research.google.com/github/BabeRush/NLP_final_project/blob/main/training_tagged.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training - tagged data

## Notebook preparation

### Configurations

In [1]:
# Report the runtime's total RAM and whether it qualifies as a Colab high-RAM runtime.
from psutil import virtual_memory

HIGH_RAM_THRESHOLD_GB = 20  # below this many (decimal) gigabytes the runtime is reported as standard

ram_gb = virtual_memory().total / 1e9  # bytes -> gigabytes
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

ram_verdict = (
    'Not using a high-RAM runtime'
    if ram_gb < HIGH_RAM_THRESHOLD_GB
    else 'You are using a high-RAM runtime!'
)
print(ram_verdict)

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [2]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
import torch

if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
DEVICE  # last expression of the cell, so the notebook displays the chosen device

'cuda'

In [3]:
# Ignore warnings
# NOTE: no category is given, so this silences every warning for the rest of
# the session, including deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings("ignore")

### Imports & Installations

In [4]:
# Install Hugging Face Transformers; %pip installs into the environment of the running kernel.
%pip install transformers



In [5]:
# IMPORTS
# General
import warnings
import random
import sys

# Math & Data organization
import numpy as np
import scipy as sp
import pandas as pd
import re

# Deep Learning
import transformers
from transformers import AutoModel, BertTokenizerFast , BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AdamW
from sklearn.metrics import classification_report

# Visualization
import seaborn as sns
from tqdm import tqdm
from tabulate import tabulate
import matplotlib.pyplot as plt

# Working with files
from google.colab import files
import pickle
import shutil

## Creating the datasets

In [7]:
# Clone the project repository; skipped when the folder already exists so the cell can be re-run.
![ -d NLP_final_project ] || git clone https://github.com/BabeRush/NLP_final_project.git

Cloning into 'NLP_final_project'...
remote: Enumerating objects: 120, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 120 (delta 0), reused 0 (delta 0), pack-reused 117[K
Receiving objects: 100% (120/120), 107.57 MiB | 12.48 MiB/s, done.
Resolving deltas: 100% (54/54), done.
Updating files: 100% (22/22), done.


In [8]:
# Mount Google Drive at /content/drive; the trained model is later pickled to a path under this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Get the processed datasets

In [9]:
# Root folder of the preprocessed pickles inside the cloned repository.
_DATASETS_DIR = '/content/NLP_final_project/FN-classification/Datasets_Processes'


def _read_split_pickle(split, name):
    """Unpickle `<_DATASETS_DIR>/<split>/<name>.pkl` with pandas and return the result."""
    return pd.read_pickle(f'{_DATASETS_DIR}/{split}/{name}.pkl')


# Features (tag_X_*) and labels (y_*) of both corpora: train split first, then test split.
tag_X_ISOT_train = _read_split_pickle('Train', 'tag_X_ISOT_train')
tag_X_LIAR_train = _read_split_pickle('Train', 'tag_X_LIAR_train')
y_ISOT_train = _read_split_pickle('Train', 'y_ISOT_train')
y_LIAR_train = _read_split_pickle('Train', 'y_LIAR_train')

tag_X_ISOT_test = _read_split_pickle('Test', 'tag_X_ISOT_test')
tag_X_LIAR_test = _read_split_pickle('Test', 'tag_X_LIAR_test')
y_ISOT_test = _read_split_pickle('Test', 'y_ISOT_test')
y_LIAR_test = _read_split_pickle('Test', 'y_LIAR_test')

# Put features and labels side by side (column-wise), one frame per corpus and split.
ISOT_train = pd.concat([tag_X_ISOT_train, y_ISOT_train], axis=1)
ISOT_test = pd.concat([tag_X_ISOT_test, y_ISOT_test], axis=1)
LIAR_train = pd.concat([tag_X_LIAR_train, y_LIAR_train], axis=1)
LIAR_test = pd.concat([tag_X_LIAR_test, y_LIAR_test], axis=1)


In [10]:
# Preview the first rows of the combined ISOT test frame (features plus label).
ISOT_test.head()

Unnamed: 0,title and text,subject,label
22216,[t] breaking gop chairman grassley ha enough d...,[s] news,0
27917,[t] failed gop candidate remembered hilarious ...,[s] news,0
25007,[t] mike penny new neighbor hilariously trolli...,[s] news,0
1377,[t] california pledge defend birth control ins...,[s] politicsnews,1
32476,[t] rancher living usmexico border destroy nan...,[s] politics,0


### Build final dataset for training

In [11]:
def build_DS(df1, df2, seed=42):
    """Stack two datasets, shuffle the rows and merge the text columns into one.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain the columns 'subject' and 'title and text'.
        Any other column (e.g. 'label') is carried over unchanged.
    seed : int or None, default 42
        Seed for the row shuffle, so that re-running the notebook produces the
        same row order (and therefore the same downstream split). Pass ``None``
        to get a different shuffle on every call.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index in which 'subject' and
        'title and text' are replaced by one 'text' column holding
        ``subject + ' ' + title and text``. The inputs are not modified.
    """
    combined = pd.concat([df1, df2], ignore_index=True)
    # frac=1 samples every row, i.e. a full shuffle; the seed makes it repeatable.
    shuffled = combined.sample(frac=1, random_state=seed).reset_index(drop=True)
    shuffled['text'] = shuffled['subject'] + ' ' + shuffled['title and text']
    return shuffled.drop(columns=['subject', 'title and text'])

# Combine ISOT and LIAR into one shuffled frame per split.
# NOTE: the name `train` is rebound to a function by the `def train(...)` in the
# training-functions cell further down; after running that cell, re-run this one
# before any cell that needs the training DataFrame.
train = build_DS(ISOT_train, LIAR_train)
test = build_DS(ISOT_test, LIAR_test)

In [12]:
# Preview the first rows of the combined test frame ('label' and merged 'text' columns).
test.head()

Unnamed: 0,label,text
0,1,[s] economy [t] president barack obama weve go...
1,0,[s] news [t] laura bush stuns republican party...
2,0,[s] news [t] voter confronts trump could daugh...
3,0,[s] politics [t] reince priebus interview spea...
4,1,[s] politicsnews [t] epa aim end trump freeze ...


### Prepare dataloaders

In [15]:
RANDOM_SEED = 42  # seed passed to the optional down-sampling and to the train/validation split
MAX_LENGTH = 256  # tokenizer max_length: every sequence is padded/truncated to this many tokens
BATCH_SIZE = 32  # batch size of the training and validation DataLoaders

In [17]:
def reduce_instances(df, sample_size, seed):
    """Return a reproducible random subset of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample rows from.
    sample_size : int
        Number of rows wanted. A request larger than ``len(df)`` is clamped to
        ``len(df)`` (all rows, in shuffled order) instead of raising.
    seed : int
        Random state handed to ``DataFrame.sample``.

    Returns
    -------
    pandas.DataFrame
        The sampled rows; they keep their original index labels.
    """
    n_rows = min(sample_size, len(df))
    return df.sample(n=n_rows, random_state=seed)

# Constant parameters: the selectable training-set sizes (number of rows)
MINI_SAMPLE = 500
SMALL_SAMPLE = 5000
MEDIUM_SAMPLE = 15000
LARGE_SAMPLE = 30000

# Menu choice -> sample size
SAMPLE_SIZES = {
    '1': MINI_SAMPLE,
    '2': SMALL_SAMPLE,
    '3': MEDIUM_SAMPLE,
    '4': LARGE_SAMPLE,
}

# Ask user if they want to reduce training set size.
# Surrounding whitespace and letter case of the answer are ignored.
# NOTE: input() waits for the user, so this cell blocks an unattended "Run all".
inp = input("Do you want to reduce the training set size? (y/n) -> ").strip().lower()

if inp == 'y':
    inp2 = input("Choose size from the following options:\n"
                    f"1 - Mini Sample ({MINI_SAMPLE})\n"
                    f"2 - Small Sample ({SMALL_SAMPLE})\n"
                    f"3 - Medium Sample ({MEDIUM_SAMPLE})\n"
                    f"4 - Large Sample ({LARGE_SAMPLE})\n"
                    "Enter the number of your choice -> ").strip()

    # Choose the sample size based on user input; unknown choices fall back to the medium size
    sample_size = SAMPLE_SIZES.get(inp2)
    if sample_size is None:
        print("Invalid choice. Using the default sample size.")
        sample_size = MEDIUM_SAMPLE

    # Reduce the training set size (the sampled frame replaces `train`)
    train = reduce_instances(train, sample_size, RANDOM_SEED)
    # Report the size actually obtained rather than the size requested.
    print(f"Training set size reduced to {len(train)} instances.")
elif inp == 'n':
    print("Training set size will not be reduced.")
else:
    print("Invalid input. Training set size will not be reduced.")

Do you want to reduce the training set size? (y/n) -> n
Training set size will not be reduced.


In [18]:
# (rows, columns) of the training frame after the optional down-sampling step.
train.shape

(46149, 2)

In [19]:
# Train-Validation split: hold out 25% of the training frame for validation,
# stratified on the label so both parts keep the same class proportions.
_all_texts, _all_labels = train['text'], train['label']
train_text, val_text, train_labels, val_labels = train_test_split(
    _all_texts,
    _all_labels,
    test_size=0.25,
    stratify=_all_labels,
    random_state=RANDOM_SEED,
)

# The test frame is used as-is.
test_text, test_labels = test['text'], test['label']

In [20]:
# Report how many samples each split holds.
for _split_name, _split_text in (('Train', train_text), ('Val', val_text), ('Test', test_text)):
    print(f'{_split_name} size:', _split_text.shape[0])

Train size: 34611
Val size: 11538
Test size: 11538


In [21]:
# Pre-trained BERT encoder and its matching tokenizer
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
batch_size = BATCH_SIZE


def encode_texts(texts):
    """Tokenize a pandas Series of strings into fixed-length BERT inputs.

    Every sequence is truncated or padded to exactly MAX_LENGTH tokens.
    `padding='max_length'` is the supported spelling of the deprecated
    `pad_to_max_length=True`.

    Returns the tokenizer's encoding; its 'input_ids' and 'attention_mask'
    entries are lists holding one list of integers per input text.
    """
    return tokenizer(
        texts.tolist(),
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=True,
    )


def to_tensors(tokens, labels):
    """Convert an encoding and its label Series into (input_ids, attention_mask, labels) tensors."""
    return (
        torch.tensor(tokens['input_ids']),
        torch.tensor(tokens['attention_mask']),
        torch.tensor(labels.tolist()),
    )


# Tokenize and encode the sequences of the train, validation and test sets
tokens_train = encode_texts(train_text)
tokens_val = encode_texts(val_text)
tokens_test = encode_texts(test_text)

# Convert lists to tensors
train_seq, train_mask, train_y = to_tensors(tokens_train, train_labels)
val_seq, val_mask, val_y = to_tensors(tokens_val, val_labels)
test_seq, test_mask, test_y = to_tensors(tokens_test, test_labels)

# Training set: batches are drawn in random order
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation set: batches are read sequentially, in dataset order
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [22]:
# Freeze the pre-trained encoder: no gradient is computed for any of BERT's own parameters.
for encoder_weight in bert.parameters():
    encoder_weight.requires_grad_(False)

In [23]:
class BERT_Arch(nn.Module):
    """Classification head on top of a BERT-style encoder.

    The encoder's 'pooler_output' goes through
    Linear(hidden_size, 512) -> ReLU -> Dropout -> Linear(512, num_classes) -> LogSoftmax,
    so the module returns per-class log-probabilities (the input nn.NLLLoss expects).

    Parameters
    ----------
    bert : nn.Module
        Encoder called as ``bert(sent_id, attention_mask=mask)``; its result must
        be indexable with 'pooler_output'.
    num_classes : int
        Number of output classes.
    hidden_size : int
        Width of the encoder's pooled output.
    dropout_prob : float
        Dropout probability applied after the ReLU.
    """

    def __init__(self, bert, num_classes=2, hidden_size=768 , dropout_prob=0.1):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(p=dropout_prob)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(in_features=hidden_size, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-probabilities of shape (batch, num_classes) for token ids `sent_id` and attention `mask`."""
        encoder_out = self.bert(sent_id, attention_mask=mask)
        pooled = encoder_out['pooler_output']
        hidden = self.dropout(self.relu(self.fc1(pooled)))
        return self.softmax(self.fc2(hidden))

# Training setup: the classifier, its loss, its optimizer and the number of epochs.
epochs = 2
model = BERT_Arch(bert)
cross_entropy = nn.NLLLoss()  # negative log-likelihood; the model already outputs log-probabilities
optimizer = AdamW(
    model.parameters(),
    lr=1e-5,
)

### Training process

In [24]:
# Defining training and evaluation functions
def train(model, train_dataloader):
  """Run one training epoch over `train_dataloader` and return the mean batch loss.

  Uses the notebook-level `optimizer` and `cross_entropy` objects.

  Every batch is moved to the device that holds the model's parameters, so the
  loop works unchanged whether the model lives on the CPU or on the GPU.

  NOTE: defining this function rebinds the name `train`, which earlier cells
  use for the training DataFrame.

  Parameters
  ----------
  model : nn.Module
      Called as ``model(sent_id, mask)``; must return what `cross_entropy` expects.
  train_dataloader : iterable
      Yields ``(sent_id, mask, labels)`` tensor triples and supports ``len()``.

  Returns
  -------
  float
      Sum of the per-batch losses divided by the number of batches.
  """
  model.train()
  device = next(model.parameters()).device                      # where the model's weights live
  total_loss = 0

  for step, batch in enumerate(train_dataloader):               # iterate over batches
    if step % 25 == 0 and not step == 0:                        # progress update after every 25 batches.
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
    sent_id, mask, labels = [t.to(device) for t in batch]       # put the batch on the model's device
    model.zero_grad()                                           # clear previously calculated gradients
    preds = model(sent_id, mask)                                # get model predictions for current batch
    loss = cross_entropy(preds, labels)                         # compute loss between actual & predicted values
    total_loss = total_loss + loss.item()                       # add on to the total loss
    loss.backward()                                             # backward pass to calculate the gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)     # clip gradient norm to 1.0 against exploding gradients
    optimizer.step()                                            # update parameters

  avg_loss = total_loss / len(train_dataloader)                 # mean training loss of the epoch
  return avg_loss

def evaluate(model, val_dataloader):
  """Compute the mean batch loss of `model` over `val_dataloader` without updating it.

  Uses the notebook-level `cross_entropy` loss. The model is switched to eval
  mode and left in that mode. Every batch is moved to the device that holds the
  model's parameters, so the loop works on the CPU and on the GPU alike.

  Parameters
  ----------
  model : nn.Module
      Called as ``model(sent_id, mask)``.
  val_dataloader : iterable
      Yields ``(sent_id, mask, labels)`` tensor triples and supports ``len()``.

  Returns
  -------
  float
      Sum of the per-batch losses divided by the number of batches.
  """
  print("\nEvaluating...")
  model.eval()                                    # deactivate dropout layers
  device = next(model.parameters()).device        # where the model's weights live
  total_loss = 0
  with torch.no_grad():                           # no gradients are needed for evaluation
    for step, batch in enumerate(val_dataloader): # iterate over batches
      if step % 25 == 0 and not step == 0:        # progress update every 25 batches
        print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))
      sent_id, mask, labels = [t.to(device) for t in batch]   # put the batch on the model's device
      preds = model(sent_id, mask)                # model predictions
      loss = cross_entropy(preds, labels)         # validation loss between actual and predicted values
      total_loss = total_loss + loss.item()
  avg_loss = total_loss / len(val_dataloader)     # mean validation loss of the epoch
  return avg_loss

In [None]:
# Train for `epochs` epochs, tracking the losses and checkpointing the model.
best_valid_loss = float('inf')
train_losses_no = []                 # training loss of each epoch
valid_losses_no = []                 # validation loss of each epoch

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    train_loss = train(model, train_dataloader)
    valid_loss = evaluate(model, val_dataloader)

    # Record the history first so it survives a failure while saving.
    train_losses_no.append(train_loss)
    valid_losses_no.append(valid_loss)

    # Keep the weights of the epoch with the lowest validation loss so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'no_markers_model_weights.pt')

    # Also pickle the whole model after every epoch (overwrites the previous epoch's file).
    # The `with` block closes the file handle; the former read-back of the file was
    # discarded immediately and has been removed.
    with open('no_markers_model_weights.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 2
  Batch    25  of  1,001.
  Batch    50  of  1,001.
  Batch    75  of  1,001.
  Batch   100  of  1,001.
  Batch   125  of  1,001.
  Batch   150  of  1,001.
  Batch   175  of  1,001.
  Batch   200  of  1,001.
  Batch   225  of  1,001.
  Batch   250  of  1,001.
  Batch   275  of  1,001.
  Batch   300  of  1,001.
  Batch   325  of  1,001.
  Batch   350  of  1,001.
  Batch   375  of  1,001.
  Batch   400  of  1,001.
  Batch   425  of  1,001.
  Batch   450  of  1,001.
  Batch   475  of  1,001.
  Batch   500  of  1,001.
  Batch   525  of  1,001.
  Batch   550  of  1,001.
  Batch   575  of  1,001.
  Batch   600  of  1,001.
  Batch   625  of  1,001.
  Batch   650  of  1,001.
  Batch   675  of  1,001.
  Batch   700  of  1,001.
  Batch   725  of  1,001.
  Batch   750  of  1,001.
  Batch   775  of  1,001.
  Batch   800  of  1,001.
  Batch   825  of  1,001.
  Batch   850  of  1,001.
  Batch   875  of  1,001.
  Batch   900  of  1,001.
  Batch   925  of  1,001.
  Batch   950  of  1,001

### Saving the model as a pkl file

In [None]:
# Destination on the mounted Google Drive.
# NOTE(review): the commented-out loading cell under "Evaluation" reads from
# '/content/drive/MyDrive/NLP-FINAL/All-data-untagged-256.pkl' (extra NLP-FINAL
# folder) — confirm which of the two paths is intended.
file_path = '/content/drive/MyDrive/All-data-untagged-256.pkl'

# Save the model using pickle
# (the whole model object is serialized here, not only its state_dict)
with open(file_path, 'wb') as file:
    pickle.dump(model, file)

### Evaluation

In [None]:
# # Load the model
# file_path = '/content/drive/MyDrive/NLP-FINAL/All-data-untagged-256.pkl'
# with open(file_path, 'rb') as file:
#     model = pickle.load(file)

In [None]:
# Divide the test set into quarters so that each evaluation pass handles a smaller chunk.
quarter_size = len(test_seq) // 4
split_points = (quarter_size, quarter_size * 2, quarter_size * 3)

# Split inputs, masks and labels at the same row indices; the fourth part also
# receives the remainder rows when the size is not a multiple of 4.
# torch.tensor_split is the native tensor API (np.split only worked on these
# tensors through NumPy's duck-typing).
test_seq_1, test_seq_2, test_seq_3, test_seq_4 = torch.tensor_split(test_seq, split_points)
test_mask_1, test_mask_2, test_mask_3, test_mask_4 = torch.tensor_split(test_mask, split_points)
test_y_1, test_y_2, test_y_3, test_y_4 = torch.tensor_split(test_y, split_points)

In [None]:
test_seq_parts = [test_seq_1, test_seq_2, test_seq_3, test_seq_4]
test_mask_parts = [test_mask_1, test_mask_2, test_mask_3, test_mask_4]
test_y_parts = [test_y_1, test_y_2, test_y_3, test_y_4]


def predict_classes(model, seq, mask, batch_size=BATCH_SIZE):
    """Predict a class index for every row of `seq`, running the model in mini-batches.

    Feeding a whole test chunk to the model in one forward pass needs memory
    proportional to the chunk size; mini-batches keep the peak usage bounded.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(sent_id, mask)``; returns one score per class and row.
    seq, mask : torch.Tensor
        Token ids and attention masks with the same number of rows.
    batch_size : int
        Number of rows per forward pass.

    Returns
    -------
    numpy.ndarray
        Index of the highest-scoring class for every row (empty for empty input).
    """
    device = next(model.parameters()).device      # run on whatever device holds the weights
    model.eval()                                  # deactivate dropout for inference
    score_chunks = []
    with torch.no_grad():
        for start in range(0, len(seq), batch_size):
            stop = start + batch_size
            scores = model(seq[start:stop].to(device), mask[start:stop].to(device))
            score_chunks.append(scores.detach().cpu().numpy())
    if not score_chunks:
        return np.empty(0, dtype=np.int64)
    return np.argmax(np.concatenate(score_chunks, axis=0), axis=1)


all_preds = []
for test_seq_part, test_mask_part, test_y_part in zip(test_seq_parts, test_mask_parts, test_y_parts):
    preds = predict_classes(model, test_seq_part, test_mask_part)
    all_preds.append(preds)
    print("Test Evaluation:")
    print(classification_report(test_y_part, preds))

# The per-quarter reports only cover a slice each; this one covers the full test set.
print("Overall Test Evaluation:")
print(classification_report(torch.cat(test_y_parts).numpy(), np.concatenate(all_preds)))

KeyboardInterrupt: ignored