#  BART  Classifier 



In [2]:
#!pip install psutil
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Collecting psutil
  Downloading psutil-5.9.1-cp37-cp37m-win_amd64.whl (246 kB)
Installing collected packages: psutil
Successfully installed psutil-5.9.1
Your runtime has 12.9 gigabytes of available RAM

To enable a high-RAM runtime, select the Runtime > "Change runtime type"
menu, and then select High-RAM in the Runtime shape dropdown. Then, 
re-execute this cell.


In [39]:
!pip install -q transformers

In [40]:
# Libraries and transformers models
import torch
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm.notebook import tqdm
from transformers import BartTokenizer, BartForSequenceClassification
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
import random
from sklearn.model_selection import train_test_split

In [71]:
# loading dataset directly from google drive
from google.colab import drive
drive.mount('/content/gdrive')
df = pd.read_csv('/content/gdrive/MyDrive/DL_1/data/YTS.csv')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [74]:
train_df, test_df = train_test_split(df, random_state = 42, train_size = 0.8, stratify = df.CLASS.values)
train_df.shape, test_df.shape

((280, 2), (70, 2))

In [75]:
train_df, valid_df = train_test_split(train_df, random_state = 42, train_size = 0.8, stratify = train_df.CLASS.values)
train_df.shape, test_df.shape, valid_df.shape

((224, 2), (70, 2), (56, 2))

In [76]:
possible_labels = train_df.CLASS.unique()
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [77]:
#creating the tokenizer instance
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

In [78]:
#encoding 
encoded_data_train = tokenizer.batch_encode_plus( 
    train_df.CONTENT.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding = 'max_length',
    truncation = True,
    max_length=256, 
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

labels_train = torch.tensor(train_df.CLASS.values)

In [81]:
#encoding
encoded_data_val = tokenizer.batch_encode_plus(
    valid_df.CONTENT.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding = 'max_length',
    truncation = True,
    max_length=256, 
    return_tensors='pt'
)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

labels_val = torch.tensor(valid_df.CLASS.values)

In [84]:
#encoding
encoded_data_test = tokenizer.batch_encode_plus(
    test_df.CONTENT.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding = 'max_length',
    truncation = True,
    max_length=256, 
    return_tensors='pt'
)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']

labels_test = torch.tensor(test_df.CLASS.values)

In [88]:

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [90]:
batch_size = 20

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

dataloader_test = DataLoader(dataset_test, 
                                   sampler=SequentialSampler(dataset_test), 
                                   batch_size=batch_size)

In [91]:
model = BartForSequenceClassification.from_pretrained('facebook/bart-base',
                                                      num_labels=len(train_df.CLASS.unique()),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)
epochs = 10

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

Some weights of the model checkpoint at facebook/bart-base were not used when initializing BartForSequenceClassification: ['final_logits_bias']
- This IS expected if you are initializing BartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.weight', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to u

In [96]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [98]:
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [99]:
#using GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [100]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [101]:
torch.cuda.empty_cache
#emptying GPU Cache
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'finetuned_BART_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {round(loss_train_avg, 2)}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {round(val_loss,2)}')
    tqdm.write(f'F1 Score (Weighted): {round(val_f1, 2)}')

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=12.0, style=ProgressStyle(description_width…


Epoch 1
Training loss: 0.65
Validation loss: 0.64
F1 Score (Weighted): 0.61


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=12.0, style=ProgressStyle(description_width…


Epoch 2
Training loss: 0.54
Validation loss: 0.52
F1 Score (Weighted): 0.75


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=12.0, style=ProgressStyle(description_width…


Epoch 3
Training loss: 0.39
Validation loss: 0.34
F1 Score (Weighted): 0.87


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=12.0, style=ProgressStyle(description_width…


Epoch 4
Training loss: 0.24
Validation loss: 0.22
F1 Score (Weighted): 0.87


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=12.0, style=ProgressStyle(description_width…


Epoch 5
Training loss: 0.15
Validation loss: 0.14
F1 Score (Weighted): 0.96


HBox(children=(FloatProgress(value=0.0, description='Epoch 6', max=12.0, style=ProgressStyle(description_width…


Epoch 6
Training loss: 0.07
Validation loss: 0.07
F1 Score (Weighted): 0.96


HBox(children=(FloatProgress(value=0.0, description='Epoch 7', max=12.0, style=ProgressStyle(description_width…


Epoch 7
Training loss: 0.08
Validation loss: 0.06
F1 Score (Weighted): 0.96


HBox(children=(FloatProgress(value=0.0, description='Epoch 8', max=12.0, style=ProgressStyle(description_width…


Epoch 8
Training loss: 0.04
Validation loss: 0.06
F1 Score (Weighted): 0.96


HBox(children=(FloatProgress(value=0.0, description='Epoch 9', max=12.0, style=ProgressStyle(description_width…


Epoch 9
Training loss: 0.04
Validation loss: 0.06
F1 Score (Weighted): 0.96


HBox(children=(FloatProgress(value=0.0, description='Epoch 10', max=12.0, style=ProgressStyle(description_widt…


Epoch 10
Training loss: 0.03
Validation loss: 0.05
F1 Score (Weighted): 0.96



In [109]:
#evaluating with validation set
_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Class: 1
Accuracy: 27/28

Class: 0
Accuracy: 27/28



In [111]:
#evaluating test set 
_, predictions, true_vals = evaluate(dataloader_test)
accuracy_per_class(predictions, true_vals)

Class: 1
Accuracy: 33/35

Class: 0
Accuracy: 35/35

