# Fine tuning
## **SNLP team project**

In this notebook, we fine-tune a pre-trained BERT model to classify preprocessed tweets into clusters.

**This notebook implements the following functionality:**
* Setup
* Define methods and classes
* Load preprocessed tweet data 
* Tokenize tweets
* Finetune / train model

## Setup


In [73]:
# Install (or upgrade) the packages this notebook needs on the Colab runtime.
# NOTE(review): versions are not pinned, so a fresh run may install newer
# releases than the ones recorded in the watermark output of this cell.
!pip install -q -U watermark
!pip install -qq transformers

# Print the Python/IPython versions and the versions of the listed packages.
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

Python implementation: CPython
Python version       : 3.7.10
IPython version      : 5.5.0

numpy       : 1.19.5
pandas      : 1.1.5
torch       : 1.8.1+cu101
transformers: 4.5.1



In [74]:
# External utils
import csv
import time
import torch
import pickle 
import pandas as pd
import numpy as np
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Torch libs
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, AutoModel
from transformers.optimization import get_linear_schedule_with_warmup
from transformers import BertModel

# Google drive integration
from google.colab import drive

In [75]:
#@title Setup & Config
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Render matplotlib figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Global seaborn/matplotlib styling used by every plot in the notebook.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
# Seed the NumPy and PyTorch RNGs.
# NOTE(review): RANDOM_SEED is reassigned to 1234 in the hyperparameter cell
# further down, so the train/val/test split uses a different seed than the one
# applied to the RNGs here.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [76]:
# Mount Google Drive at /content/gdrive (forcing a remount if it is already
# mounted); PROJECT_PATH defined further down points into this mount.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


## Define classes

In [77]:
class TwitterClimateDataset(Dataset):
  """Map-style dataset that tokenizes one tweet per item.

  Args:
    df_clean_sample: DataFrame with a 'content' column (tweet text) and a
      'Cluster' column (integer class label).
    tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
    max_len: length every encoded tweet is padded / truncated to.
  """

  def __init__(self, df_clean_sample, tokenizer, max_len):
    # Keep plain arrays so items are looked up by position, not by the
    # DataFrame index (which is no longer contiguous after a split).
    self.tweets = df_clean_sample['content'].values
    self.clusters = df_clean_sample['Cluster'].values
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of tweets."""
    return len(self.tweets)

  def __getitem__(self, item):
    """Return the raw text, token ids, attention mask and label of one tweet."""
    tweet = str(self.tweets[item])
    cluster = self.clusters[item]

    # `padding='max_length'` already requests fixed-length padding; the
    # deprecated, redundant `pad_to_max_length=True` flag is no longer passed.
    encoding = self.tokenizer.encode_plus(
      tweet,
      add_special_tokens=True,
      padding='max_length',
      truncation=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'content': tweet,
      # return_tensors='pt' adds a leading batch axis; flatten it away so the
      # DataLoader can stack the items itself.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'clusters': torch.tensor(cluster, dtype=torch.long)
    }

class SentimentClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear classification head.

  Args:
    n_classes: number of output logits produced by the head.
    dropout: dropout probability applied to the pooled BERT output.
  """

  def __init__(self, n_classes, dropout):
    super().__init__()
    # Encoder weights are loaded from the module-level PRE_TRAINED_MODEL_NAME.
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=dropout)
    hidden_size = self.bert.config.hidden_size
    self.out = nn.Linear(hidden_size, n_classes)

  def forward(self, input_ids, attention_mask, **kwargs):
    """Return the classification logits for a batch; extra kwargs are ignored."""
    bert_outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    # The second element of the BERT output is what feeds the classifier head.
    pooled = bert_outputs[1]
    regularized = self.drop(pooled)
    return self.out(regularized)

## Define routines

In [78]:
def create_data_loader(df, tokenizer, batch_size, shuffle=False, num_workers=2):
  """Wrap a tweet DataFrame in a TwitterClimateDataset and a DataLoader.

  Args:
    df: DataFrame passed to TwitterClimateDataset.
    tokenizer: tokenizer passed to TwitterClimateDataset.
    batch_size: number of samples per batch.
    shuffle: reshuffle the samples every epoch. Defaults to False, which keeps
      the original behaviour; pass True for a training loader.
    num_workers: number of loader worker processes (default 2, as before).

  Returns:
    A DataLoader over the encoded tweets, padded/truncated to MAX_SEQ_LEN.
  """
  ds = TwitterClimateDataset(
    df,
    tokenizer=tokenizer,
    max_len=MAX_SEQ_LEN
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

def train_epoch(
    model, 
    data_loader, 
    val_loader,
    loss_fn, 
    optimizer, 
    device, 
    scheduler, 
    n_examples_train,
    n_examples_val
  ):
  """Train `model` for one pass over `data_loader`, validating periodically.

  Roughly every 5% of the batches (and always on the first batch) the whole
  validation loader is evaluated and the current batch's training loss and
  accuracy are recorded and printed.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: training batches with 'input_ids', 'attention_mask', 'clusters'.
    val_loader: validation batches, passed to ``eval_model``.
    loss_fn: loss called as ``loss_fn(outputs, targets)``.
    optimizer: optimizer stepped after every batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped after every batch.
    n_examples_train: divisor for the epoch-level training accuracy.
    n_examples_val: divisor for the validation accuracy.

  Returns:
    Tuple ``(epoch_train_acc, train_losses, train_accs, val_accs, val_losses,
    batch_nums)``. The first element is a tensor; the lists hold one entry per
    logging step, with ``batch_nums`` giving the batch index of each entry.
  """
  model = model.train()
  train_losses = []
  # Tensor accumulator so `.double()` below also works for an empty loader.
  correct_predictions = torch.tensor(0, device=device)
  train_accs = []
  val_accs = []
  val_losses = []
  batch_nums = []
  num_batches = len(data_loader)
  # At least 1: with fewer than 20 batches the 5% interval would floor to 0
  # and `i % 0` would raise ZeroDivisionError.
  every_n_batches = max(1, int(np.floor(num_batches * 0.05)))
  for i, d in enumerate(data_loader):
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["clusters"].to(device)
    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)
    if i % every_n_batches == 0:
        val_acc, val_loss = eval_model(model, val_loader, loss_fn, device, n_examples_val)
        # eval_model puts the module in eval mode; switch back so dropout
        # stays active for the remaining training batches.
        model.train()
        train_losses.append(loss.item())
        train_acc = torch.sum(preds == targets) / preds.shape[0]
        train_accs.append(train_acc.cpu().item())
        val_accs.append(val_acc.cpu().item())
        val_losses.append(val_loss)
        batch_nums.append(i)
        print(f"[batch {batch_nums[-1]}/{num_batches}]\n training loss: {train_losses[-1]},\n training acc: {train_accs[-1]},\n val_loss: {val_losses[-1]},\n val_acc: {val_accs[-1]}\n\n")
    correct_predictions += torch.sum(preds == targets)
    loss.backward()
    # Clip the gradient norm before the optimizer step.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  return correct_predictions.double() / n_examples_train, train_losses, train_accs, val_accs, val_losses, batch_nums

def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate `model` on `data_loader` without tracking gradients.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: iterable of batches with 'input_ids', 'attention_mask' and
      'clusters' tensors; must support ``len()``.
    loss_fn: loss called as ``loss_fn(outputs, targets)``.
    device: device the batch tensors are moved to.
    n_examples: divisor for the accuracy (number of evaluated samples).

  Returns:
    Tuple ``(accuracy, mean_loss)``. ``accuracy`` is a tensor. ``mean_loss`` is
    the mean loss of the sampled batches (roughly every 5% of the batches, not
    all of them), or NaN when the loader is empty.
  """
  # Remember the caller's mode so evaluating in the middle of training does
  # not leave the model stuck in eval mode (dropout disabled) afterwards.
  was_training = model.training
  model.eval()
  losses = []
  # Tensor accumulator so `.double()` below also works for an empty loader.
  correct_predictions = torch.tensor(0, device=device)
  num_batches = len(data_loader)
  # At least 1: with fewer than 20 batches the 5% interval would floor to 0
  # and `i % 0` would raise ZeroDivisionError.
  every_n_batches = max(1, int(np.floor(num_batches * 0.05)))
  try:
    with torch.no_grad():
      for i, d in enumerate(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["clusters"].to(device)
        outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        if i % every_n_batches == 0:
          losses.append(loss.item())
        correct_predictions += torch.sum(preds == targets)
  finally:
    model.train(was_training)
  mean_loss = np.mean(losses) if losses else float('nan')
  return correct_predictions.double() / n_examples, mean_loss

def print_model_stats(history, test_acc):
    """Print the recorded training/validation metrics and the test accuracy.

    Args:
        history: mapping with 'train_acc', 'train_loss', 'val_acc' and
            'val_loss' entries, one element per epoch. Each 'val_acc' element
            may be a single number or a sequence of accuracies recorded during
            that epoch.
        test_acc: final test accuracy, printed as-is.
    """
    print('Training accuracy:', history['train_acc'])
    print('Training loss:', history['train_loss'])
    print('Validation accuracy:', history['val_acc'])
    print('Validation loss:', history['val_loss'])
    print(50*'--')
    # Reduce every epoch to its best accuracy first. Taking argmax over the
    # raw (possibly nested) list gives a flattened index, not an epoch number.
    best_per_epoch = [
        np.max(epoch_accs) if np.size(epoch_accs) else -np.inf
        for epoch_accs in history['val_acc']
    ]
    if best_per_epoch:
        best_epoch = int(np.argmax(best_per_epoch))
        print('Highest validation accuracy and epoch:', best_per_epoch[best_epoch], best_epoch + 1)
    else:
        # Nothing was recorded (e.g. training was skipped).
        print('Highest validation accuracy and epoch:', None, None)
    print('Final test accuracy:', test_acc)

def encode_tweets(tweets):
  """Tokenize each tweet with the module-level `tokenizer`.

  Uses the same fixed-length encoding as TwitterClimateDataset: every tweet is
  padded and truncated to MAX_SEQ_LEN. `padding='max_length'` replaces the
  deprecated `pad_to_max_length=True`, and truncation is now enabled so an
  over-long tweet cannot produce a longer sequence.

  Args:
    tweets: iterable of tweet strings.

  Returns:
    List of ``(encoding, tweet)`` pairs, in input order.
  """
  return [(tokenizer.encode_plus(
      tweet,
      add_special_tokens=True,
      padding='max_length',
      truncation=True,
      max_length=MAX_SEQ_LEN,
      return_token_type_ids=False,
      return_attention_mask=True,
      return_tensors='pt'
), tweet) for tweet in tweets]

def predict_sentiment(encoded_tweets, n_sample):
  """Predict a class index for every encoded tweet with the global `model`.

  Args:
    encoded_tweets: iterable of ``(encoding, tweet)`` pairs as produced by
      ``encode_tweets``; each encoding provides 'input_ids' and
      'attention_mask' tensors.
    n_sample: length of the returned array.

  Returns:
    Float array of length `n_sample` holding the predicted class indices.
    Slots without a prediction (fewer tweets than `n_sample`) are NaN rather
    than uninitialized memory.
  """
  pred_labels = np.full(n_sample, np.nan)
  # Inference: disable dropout and gradient tracking, then restore whatever
  # mode the model was in.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for i, (encoded_tweet, _tweet) in enumerate(encoded_tweets):
        input_ids = encoded_tweet['input_ids'].to(device)
        attention_mask = encoded_tweet['attention_mask'].to(device)

        output = model(input_ids, attention_mask)
        _, prediction = torch.max(output, dim=1)

        pred_labels[i] = prediction.item()
  finally:
    model.train(was_training)

  return pred_labels

## Define hyperparameters and env vars

In [79]:
# Define training vs evaluating
# When True, the training cell at the end of the notebook does nothing.
skip_training = True

# Hyperparams
LR = 2e-5 # Learning rate
EPOCHS = 2  # Used to compute the scheduler's total number of training steps
DROPOUT = 0.3  # Dropout probability of the classifier head
N_CLUSTERS = 2  # Number of output classes of the classifier
BATCH_SIZE = 64
MAX_SEQ_LEN = 60  # Length every tweet is padded to when tokenized
WARMUP_STEPS = 0  # Warm-up steps of the linear learning-rate schedule
# Only the first element of each tuple is read below (as `test_size`):
# 20% of the data is held out, and half of that hold-out becomes the test set.
TEST_TRAIN_SPLIT = (0.2,0.8)
TEST_VALIDATION_SPLIT = (0.5,0.5)
# NOTE(review): this replaces the RANDOM_SEED = 42 set in the setup cell; the
# NumPy/PyTorch RNGs were seeded with 42, the data splits below use 1234.
RANDOM_SEED = 1234

# Paths and model names
DATA_FOLDER_PATH = 'data/preprocessed_tweets/'  # Relative to PROJECT_PATH
PROJECT_PATH = "/content/gdrive/MyDrive/SNLP project/"  # Inside the mounted Drive

# Checkpoint loaded by BertModel / AutoTokenizer
PRE_TRAINED_MODEL_NAME = 'digitalepidemiologylab/covid-twitter-bert-v2'
# Filename prefix for the saved model states and the history pickle
SAVE_MODEL_NAME = 'covid-twitter-bert-v2'

## Load and prepare data

In [80]:
# Load the preprocessed tweets from Drive; the 'content' and 'Cluster' columns
# of this frame are the ones read by TwitterClimateDataset.
df_clean_sample = pd.read_csv(PROJECT_PATH + DATA_FOLDER_PATH + 'df_clean_sampled_binary.csv')

In [81]:
# First carve a hold-out set off the full sample ...
df_train, df_holdout = train_test_split(
    df_clean_sample,
    test_size=TEST_TRAIN_SPLIT[0],
    random_state=RANDOM_SEED,
)
# ... then divide the hold-out into validation and test parts.
df_val, df_test = train_test_split(
    df_holdout,
    test_size=TEST_VALIDATION_SPLIT[0],
    random_state=RANDOM_SEED,
)

# One tokenizer shared by all three loaders.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(frame, tokenizer, BATCH_SIZE)
    for frame in (df_train, df_val, df_test)
)

# Show the sizes of the three splits.
tuple(frame.shape for frame in (df_train, df_val, df_test))

((39678, 9), (4960, 9), (4960, 9))

## Train the model

In [82]:
%%time
if not skip_training:
  model = SentimentClassifier(N_CLUSTERS, DROPOUT)
  model = model.to(device)

  optimizer = AdamW(model.parameters(), lr=LR)
  total_steps = len(train_data_loader) * EPOCHS

  scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_steps
  )

  loss_fn = nn.CrossEntropyLoss().to(device)
  
  history = defaultdict(list)

  for epoch in range(2):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    train_acc, train_losses, train_accs, val_accs, val_losses, batch_nums = train_epoch(
      model,
      train_data_loader,
      val_data_loader,
      loss_fn, 
      optimizer, 
      device, 
      scheduler, 
      df_train.shape[0],
      df_val.shape[0]
    )

    test_acc, _ = eval_model(
      model,
      test_data_loader,
      loss_fn,
      device,
      len(df_test)
    )

    # Append to history
    history['train_acc'].append(train_acc.cpu().numpy()) 
    history['train_loss'].append(train_losses)
    history['val_acc'].append(val_accs)
    history['val_loss'].append(val_losses)
    history['test_acc'].append(test_acc.cpu().numpy())
    
    # Save model state
    models_path = PROJECT_PATH + 'models/' 
    torch.save(model.state_dict(), models_path + f'{SAVE_MODEL_NAME}_epoch_{epoch}.bin')

  # Save history
  with open(PROJECT_PATH + '/histories/' + f'{SAVE_MODEL_NAME}-history.pickle', 'wb') as file:
    pickle.dump(history, file)

  print_model_stats(history, test_acc.cpu().numpy())
  print(50*'--')

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.68 µs
