# BERT 

### Connect colab with drive

In [None]:
# from google.colab import files
# uploaded = files.upload()


#IMPORT FILES FROM DRIVE INTO GOOGLE-COLAB:

#STEP-1: Import Libraries

# Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# STEP-2: Authenticate the Colab user and build a Drive client from those credentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# STEP-3: Get the file from Drive using its file ID

# 3.1 Get the file
downloaded = drive.CreateFile({'id':'1UEl2hUf4t8iuHUf_SE7ExlCeWyLS0_Wu'}) # replace with the ID of your own file: take its shareable link and keep only the part after 'id='
downloaded.GetContentFile('ibm_0k-60k.csv')  # file name the content is saved under in Colab

## 1. Check Device

In [None]:
from torch import cuda
# Default to the CPU and switch to the GPU only when torch reports one.
device = 'cpu'
if cuda.is_available():
    device = 'cuda'
device

## 2. Install pytorch interface for bert model and required libraries
At the moment, the Hugging Face library seems to be the most widely accepted and powerful pytorch interface for working with BERT.

In [None]:
# ! pip install pytorch-pretrained-bert pytorch-nlp

In [None]:
! pip install transformers

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn import metrics
from transformers import BertTokenizer, BertConfig
from transformers import BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import numpy as np
import random
import io
import matplotlib.pyplot as plt
torch.cuda.empty_cache()
import transformers
from transformers import get_linear_schedule_with_warmup

% matplotlib inline

In [None]:
# One seed shared by every random number generator used in this notebook,
# so that repeated runs draw the same random numbers.
seed_val = 42

for _seed_fn in (random.seed,
                 np.random.seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

In order for torch to use the GPU, we need to identify and specify the GPU as the device. Later, in our training loop, we will load data onto the device.

In [None]:
torch.cuda.empty_cache()

## 3. Load Data

### Upload file


In [None]:
# df = pd.read_csv("in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
df = pd.read_csv("ibm_0k-60k.csv")
df.head()

In [None]:
# df = df.sample(1000)

df.Text = df.Text.astype(str)

Our target variable is 'ibm_sent', which contains 7 sentiments, so this is a multiclass classification problem.

### Converting labels to proper format

Code for Label encoding

In [None]:
# Cast the label column to text, then treat the literal string "nan"
# (what astype(str) yields for missing values) as the "Neutral" class.
df["ibm_sent"] = df["ibm_sent"].astype(str).replace("nan", "Neutral")
t = df["ibm_sent"].unique()
print(t)

def la(sent):
    """Return the integer class id for a sentiment name.

    Names that are not one of the seven known sentiments yield ``None``.
    """
    label_ids = {
        "Sadness": 0,
        "Confident": 1,
        "Neutral": 2,
        "Joy": 3,
        "Analytical": 4,
        "Anger": 5,
        "Fear": 6,
    }
    return label_ids.get(sent)

df['ibm_sent_cat'] = df['ibm_sent'].apply(la)
labels = df['ibm_sent_cat'].values


Code for converting the labels to one hot encoding

In [None]:
def one_hot(sent):
    """Return a 7-element list with a 1 at position ``sent`` and 0 elsewhere.

    A value that matches none of the ids 0..6 gives an all-zero list.
    """
    return [1 if sent == class_id else 0 for class_id in range(7)]

df['one_hot_sent'] = df['ibm_sent_cat'].apply(one_hot)
df['one_hot_sent'].head()

## Tokenization

### configs

In [None]:
# Hyperparameters
MAX_LEN = 128                             # max_length handed to the tokenizer
train_batch_size = test_batch_size = 32   # batch size for both loaders
epoches = 2                               # number of passes over the training data

# Tokenizer matching the pretrained checkpoint used below
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
class CustomDataset:
    """Map-style dataset that tokenizes one text row per item.

    Args:
        dataframe: Frame providing a ``Text`` column (input text) and a
            ``one_hot_sent`` column (one target vector per row).
        tokenizer: Object exposing ``encode_plus`` that returns
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.Text.values
        self.targets = dataframe.one_hot_sent.values
        self.max_len = max_len

    def __len__(self):
        """Return the number of text rows."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Encode row ``index`` and return its tensors.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` as ``torch.long``
            tensors and ``targets`` as a ``torch.float`` tensor.
        """
        # Collapse every run of whitespace (including newlines) to one space.
        comment_text = " ".join(str(self.comment_text[index]).split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float),
        }

### Data split into validation and train


In [None]:
# Creating the train/test datasets for the neural network

# Hold out 20% of the rows for testing; random_state makes the split repeatable.
train_size = 0.8
train_dataset = df.sample(frac=train_size, random_state=200)
test_dataset = df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Build each dataset from its own split. Building both from the full `df`
# would make the model train on, and be evaluated on, the same rows.
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)


In [None]:
# Keyword arguments for the two DataLoaders (kept as dicts so they can be inspected later).
train_params = dict(batch_size=train_batch_size, shuffle=True, num_workers=0)
test_params = dict(batch_size=test_batch_size, shuffle=True, num_workers=0)

train_loader = DataLoader(training_set, **train_params)
test_loader = DataLoader(testing_set, **test_params)

In [None]:
len(train_loader)

In [None]:
for step, data in enumerate(train_loader):
  print(data['ids'])
  break

In [None]:
data['ids'].size()

## BertForSequenceClassification

In [None]:
import gc 

# Your code with pytorch using GPU

gc.collect()


In [None]:
class BERTclass(torch.nn.Module):
    """Wrapper around ``BertForSequenceClassification`` with 7 output labels.

    The pretrained ``bert-base-uncased`` model is stored on ``self.model``.
    """

    def __init__(self):
        super().__init__()
        self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=7)

    def forward(self, ids, mask, token_type_ids=None, labels=None):
        """Run the wrapped model and return its output unchanged.

        Args:
            ids: Token ids, passed as ``input_ids``.
            mask: Attention mask, passed as ``attention_mask``.
            token_type_ids: Segment ids, passed through when given.
            labels: Accepted so training and evaluation can share one call
                signature. It is deliberately NOT forwarded: the loss is
                computed outside the model, and callers index ``output[0]``
                expecting the logits.

        Returns:
            Whatever the wrapped model returns for a call without labels.
        """
        return self.model(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
        )


model = BERTclass()
model.to(device)

In [None]:
# del model


Now that we have our model loaded we need to grab the training hyperparameters from within the stored model.

For the purposes of fine-tuning, the authors recommend the following hyperparameter ranges:
- Batch size: 16, 32
- Learning rate (Adam): 5e-5, 3e-5, 2e-5
- Number of epochs: 2, 3, 4

In [None]:
# Collect (name, tensor) pairs for every parameter of the model.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# Each section is a header line plus the slice of parameters listed under it.
param_sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for section_header, section_params in param_sections:
    print(section_header)
    for param_name, param_tensor in section_params:
        # Name left-aligned, shape tuple right-aligned.
        print("{:<55} {:>12}".format(param_name, str(tuple(param_tensor.size()))))

## Optimizer & Learning Rate Scheduler

### Optimizer

In [None]:
param_optimizer = list(model.named_parameters())

# Biases and LayerNorm weights are excluded from weight decay. Hugging Face BERT
# names them '...bias' and '...LayerNorm.weight' (not 'gamma'/'beta').
no_decay = ['bias', 'LayerNorm.weight']

# The per-group key must be 'weight_decay'. An unrecognised key such as
# 'weight_decay_rate' is ignored by AdamW, so no decay would be applied at all.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]

# Use the PyTorch implementation of AdamW (Adam with decoupled weight decay);
# the copy that shipped in `transformers` is deprecated in favour of this one.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                              lr=2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
                              eps=1e-8,  # args.adam_epsilon  - default is 1e-8.
                              )


Helper function for formatting elapsed times.

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Convert a duration in seconds to the string form of a ``timedelta``.

    The value is rounded to the nearest whole second first, so the result
    looks like ``h:mm:ss`` (with a leading day count for a day or longer).
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

### Accuracy 

In [None]:
def acc(preds, labels):
    """Return the fraction of rows whose highest-scoring column equals the label.

    ``preds`` holds one row of scores per sample; ``labels`` is flattened to
    one dimension before the comparison.
    """
    predicted = np.argmax(preds, axis=1)
    expected = labels.flatten()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

### Scheduler

In [None]:
# Total number of training steps is number of batches * number of epochs.
# scheduler.step() runs once per batch, so count batches (len(train_loader)),
# not samples (len(training_set)); otherwise the linear decay is stretched
# by the batch size and the learning rate barely moves.
total_steps = len(train_loader) * epoches

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
import random
seed_val = 42

# Re-seed every random number generator right before training so the run is repeatable.
for _seed_fn in (random.seed,
                 np.random.seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

In [None]:
def loss_fn(outputs, targets):
    """Return the BCE-with-logits loss between raw model outputs and targets."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

## Train Model

In [None]:
def train_fn(epoch):
    """Train the global ``model`` for one pass over ``train_loader``.

    Args:
        epoch: Epoch number, used only in the progress message.
    """
    # val_fn() switches the model to eval mode; make sure dropout is active again.
    model.train()

    for step, data in enumerate(train_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        outputs = model(ids, mask, token_type_ids, labels=targets)
        # outputs[0] is passed to BCEWithLogitsLoss together with the one-hot targets.
        loss = loss_fn(outputs[0], targets)

        # Report progress every 100 steps (the first report is at step 0).
        if step % 100 == 0:
            print(f'epoch : {epoch}    loss : {loss}')

        loss.backward()
        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

def val_fn(epoch):
    """Run the global ``model`` over ``test_loader`` without gradient tracking.

    Args:
        epoch: Unused; kept so the call signature mirrors ``train_fn``.

    Returns:
        ``(fin_outputs, fin_targets)``: two lists with one entry per sample,
        the sigmoid of the model's scores and the corresponding target vector.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for data in test_loader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            # Same call shape as in train_fn, so the model's forward signature is satisfied.
            outputs = model(ids, mask, token_type_ids, labels=targets)
            # The first element holds the raw scores; sigmoid needs a tensor,
            # not the whole model output.
            logits = outputs[0]

            fin_targets.extend(targets.cpu().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(logits).cpu().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
# train: one train_fn() call per epoch
for epoch_number in range(epoches):
    train_fn(epoch_number)
  

In [None]:
# validation
# Uses the names actually defined in this notebook: `epoches` and `val_fn`.
# NOTE(review): the model does not change between iterations, so every pass
# evaluates the same weights - confirm whether a single pass was intended.
for epoch in range(epoches):
    outputs, targets = val_fn(epoch)
    # Threshold the sigmoid scores at 0.5 to get a 0/1 prediction per class.
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

## Saving & Loading Fine-Tuned Model


### To colab itself

In [None]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = './model_save/'

# Create output directory if needed
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
# BERTclass is a plain nn.Module; only the Hugging Face model it stores on
# `.model` provides save_pretrained(). Fall back to the object itself when
# `model` is already the bare Hugging Face model.
model_to_save = getattr(model_to_save, 'model', model_to_save)
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

### To Drive

In [None]:
from google.colab import drive
drive.mount('drive')
dr = "drive/My Drive/BERT/"

model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
# BERTclass is a plain nn.Module; only the Hugging Face model it stores on
# `.model` provides save_pretrained(). Fall back to the object itself when
# `model` is already the bare Hugging Face model.
model_to_save = getattr(model_to_save, 'model', model_to_save)
model_to_save.save_pretrained(dr)
tokenizer.save_pretrained(dr)

### Load Model

The following functions will load the model back from disk.

In [None]:
# Load a trained model and vocabulary that you have fine-tuned.
# Rebuild the same wrapper used for training, then swap in the fine-tuned
# weights, so train_fn()/val_fn() keep working with the reloaded model.
model = BERTclass()
model.model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

# Copy the model to the GPU.
model.to(device)