In [1]:
import pandas as pd
import evaluate
import numpy as np
import matplotlib.pyplot as plt

# from sklearn.metrics import precision_recall_fscore_support, accuracy_score, log_loss
from transformers import AutoTokenizer, DataCollatorWithPadding, DataCollatorWithPadding, DistilBertModel
from datasets import Dataset as dsDataset, DatasetDict as dsDatasetDict
from copy import deepcopy
import torch
from torch import nn
from torch import cuda
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm

import time


In [2]:
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
TRAIN_CSV = "data/train.csv"
VALID_CSV = "data/valid.csv"

PRE_TRAINED_MODEL = "distilbert-base-uncased"

SEED = 4332
torch.manual_seed(SEED)

MAP_DOWN = {
    1: 0,
    2: 1,
    3: 2,
    4: 3,
    5: 4
}

MAP_UP = {
    0: 1,
    1: 2,
    2: 3,
    3: 4,
    4: 5
}


id2label = {0: "WORST", 1: "BAD", 2:"NEUTRAL", 3: "GOOD", 4:"EXCELLENT"}
label2id = {"WORST": 0, "BAD": 1, "NEUTRAL": 2, "GOOD": 3, "EXCELLENT": 4}


tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL)
# tokenizer = AutoTokenizer.from_pretrained("roberta-base")


In [4]:
def load_data(file_path):
    df = pd.read_csv(file_path)[['text','label']]
    df['label'] = df['label'].map(MAP_DOWN)
    return df.to_dict('records')

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)


In [5]:
train_data = load_data(TRAIN_CSV)
valid_data = load_data(VALID_CSV)

# Create a Dataset Dictionary object for hugging face's pipeline
data = dsDatasetDict({"train": dsDataset.from_list(train_data), "validation": dsDataset.from_list(valid_data)})
tokenized_data = data.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [6]:
tokenized_data = tokenized_data.remove_columns(["text"])
tokenized_data = tokenized_data.rename_column("label", "labels")
tokenized_data['train'].column_names

['labels', 'input_ids', 'attention_mask']

In [7]:
train_dataloader = DataLoader(
    tokenized_data["train"], batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_data["validation"], batch_size=8, collate_fn=data_collator
)

In [8]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 54]),
 'attention_mask': torch.Size([8, 54])}

In [9]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = DistilBERTClass()
model.to(device)

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [10]:
# MAX_LEN = 512
# TRAIN_BATCH_SIZE = 8
# VALID_BATCH_SIZE = 8
EPOCHS = 5
LEARNING_RATE = 1e-05

In [11]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [12]:
# Function to calcuate the accuracy of the model
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [13]:
for _,data in enumerate(train_dataloader, 0):
    print(data.keys())
    break

dict_keys(['labels', 'input_ids', 'attention_mask'])


In [14]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(train_dataloader, 0):
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        targets = data['labels'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        # if _%5000==0:
        #     loss_step = tr_loss/nb_tr_steps
        #     accu_step = (n_correct*100)/nb_tr_examples 
        #     print(f"Training Loss per 5000 steps: {loss_step}")
        #     print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    # print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    # print(f"Training Loss Epoch: {epoch_loss}")
    # print(f"Training Accuracy Epoch: {epoch_accu}")

    return epoch_loss, epoch_accu

In [15]:
def valid(model, testing_loader):
    model.eval()
    tr_loss = 0
    nb_tr_steps = 0
    n_correct = 0
    nb_tr_examples = 0
    n_wrong = 0
    total = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            targets = data['labels'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            # if _%5000==0:
            #     loss_step = tr_loss/nb_tr_steps
            #     accu_step = (n_correct*100)/nb_tr_examples
            #     print(f"Validation Loss per 100 steps: {loss_step}")
            #     print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    # print(f"Validation Loss Epoch: {epoch_loss}")
    # print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_loss, epoch_accu


In [16]:
print("Start Training")
st = time.time()
for epoch in range(EPOCHS):
    train_loss, train_acc = train(epoch)
    valid_loss, valid_acc =  valid(model, eval_dataloader)
    print(f"Epoch: {epoch + 1}/{EPOCHS}, Train Loss: {train_loss}, Train Acc: {train_acc}, Train Loss: {valid_loss}, Train Acc: {valid_acc}")
et = time.time()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')


Start Training
Epoch: 1/5, Train Loss: 1.0958697397708892, Train Acc: 52.98888888888889, Train Loss: 0.9925498106479644, Train Acc: 57.1
Epoch: 2/5, Train Loss: 0.9120584240754446, Train Acc: 61.794444444444444, Train Loss: 0.9960116058588028, Train Acc: 58.5
Epoch: 3/5, Train Loss: 0.7551529698504342, Train Acc: 69.33888888888889, Train Loss: 1.0864587901830673, Train Acc: 57.85
Epoch: 4/5, Train Loss: 0.5959751455485821, Train Acc: 77.46111111111111, Train Loss: 1.2142874684333802, Train Acc: 56.65
Epoch: 5/5, Train Loss: 0.4417688514027331, Train Acc: 83.90555555555555, Train Loss: 1.3909447259902954, Train Acc: 54.85
Execution time: 506.3919596672058 seconds


In [19]:
# Saving the files for re-use

output_model_file = 'models/pytorch_distilbert_news.pt'
output_vocab_file = 'models'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

All files saved


In [18]:
# https://hkustconnect-my.sharepoint.com/:f:/g/personal/cceli_connect_ust_hk/EiHpZG56aoNCkHHH5UT32zMBbRv1fRysAYv5Kh3-40xMOQ?e=3oC977

# To do
- Save best model
- Plot graph
- Inference on test.csv