In [1]:
import pandas as pd
import csv
from datasets import Dataset

import os

import tqdm
from dotenv import load_dotenv

import torch
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, 
    TrainingArguments, Trainer, 
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from transformers.utils import logging

import datetime
import numpy as np
import random


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ignoring warnings, info and debug 
logging.set_verbosity_error()

In [3]:
# checking for nvidia gpu (more efficient)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
torch.cuda.empty_cache()
device

device(type='cpu')

In [4]:
# loading environmental variables
load_dotenv()

PARENT_DIR = os.environ.get("PARENT_DIR")

In [5]:
# loading model from HuggingFace
print("Loading model... ", end='', flush=True)
tokeniser = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")
model.to(device)
print('Done')

Loading model... Done


In [6]:
# loading dataset from Huggingface
# raw_dataset = load_dataset("vicgalle/alpaca-gpt4", split="train")
# raw_dataset = raw_dataset.shuffle(seed=42).select(range(10))

# loading dataset from csv file
raw_dataset = None
base_qa = []
augmented_qa = []

with open(PARENT_DIR + "src/data/base_data.csv", 'r') as file:
    csvreader = csv.reader(file)
    for q, a in list(csvreader)[1:]:
        base_qa.append((q,a))

with open(PARENT_DIR + "src/data/processed_data.csv", 'r') as file:
    csvreader = csv.reader(file)
    val = [*filter(lambda v: v, csvreader)][1:]
    for q, a in val:
        augmented_qa.append((q,a))
random.shuffle(augmented_qa)

# ensure that all of the base data is appended, limiting the augmented data as it is more inaccurate
qa = base_qa + base_qa + augmented_qa
random.shuffle(qa)

qa = qa[:2000]

raw_dataset = Dataset.from_dict({"Question" : [q for q, _ in qa], "Answer" : [a for _, a in qa]})

raw_dataset

Dataset({
    features: ['Question', 'Answer'],
    num_rows: 2000
})

In [7]:
raw_dataset[:5]

{'Question': ["Recite me about hci 's account .",
  'Can you picture me a map of hci ?',
  'What leisure time activity are worthy for soul concerned in the hci field ?',
  'Where can i pose notes ?',
  'When be hci founded ?'],
 'Answer': ["Hwa Chong Institution (HCI) is Singapore's independent school with a rich history of over 100 years.  The Institution is the culmination of the watershed merger in 2005 between the former Chinese High School (TCHS) and Hwa Chong Junior College (HCJC). The then Chinese High School (TCHS) was founded by Mr Tan Kah Kee in 1919 to cater to the needs of primary school leavers of the Chinese community in the region. It became the first Chinese-language secondary school in South-east Asia. Entering Singaporeâ€™s post-independence era, TCHS was designated by the Ministry of Education (MOE) as one of nine Special Assistance Plan (SAP) Schools in the nation in 1979, before turning independent in 1987.",
  'Sure! Here it is: https://www.hci.edu.sg/images/hci_m

In [8]:
# reformatting dataset for training
temp_lst = []

for row in tqdm.tqdm(raw_dataset, desc="Re-formatting dataset", unit=" rows"):
    temp_dict = {}
    temp_dict["text"] = row["Question"].strip() + tokeniser.eos_token + row["Answer"].strip() + tokeniser.eos_token
    temp_lst.append(temp_dict)

temp_df = pd.DataFrame(temp_lst, columns=["text"])
temp_df.dropna()

processed_dataset = Dataset.from_pandas(temp_df)
processed_dataset

Re-formatting dataset: 100%|██████████| 2000/2000 [00:00<00:00, 12916.52 rows/s]


Dataset({
    features: ['text'],
    num_rows: 2000
})

In [9]:
# tokenising dataset for training
tokeniser.pad_token = tokeniser.eos_token

def preprocess(example):
    return tokeniser(example["text"], padding=True, truncation=True)

tokenised_dataset = processed_dataset.map(preprocess)

tokenised_dataset = tokenised_dataset.remove_columns(["text"])
tokenised_dataset = tokenised_dataset.with_format("torch", columns=["input_ids", "attention_mask"])
tokenised_dataset = tokenised_dataset.train_test_split(test_size=0.1)
tokenised_dataset

Map: 100%|██████████| 2000/2000 [00:00<00:00, 2470.28 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1800
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [10]:
# creating data collator as substitute for label
print("Creating Data Collator...", end="")
data_collator = DataCollatorForLanguageModeling(tokeniser, mlm=False) # ensure that mode is clm 
print("Done")

Creating Data Collator...Done


In [11]:
# generating model id
model_id = "model-" + datetime.datetime.now().strftime("%H%M%S")
print("Model will be saved as :", f"`{model_id}`")

Model will be saved as : `model-132744`


In [12]:
# setting up arguments for training
training_args = TrainingArguments(
    output_dir = PARENT_DIR + f"models/{model_id}",
    overwrite_output_dir = True,
    disable_tqdm = False,
    do_eval = True,
    do_train = True,
    do_predict = True,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    save_on_each_node = True,
    optim = "adamw_torch",
    report_to = "all",
    load_best_model_at_end = True,
    num_train_epochs = 5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenised_dataset["train"],
    eval_dataset=tokenised_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokeniser,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [13]:
# training model
train_output = trainer.train()

# saving model
print("Saving model...", end="")
trainer.save_model(PARENT_DIR + f"models/{model_id}/final")  
print("Done")

                                                    
 20%|██        | 225/1125 [31:07<2:05:44,  8.38s/it]

{'eval_loss': 0.36185622215270996, 'eval_runtime': 68.8416, 'eval_samples_per_second': 2.905, 'eval_steps_per_second': 0.363, 'epoch': 1.0}


                                                      
 40%|████      | 450/1125 [1:01:13<1:23:31,  7.42s/it]

{'eval_loss': 0.28516989946365356, 'eval_runtime': 67.5401, 'eval_samples_per_second': 2.961, 'eval_steps_per_second': 0.37, 'epoch': 2.0}


 44%|████▍     | 500/1125 [1:07:15<1:06:27,  6.38s/it]

{'loss': 0.7043, 'learning_rate': 2.777777777777778e-05, 'epoch': 2.22}


                                                      
 60%|██████    | 675/1125 [1:31:58<55:55,  7.46s/it]

{'eval_loss': 0.2631673812866211, 'eval_runtime': 69.0757, 'eval_samples_per_second': 2.895, 'eval_steps_per_second': 0.362, 'epoch': 3.0}


                                                      
 80%|████████  | 900/1125 [2:01:59<27:45,  7.40s/it]

{'eval_loss': 0.2520483732223511, 'eval_runtime': 67.2807, 'eval_samples_per_second': 2.973, 'eval_steps_per_second': 0.372, 'epoch': 4.0}


 89%|████████▉ | 1000/1125 [2:15:18<13:58,  6.71s/it] 

{'loss': 0.1862, 'learning_rate': 5.555555555555556e-06, 'epoch': 4.44}


                                                     
100%|██████████| 1125/1125 [2:31:58<00:00,  7.32s/it]

{'eval_loss': 0.2488832026720047, 'eval_runtime': 65.4016, 'eval_samples_per_second': 3.058, 'eval_steps_per_second': 0.382, 'epoch': 5.0}


100%|██████████| 1125/1125 [2:32:00<00:00,  8.11s/it]


{'train_runtime': 9120.6091, 'train_samples_per_second': 0.987, 'train_steps_per_second': 0.123, 'train_loss': 0.4142701212565104, 'epoch': 5.0}
Saving model...Done


In [16]:
# printing summary of model
data = {
    "global_step": train_output.global_step,
    "training_loss": train_output.training_loss,
    "metrics": train_output.metrics,
}

data["metrics"] = [(key, value) for key, value in train_output.metrics.items()]

print("== Training Completed ==")
for i, val in enumerate(data.items()):
    k, v = val
    if i < 2:
        print(" ".join([j.capitalize() for j in k.split("_")]), ":", round(v, 5))
    else:
        print(" ".join([j.capitalize() for j in k.split("_")]), ":")
        for f, s in v:
            print("  •", " ".join([j.replace("eval", "evaluation").capitalize() for j in f.split("_")]), ":", round(s, 5))

== Training Completed ==
Global Step : 1125
Training Loss : 0.41427
Metrics :
  • Train Runtime : 9120.6091
  • Train Samples Per Second : 0.987
  • Train Steps Per Second : 0.123
  • Train Loss : 0.41427
  • Epoch : 5.0
