In [1]:
import pandas as pd
import csv
from datasets import (
    Dataset, 
    load_metric,
)

import os

import tqdm
from dotenv import load_dotenv

import torch
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, 
    TrainingArguments, Trainer, 
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from transformers.utils import logging

import datetime
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [1]:
# Restrict the transformers logger to error-level messages.
# `logging` here is `transformers.utils.logging` (imported in the first cell), not the
# standard-library module, so the import cell has to be executed before this one.
logging.set_verbosity_error()

NameError: name 'logging' is not defined

In [2]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Release any cached CUDA allocations left over from earlier runs in this kernel.
torch.cuda.empty_cache()

# Display the selected device as the cell output.
device

device(type='cpu')

In [3]:
# Read variables from a local .env file into the process environment.
load_dotenv()

# Root directory of the project. Later cells build paths by plain string
# concatenation (`PARENT_DIR + "src/data/..."`), so the value must be a string
# that ends with a path separator.
PARENT_DIR = os.environ.get("PARENT_DIR")
if not PARENT_DIR:
    # Fail here with a clear message instead of a TypeError on the first
    # `PARENT_DIR + "..."` further down the notebook.
    raise EnvironmentError(
        "PARENT_DIR is not set; define it in the environment or in a .env file."
    )
if not PARENT_DIR.endswith(("/", "\\")):
    PARENT_DIR += "/"

# Ask transformers to suppress its advisory warnings.
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"

In [4]:
# Load the pretrained DialoGPT-small tokeniser and causal-LM weights, then
# place the model on the device selected earlier.
print("Loading model... ", end='', flush=True)
checkpoint = "microsoft/DialoGPT-small"
tokeniser = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)
model = model.to(device)
print('Done')

Loading model... Done


In [5]:
# raw_dataset = load_dataset("vicgalle/alpaca-gpt4", split="train")
# raw_dataset = raw_dataset.shuffle(seed=42).select(range(10))


def _read_qa_pairs(csv_path):
    """Read (question, answer) pairs from a two-column CSV file.

    Blank rows are ignored and the first non-blank row is treated as the
    header and skipped.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A list of ``(question, answer)`` string tuples in file order.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        ValueError: If a data row does not have exactly two columns.
    """
    pairs = []
    # newline="" is what the csv module expects so that quoted fields
    # containing line breaks are parsed correctly.
    with open(csv_path, "r", newline="") as file:
        reader = csv.reader(file)
        header_skipped = False
        for row in reader:
            if not row:
                continue
            if not header_skipped:
                header_skipped = True
                continue
            if len(row) != 2:
                raise ValueError(
                    f"{csv_path}, line {reader.line_num}: "
                    f"expected 2 columns, got {len(row)}"
                )
            pairs.append((row[0], row[1]))
    return pairs


# Base data first, augmented data second, so the ordering of the rows is the
# same as when the two files were read one after the other.
qa_pairs = []
for relative_path in ("src/data/base_data.csv", "src/data/augmented_data.csv"):
    qa_pairs.extend(_read_qa_pairs(PARENT_DIR + relative_path))

# NOTE(review): only the first five pairs are kept, which looks like a
# debugging limit -- confirm before running a real training job.
question = [q for q, _ in qa_pairs][:5]
answer = [a for _, a in qa_pairs][:5]

raw_dataset = Dataset.from_dict({"Question" : question, "Answer" : answer})

raw_dataset

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/chong/Desktop/Coding/GitHub/The-Orientator-PW-2023/src/data/augmented_data.csv'

In [None]:
# Turn every question/answer pair into a single training string of the form
# "<question><eos><answer><eos>".
eos = tokeniser.eos_token
temp_lst = [
    {"text": row["Question"].strip() + eos + row["Answer"].strip() + eos}
    for row in tqdm.tqdm(raw_dataset, desc="Re-formatting dataset", unit=" rows")
]

temp_df = pd.DataFrame(temp_lst, columns=["text"])
# dropna() returns a new frame, so the result has to be assigned for the call
# to have any effect. The index is reset so that Dataset.from_pandas does not
# carry a non-contiguous index over as an extra column.
temp_df = temp_df.dropna().reset_index(drop=True)

processed_dataset = Dataset.from_pandas(temp_df)
processed_dataset

In [None]:
# Reuse the end-of-sequence token as the padding token
# (presumably because this tokeniser defines no pad token -- verify).
tokeniser.pad_token = tokeniser.eos_token

def preprocess(example):
    """Tokenise the "text" field of a single dataset row.

    Args:
        example: A mapping with a "text" entry holding the string to encode.

    Returns:
        The tokeniser output for that string.
    """
    # NOTE(review): map() below is not batched, so each row is tokenised on
    # its own and padding=True presumably adds no padding -- confirm.
    return tokeniser(example["text"], padding=True, truncation=True)

tokenised_dataset = processed_dataset.map(preprocess)

# Keep only the tensors the model consumes.
tokenised_dataset = tokenised_dataset.remove_columns(["text"])
tokenised_dataset = tokenised_dataset.with_format("torch", columns=["input_ids", "attention_mask"])
# A fixed seed makes the train/test partition identical across kernel restarts,
# so evaluation numbers from different runs are comparable.
tokenised_dataset = tokenised_dataset.train_test_split(test_size=0.1, seed=42)
tokenised_dataset

In [None]:
# Build the batch collator for language modelling; mlm=False turns off
# masked-language-model masking.
print("Creating Data Collator...", end="")
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokeniser,
    mlm=False,
)
print("Done")

In [None]:
# Obtain the "accuracy" metric object through `datasets.load_metric`.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package -- confirm against the
# installed version.
metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    """Compute prediction accuracy from raw logits and reference labels.

    The accuracy is calculated with NumPy only. Two input layouts are handled:

    * Logits with three or more dimensions (e.g. ``(batch, seq, vocab)`` from a
      causal language model): the arg-max over the last axis is taken, and the
      prediction at position ``t`` is compared with the label at ``t + 1``,
      because a causal LM predicts the *next* token.
    * Logits with fewer dimensions (e.g. ``(batch, classes)``): the arg-max
      over the last axis is compared with the labels directly.

    Label positions equal to ``-100`` are excluded from the score.

    Args:
        eval_pred: A ``(predictions, labels)`` pair (anything that unpacks into
            two items). ``predictions`` may be an array or a tuple whose first
            element is the logits array.

    Returns:
        ``{"accuracy": float}``; ``0.0`` when no label position is scored.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs next to the logits.
        predictions = predictions[0]

    logits = np.asarray(predictions)
    labels = np.asarray(labels)

    # The class / vocabulary dimension is always the last one.
    predicted_ids = logits.argmax(axis=-1)

    if logits.ndim >= 3:
        # Align "prediction made at t" with "token observed at t + 1".
        predicted_ids = predicted_ids[..., :-1]
        labels = labels[..., 1:]

    scored = labels != -100
    total = int(scored.sum())
    if total == 0:
        return {"accuracy": 0.0}

    correct = int((predicted_ids[scored] == labels[scored]).sum())
    return {"accuracy": correct / total}

In [None]:
# Name this run after the current wall-clock time (hours, minutes, seconds only).
run_stamp = datetime.datetime.now().strftime("%H%M%S")
model_id = "model-" + run_stamp
print("Model will be saved as :", "`" + model_id + "`")

In [None]:
# Options for the fine-tuning run: evaluate and checkpoint once per epoch,
# write everything under models/<model_id>, and reload the best checkpoint
# when training finishes.
training_args = TrainingArguments(
    do_train = True,
    do_eval = True,
    do_predict = True,
    output_dir = PARENT_DIR + f"models/{model_id}",
    overwrite_output_dir = True,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    save_on_each_node = True,
    optim = "adamw_torch",
    report_to = "all",
    load_best_model_at_end = True,
)

# `compute_metrics` is a Trainer argument, not a TrainingArguments field;
# passing it to TrainingArguments raises a TypeError for an unexpected keyword.
# NOTE(review): with compute_metrics set, the Trainer presumably keeps the
# logits of the whole evaluation set in memory -- keep an eye on this if the
# evaluation split grows.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenised_dataset["train"],
    eval_dataset=tokenised_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokeniser,
    compute_metrics=compute_metrics,
    # Stop once the monitored metric has not improved for 3 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
def _humanise(key):
    """Turn a snake_case key into capitalised words, spelling out "eval".

    Example: "eval_loss" -> "Evaluation Loss", "global_step" -> "Global Step".
    """
    words = ("evaluation" if word == "eval" else word for word in key.split("_"))
    return " ".join(word.capitalize() for word in words)


def _format_value(value):
    """Round numeric values to 5 decimal places; return anything else unchanged."""
    return round(value, 5) if isinstance(value, (int, float)) else value


# Run the fine-tuning loop and collect the headline numbers it reports.
train_output = trainer.train()
data = {
    "global_step": train_output.global_step,
    "training_loss": train_output.training_loss,
    # Stored as a list of (name, value) pairs.
    "metrics": list(train_output.metrics.items()),
}

print("== Training Completed ==")
for key, value in data.items():
    if key == "metrics":
        # Nested section: one bullet per metric, showing both name and value.
        print(_humanise(key), ":")
        for metric_name, metric_value in value:
            print("\t\u2022", _humanise(metric_name), ":", _format_value(metric_value))
    else:
        print(_humanise(key), ":", _format_value(value))

In [None]:
# Write the trained model to the "final" sub-directory of this run's
# output folder.
print("Saving model...", end="")
final_model_dir = PARENT_DIR + f"models/{model_id}/final"
trainer.save_model(final_model_dir)
print("Done")