In [11]:
import pandas as pd
import csv
from datasets import (
    Dataset, 
    load_dataset,
)

import os

import tqdm
from dotenv import load_dotenv

import torch
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, 
    TrainingArguments, Trainer, 
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)



In [12]:
# Pick the compute device once: the GPU when PyTorch can see one, the CPU otherwise.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

# Ask PyTorch to hand back cached GPU memory that is no longer in use
# (useful when this cell is re-run in a long-lived kernel).
torch.cuda.empty_cache()

# Display the chosen device as the cell output.
device

device(type='cpu')

In [13]:
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Project root directory; the data file and the model output directory are
# resolved relative to it in later cells.
PARENT_DIR = os.environ.get("PARENT_DIR")

# Fail here with a clear message instead of letting a later path concatenation
# blow up with "unsupported operand type(s) for +: 'NoneType' and 'str'".
if not PARENT_DIR:
    raise RuntimeError(
        "PARENT_DIR is not set. Define it in the environment or in a .env file "
        "before running this notebook."
    )

In [14]:
# Hugging Face Hub id shared by the tokeniser and the model weights.
MODEL_NAME = "microsoft/DialoGPT-small"

# The progress message is printed without a newline so "Done" lands on the same line.
print("Loading model... ", end='', flush=True)
tokeniser = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load the causal-LM weights and move them to the device selected earlier.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
print('Done')

Loading model... 

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 16fa366c-2613-4bcb-b4f0-0d1bc1a8c06d)')' thrown while requesting HEAD https://huggingface.co/microsoft/DialoGPT-small/resolve/main/tokenizer_config.json


Done


In [15]:
# Alternative public dataset, kept for reference:
# raw_dataset = load_dataset("vicgalle/alpaca-gpt4", split="train")
# raw_dataset = raw_dataset.shuffle(seed=42).select(range(10))

# Load question/answer pairs from <PARENT_DIR>/data.csv.
# The first row is treated as a header and skipped; every following row must
# contain exactly two fields: the question and the answer.
question = []
answer = []

# newline="" is what the csv module expects, so that quoted fields containing
# line breaks are parsed correctly.
with open(os.path.join(PARENT_DIR, "data.csv"), "r", newline="") as file:
    csvreader = csv.reader(file)
    next(csvreader, None)  # skip the header row (tolerates an empty file)
    for row in csvreader:
        if not row:
            # csv.reader yields [] for blank lines; ignore them instead of crashing.
            continue
        if len(row) != 2:
            raise ValueError(
                f"data.csv line {csvreader.line_num}: expected 2 columns "
                f"(question, answer), got {len(row)}"
            )
        q, a = row
        question.append(q)
        answer.append(a)

raw_dataset = Dataset.from_dict({"Question": question, "Answer": answer})

raw_dataset

Dataset({
    features: ['Question', 'Answer'],
    num_rows: 110
})

In [16]:
# Turn every question/answer pair into one training string of the form
#   <question><EOS><answer><EOS>
# with surrounding whitespace stripped from both parts.
eos = tokeniser.eos_token
temp_lst = [
    {"text": row["Question"].strip() + eos + row["Answer"].strip() + eos}
    for row in tqdm.tqdm(raw_dataset, desc="Re-formatting dataset", unit=" rows", leave=False)
]

temp_df = pd.DataFrame(temp_lst, columns=["text"])
# dropna() returns a new frame, so the result has to be assigned; previously it
# was discarded and the call had no effect. The index is reset so that
# Dataset.from_pandas does not carry a non-default index over as an extra column.
temp_df = temp_df.dropna().reset_index(drop=True)

processed_dataset = Dataset.from_pandas(temp_df)
processed_dataset

                                                                 

Dataset({
    features: ['text'],
    num_rows: 110
})

In [17]:
# Reuse the EOS token as the padding token so that batches can be padded.
tokeniser.pad_token = tokeniser.eos_token

# Fixed seed so the train/test partition (and therefore the eval loss, early
# stopping and best-checkpoint selection) is the same on every run.
SPLIT_SEED = 42


def preprocess(example):
    """Tokenise the ``text`` field of a single dataset row.

    Sequences longer than the tokeniser's maximum length are truncated.
    No padding is requested here: ``map`` calls this once per row, so there is
    nothing to pad against, and the data collator pads each batch at training
    time.

    Args:
        example: one row of ``processed_dataset`` with a ``"text"`` string.

    Returns:
        The tokeniser output for that row (``input_ids`` and ``attention_mask``
        are the fields used downstream).
    """
    return tokeniser(example["text"], truncation=True)


tokenised_dataset = (
    processed_dataset
    .map(preprocess)
    .remove_columns(["text"])
    .with_format("torch", columns=["input_ids", "attention_mask"])
    # Hold out 10% of the rows for evaluation, reproducibly.
    .train_test_split(test_size=0.1, seed=SPLIT_SEED)
)
tokenised_dataset

Map: 100%|██████████| 110/110 [00:00<00:00, 1868.66 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 99
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 11
    })
})

In [18]:
# Build the batch collator for causal (non-masked) language modelling.
# NOTE(review): the pad token was set to the EOS token earlier in this notebook.
# This collator is understood to exclude pad-token positions from the loss, which
# would also exclude the real EOS separators - confirm against the installed
# transformers version and consider a dedicated pad token if so.
print("Creating Data Collator...", end="")
data_collator = DataCollatorForLanguageModeling(tokenizer=tokeniser, mlm=False)
print("Done")

Creating Data Collator...Done


In [19]:
# Training configuration: evaluate and checkpoint once per epoch, and reload the
# best checkpoint at the end of training.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before upgrading.
training_args = TrainingArguments(
    # os.path.join inserts the separator that plain concatenation omitted
    # (PARENT_DIR + "models/test-model" only works if PARENT_DIR ends with "/").
    output_dir = os.path.join(PARENT_DIR, "models", "test-model"),
    overwrite_output_dir = True,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    save_on_each_node = True,
    optim = "adamw_torch",
    report_to = "all",
    num_train_epochs = 10,
    load_best_model_at_end = True,
)

# Trainer wiring: the 90/10 split produced earlier supplies the train and eval
# sets; training stops early after 3 evaluations without improvement.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenised_dataset["train"],
    eval_dataset=tokenised_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokeniser,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [20]:
# Run fine-tuning; per-epoch evaluation metrics are logged as training proceeds.
trainer.train()

# Save the final model in a "final" sub-directory of the configured output
# directory. Deriving the path from training_args keeps it in step with where
# the checkpoints are written instead of rebuilding the path by hand.
final_model_dir = os.path.join(training_args.output_dir, "final")
trainer.save_model(final_model_dir)

  0%|          | 0/130 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                
 10%|█         | 13/130 [01:03<07:56,  4.08s/it]

{'eval_loss': 5.1331562995910645, 'eval_runtime': 2.2003, 'eval_samples_per_second': 4.999, 'eval_steps_per_second': 0.909, 'epoch': 1.0}


                                                
 20%|██        | 26/130 [02:23<08:16,  4.77s/it]

{'eval_loss': 3.4117238521575928, 'eval_runtime': 2.6797, 'eval_samples_per_second': 4.105, 'eval_steps_per_second': 0.746, 'epoch': 2.0}


                                                
 30%|███       | 39/130 [04:01<09:50,  6.48s/it]

{'eval_loss': 2.5479893684387207, 'eval_runtime': 5.0957, 'eval_samples_per_second': 2.159, 'eval_steps_per_second': 0.392, 'epoch': 3.0}


                                                
 40%|████      | 52/130 [05:49<08:29,  6.53s/it]

{'eval_loss': 1.9792424440383911, 'eval_runtime': 2.8946, 'eval_samples_per_second': 3.8, 'eval_steps_per_second': 0.691, 'epoch': 4.0}


                                                
 50%|█████     | 65/130 [07:21<06:37,  6.12s/it]

{'eval_loss': 1.6881276369094849, 'eval_runtime': 2.949, 'eval_samples_per_second': 3.73, 'eval_steps_per_second': 0.678, 'epoch': 5.0}


                                                
 60%|██████    | 78/130 [09:15<05:35,  6.46s/it]

{'eval_loss': 1.5800838470458984, 'eval_runtime': 3.546, 'eval_samples_per_second': 3.102, 'eval_steps_per_second': 0.564, 'epoch': 6.0}


                                                
 70%|███████   | 91/130 [11:58<03:51,  5.93s/it]

{'eval_loss': 1.5237339735031128, 'eval_runtime': 2.9278, 'eval_samples_per_second': 3.757, 'eval_steps_per_second': 0.683, 'epoch': 7.0}


                                                 
 80%|████████  | 104/130 [13:35<02:59,  6.91s/it]

{'eval_loss': 1.4575823545455933, 'eval_runtime': 2.9554, 'eval_samples_per_second': 3.722, 'eval_steps_per_second': 0.677, 'epoch': 8.0}


                                                 
 90%|█████████ | 117/130 [15:33<01:28,  6.79s/it]

{'eval_loss': 1.4355829954147339, 'eval_runtime': 3.3556, 'eval_samples_per_second': 3.278, 'eval_steps_per_second': 0.596, 'epoch': 9.0}


                                                 
100%|██████████| 130/130 [17:47<00:00,  5.17s/it]

{'eval_loss': 1.4260185956954956, 'eval_runtime': 2.8516, 'eval_samples_per_second': 3.858, 'eval_steps_per_second': 0.701, 'epoch': 10.0}


100%|██████████| 130/130 [17:49<00:00,  8.23s/it]


{'train_runtime': 1069.6818, 'train_samples_per_second': 0.926, 'train_steps_per_second': 0.122, 'train_loss': 2.2059990516075723, 'epoch': 10.0}
