In [1]:
!pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any

In [2]:
from google.colab import drive
drive.mount('/content/drive')

# Define the path to your dataset in Google Drive
file_path = '/content/drive/MyDrive/dialogs.txt'


Mounted at /content/drive


In [3]:
# Load the dataset (adjust the separator based on your dataset's structure)
dialogs = []
with open(file_path, 'r') as file:
    for line in file:
        parts = line.strip().split("\t")  # Splitting by tabs if input and response are tab-separated
        if len(parts) == 2:  # Ensure the line has both input and response
            dialogs.append({'input': parts[0], 'response': parts[1]})

# Convert to a DataFrame-like structure for easy handling
import pandas as pd
dialogs_df = pd.DataFrame(dialogs)

# Preview the dataset
print(dialogs_df.head())


                                 input  \
0               hi, how are you doing?   
1        i'm fine. how about yourself?   
2  i'm pretty good. thanks for asking.   
3    no problem. so how have you been?   
4     i've been great. what about you?   

                                   response  
0             i'm fine. how about yourself?  
1       i'm pretty good. thanks for asking.  
2         no problem. so how have you been?  
3          i've been great. what about you?  
4  i've been good. i'm in school right now.  


In [4]:
from transformers import GPT2Tokenizer

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add special tokens for start of sentence, end of sentence, and padding
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'eos_token': '[EOS]', 'bos_token': '[BOS]'})

# Preprocess function to tokenize the input and response pairs
def preprocess_data(data):
    inputs = [f"[BOS] {input} [EOS]" for input in data['input']]
    responses = [f"[BOS] {response} [EOS]" for response in data['response']]

    # Tokenize input and response
    input_tokens = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True, max_length=512)
    response_tokens = tokenizer(responses, return_tensors="pt", padding=True, truncation=True, max_length=512)

    return input_tokens, response_tokens

# Preprocess the dialogues
input_tokens, response_tokens = preprocess_data(dialogs_df)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



In [5]:
import torch
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

# Load GPT-2 model with language modeling head
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))  # Resize embeddings to accommodate added tokens

# Prepare dataset for training
class DialogDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

# Create the dataset object
dataset = DialogDataset(input_tokens['input_ids'], input_tokens['attention_mask'], response_tokens['input_ids'])

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs',
    save_steps=500,
    evaluation_strategy="steps"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset  # In a real project, split train and eval datasets
)

# Train the model
trainer.train()


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Step,Training Loss,Validation Loss
500,3.548,2.412985
1000,2.4424,2.351411
1500,2.4375,2.286484
2000,2.3621,2.253518
2500,2.29,2.222746
3000,2.277,2.200047
3500,2.2409,2.179133
4000,2.265,2.143562
4500,2.1817,2.128317
5000,2.1659,2.13439


TrainOutput(global_step=5589, training_loss=2.3956131033318098, metrics={'train_runtime': 20714.5307, 'train_samples_per_second': 0.539, 'train_steps_per_second': 0.27, 'total_flos': 148278124800000.0, 'train_loss': 2.3956131033318098, 'epoch': 3.0})

In [6]:
# Function to generate a response
def generate_response(input_text):
    model.eval()
    input_ids = tokenizer(f"[BOS] {input_text} [EOS]", return_tensors='pt').input_ids
    response_ids = model.generate(input_ids, max_length=100, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(response_ids[0], skip_special_tokens=True)

# Test the chatbot
test_input = "How are you doing?"
print(generate_response(test_input))


 How are you doing?.   


In [7]:
# Save the model to Google Drive
model_save_path = '/content/drive/MyDrive/path_to_save_model/gpt2_finetuned'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)


('/content/drive/MyDrive/path_to_save_model/gpt2_finetuned/tokenizer_config.json',
 '/content/drive/MyDrive/path_to_save_model/gpt2_finetuned/special_tokens_map.json',
 '/content/drive/MyDrive/path_to_save_model/gpt2_finetuned/vocab.json',
 '/content/drive/MyDrive/path_to_save_model/gpt2_finetuned/merges.txt',
 '/content/drive/MyDrive/path_to_save_model/gpt2_finetuned/added_tokens.json')