In [17]:
import pandas as pd
import os
import numpy as np 
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import re

In [18]:
pd.set_option('display.max_colwidth', None)

data = pd.read_csv('/kaggle/input/dataset-for-dialogpt/output.csv')

print(data.shape)

data.head()

(101034, 2)


Unnamed: 0,Input,Output
0,I would love to try the local food with my friend. i am quiet but confident. I love to watch movies with my dad on a rainy day. i try to limit how much i eat. I just finished practicing my bass guitar in the lifeguard station. Hey there!,What's your name?
1,I would love to try the local food with my friend. i am quiet but confident. I love to watch movies with my dad on a rainy day. i try to limit how much i eat. I just finished practicing my bass guitar in the lifeguard station. My name is William.,Nice to meet you Gavin. What kind of movies do you like to watch?
2,I would love to try the local food with my friend. i am quiet but confident. I love to watch movies with my dad on a rainy day. i try to limit how much i eat. I just finished practicing my bass guitar in the lifeguard station. I like to watch movies that make me feel something. I like to be able to relate to the characters and feel their emotions.,I can relate to that. I like to watch movies that make me think about things in a different way.
3,I would love to try the local food with my friend. i am quiet but confident. I love to watch movies with my dad on a rainy day. i try to limit how much i eat. I just finished practicing my bass guitar in the lifeguard station. That's a good way to put it. I like to be challenged by movies.,What are some of your favorite movies?
4,"I would love to try the local food with my friend. i am quiet but confident. I love to watch movies with my dad on a rainy day. i try to limit how much i eat. I just finished practicing my bass guitar in the lifeguard station. I have a lot of favorite movies, but some of my favorites include ""The Shawshank Redemption,"" ""The Godfather,"" and ""The Lord of the Rings.""","Those are all great movies! I love ""The Shawshank Redemption"" too. It's one of my favorites."


In [19]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")

tokenizer.pad_token = tokenizer.eos_token

In [20]:
from datasets import Dataset

X = []
Y = []

def tokenize(text_in, text_out):
    inputs = tokenizer(text_in, padding="max_length", truncation=True, return_tensors="pt", max_length=256)
    labels = tokenizer(text_out, padding="max_length", truncation=True, return_tensors="pt", max_length=256)["input_ids"]
    
    X.append({
        'input_ids': inputs["input_ids"].squeeze(0),
        'attention_mask': inputs["attention_mask"].squeeze(0)
    })
    Y.append(labels.squeeze(0))

for index, row in data.iterrows():
    tokenize(row['Input'], row['Output'])
    if len(X) == 10000 and len(Y) == 10000:  
        break  

In [21]:
dataset = Dataset.from_dict({
    "input_ids": [x["input_ids"].clone().detach().tolist() for x in X],  # Convert tensors to lists
    "attention_mask": [x["attention_mask"].clone().detach().tolist() for x in X],
    "labels": [y.clone().detach().tolist() for y in Y]
})

In [22]:
split_dataset = dataset.train_test_split(test_size=0.2, shuffle = True)

train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

In [23]:
# Check lengths
print(f"Train dataset length: {len(train_dataset)}")
print(f"Validation dataset length: {len(val_dataset)}")

# Print a sample to verify correctness
# print(f"Train dataset sample: {train_dataset[0]}")
# print(f"Validation dataset sample: {val_dataset[0]}")

Train dataset length: 8000
Validation dataset length: 2000


In [24]:
training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    logging_dir='/kaggle/working/logs',
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    dataloader_num_workers=0,  
    fp16=True,
    debug="underflow_overflow"
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  
    eval_dataset=val_dataset, 
    tokenizer=tokenizer
)

  trainer = Trainer(


In [26]:
import torch
print(torch.cuda.is_available())  
print(torch.cuda.get_device_name(0))  

True
Tesla P100-PCIE-16GB


In [27]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [28]:
!nvidia-smi

Fri Jan 17 16:30:46 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P0             31W /  250W |    1319MiB /  16384MiB |     17%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [29]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.388,0.370396
2,0.3381,0.363829
3,0.3687,0.361726
4,0.3951,0.362372
5,0.3755,0.360235


KeyboardInterrupt: 

In [30]:
torch.save(model.state_dict(), "model_weights.pth")

In [31]:
pip install huggingface_hub


Note: you may need to restart the kernel to use updated packages.


In [36]:
from huggingface_hub import login

login()  # This will prompt you to enter your token


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:
from huggingface_hub import create_repo

repo_name = "persona_training"
create_repo(repo_name, exist_ok=True)

RepoUrl('https://huggingface.co/AbhitulyaHF/persona_training', endpoint='https://huggingface.co', repo_type='model', repo_id='AbhitulyaHF/persona_training')

In [40]:
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="/kaggle/working/",
    repo_id="AbhitulyaHF/persona_training"
)


events.out.tfevents.1737130838.e96ad5de7618.31.0:   0%|          | 0.00/5.29k [00:00<?, ?B/s]

events.out.tfevents.1737131447.e96ad5de7618.31.1:   0%|          | 0.00/60.2k [00:00<?, ?B/s]

model_weights.pth:   0%|          | 0.00/498M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Upload 13 LFS files:   0%|          | 0/13 [00:00<?, ?it/s]

optimizer.pt:   0%|          | 0.00/996M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/996M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AbhitulyaHF/persona_training/commit/331a0365c22e01bcf1d4ac672c6909bf8ef95033', commit_message='Upload folder using huggingface_hub', commit_description='', oid='331a0365c22e01bcf1d4ac672c6909bf8ef95033', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AbhitulyaHF/persona_training', endpoint='https://huggingface.co', repo_type='model', repo_id='AbhitulyaHF/persona_training'), pr_revision=None, pr_num=None)

In [43]:
prompt = input()

input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()

# inference
with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids, 
        max_new_tokens=50, 
        do_sample=True, 
        top_p=0.1,
        temperature=0.7
    )

outputs = outputs.detach().cpu().numpy()
outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
output = outputs[0][len(prompt):]
print(output)

 Person B has the following Persona information.  Persona of Person B: My name is David and I'm a 35 year old math teacher. Persona of Person B: I like to hike and spend time in the nature. Persona of Person B: I'm married with two kids.  Instruct: Person A and Person B are now having a conversation.  Following the conversation below, write a response that Person B would say base on the above Persona information.  Please carefully consider the flow and context of the conversation below, and use the Person B's Persona information appropriately to generate a response that you think are  the most appropriate replying for Person B.  Persona A: Morning! I think I saw you at the parent meeting, what's your name?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
