# Setup the Environment

In [3]:
!pip install accelerate transformers[torch] datasets pandas

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Us

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

In [2]:

df = pd.read_csv('personality.csv')

df['text'] = 'Persona: ' + df['Persona'].astype(str) + ' Chat: ' + df['chat'].astype(str)

# Create a Hugging Face Dataset from the DataFrame
dataset = Dataset.from_pandas(df[['text']])


# Load the GPT-2 Tokenizer

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set the padding token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Tokenize the Dataset

In [4]:
def tokenize_function(examples):
    tokens = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)
    tokens['labels'] = tokens['input_ids'].copy()
    return tokens

# Apply the tokenization function to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5860 [00:00<?, ? examples/s]

In [5]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the padding token ID in the model configuration
model.config.pad_token_id = tokenizer.pad_token_id

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Define Training Arguments

In [6]:
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

Create a Trainer Instance

In [7]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

In [8]:
trainer.train()

Step,Training Loss
500,1.386
1000,1.2813
1500,1.251
2000,1.1961
2500,1.1702
3000,1.1636
3500,1.1319
4000,1.1299


TrainOutput(global_step=4395, training_loss=1.2074559477544617, metrics={'train_runtime': 2738.0897, 'train_samples_per_second': 6.421, 'train_steps_per_second': 1.605, 'total_flos': 4593513922560000.0, 'train_loss': 1.2074559477544617, 'epoch': 3.0})

Save the Fine-Tuned Model and Tokenizer

In [9]:
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json')

Create the Chatbot Interface

In [10]:
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_model')

# Create a text generation pipeline
chatbot = pipeline('text-generation', model=model, tokenizer=tokenizer)

def chat_with_bot():
    print("Welcome to the chatbot! Type 'exit' to end the conversation.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'exit':
            print("Bot: Goodbye!")
            break
        # Generate a response
        response = chatbot("Persona: Your persona here. Chat: " + user_input, max_length=150, num_return_sequences=1)
        print("Bot:", response[0]['generated_text'])

if __name__ == "__main__":
    chat_with_bot()

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Welcome to the chatbot! Type 'exit' to end the conversation.
You: Kalyan


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Bot: Persona: Your persona here. Chat: Kalyan tiger is my fave character i love anime with anime
wow. i love the tigers. i like to watch anime with them
i love the tigers too, but they are more interesting when it comes to character animation.
they are nice, but i love other people too
oh ok then i have to put up with kalyan tigers.
are you still a fan of your favorite show?
no not really, but i have been watching them alot of times.
what do you do for a living?
i am still in school to become a writer.
that is amazing, my career is pretty much going for the dollar
yes its something i am doing
You: how are you?
Bot: Persona: Your persona here. Chat: how are you?
i am alright. sorry to hear that.
i feel bad doing that. i have to keep my eyes on the busy life i just started.
why not? the only job i have is to make cars and not work
i have a friend, and she wants a career.
oh, i enjoy it, i love it. i work in an office.
i like cars that are easy to drive.
that is true. there are a lot of f

KeyboardInterrupt: Interrupted by user

In [11]:
chat_with_bot()

Welcome to the chatbot! Type 'exit' to end the conversation.
You: how are you?
Bot: Persona: Your persona here. Chat: how are you?
not good and you?
not bad. i am very tired. you?
i am just fine.
you do not need to be tired for that.
just keep your mind right. when you get tired, you can make a play.
i do enjoy that. i do not play sports much.
i know, what hobbies do you have?
i enjoy reading. i live near a lot of authors. the author is my favorite.
i really love reading. do you read at all?
sometimes, but only for the first month. you?
i prefer reading to doing a lot of other things.
really? what about music?
You: exit
Bot: Goodbye!
