In [None]:
!pip install datasets



In [None]:
import json
from datasets import Dataset
from transformers import Trainer, TrainingArguments, GPT2Tokenizer, AutoModelForCausalLM, AutoTokenizer

# Load the pre-tokenized conversations from disk.
with open("preprocessed_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Checkpoint used for BOTH the tokenizer and the model, so the two can never
# get out of sync.
#   - "gpt2-medium": start fine-tuning from the public pretrained weights.
#   - "./gpt2-finetuned-movie-dialog": resume from weights saved by an earlier
#     run of the training cell.
MODEL_CHECKPOINT = "gpt2-medium"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(MODEL_CHECKPOINT)

# Turn every conversation into one causal-LM training example.
def _as_training_example(token_ids):
    """Wrap a flat list of token ids in the dict layout the Trainer consumes."""
    return {
        "input_ids": token_ids,
        # Every position is marked as a real token (no padding is masked out).
        "attention_mask": [1] * len(token_ids),
        # For causal-LM fine-tuning the labels mirror input_ids; the model
        # shifts them internally to compute the next-token loss.
        "labels": token_ids,
    }


# Each entry of `data` is indexed as [0] = prompt ids, [1] = response ids;
# the two are joined with the tokenizer's EOS id acting as the separator.
# NOTE(review): if preprocessed_data.json already contains padding ids, they
# are attended to and trained on like ordinary tokens here -- TODO confirm.
formatted_data = [
    _as_training_example(pair[0] + [tokenizer.eos_token_id] + pair[1])
    for pair in data
]

# Convert the list of example dicts to a Hugging Face Dataset.
dataset = Dataset.from_list(formatted_data)

# Hold out 5% of the examples for validation (95% train / 5% validation).
# A fixed seed keeps the split identical across kernel restarts; without it,
# resuming training from saved weights would draw a new validation set that
# overlaps with examples the model has already been trained on.
dataset_split = dataset.train_test_split(test_size=0.05, seed=42)

# Separate the train and validation datasets
train_dataset = dataset_split["train"]
val_dataset = dataset_split["test"]
print(train_dataset[0])  # Verify the format



{'input_ids': [568, 545, 407, 1016, 284, 1487, 534, 12046, 319, 262, 826, 780, 345, 260, 5543, 3376, 262, 826, 389, 32627, 16196, 290, 22388, 1128, 34718, 2592, 618, 2045, 379, 262, 31082, 1312, 765, 284, 11508, 345, 326, 428, 318, 2081, 286, 1111, 5389, 329, 262, 6573, 7515, 262, 1364, 468, 7953, 287, 7619, 870, 661, 4395, 661, 3503, 612, 743, 407, 307, 867, 7040, 475, 407, 422, 3092, 286, 2111, 611, 428, 318, 588, 15774, 661, 319, 17044, 290, 256, 15566, 1312, 17666, 18869, 766, 534, 17044, 393, 256, 15566, 21318, 355, 329, 262, 5654, 7515, 257, 1256, 286, 7272, 50256, 5562, 11543, 4519, 318, 12361, 475, 703, 318, 326, 5884, 284, 262, 1364, 698, 76, 373, 429, 2950, 262, 4395, 286, 1644, 3790, 373, 257, 13574, 475, 262, 30091, 2950, 14005, 287, 7208, 284, 644, 262, 3146, 286, 24270, 15102, 852, 2823, 416, 1644, 423, 1716, 355, 329, 1034, 11392, 345, 24628, 607, 329, 33064, 4234, 9056, 475, 19997, 318, 852, 33064, 4234, 826, 783, 290, 339, 373, 7018, 1893, 1034, 11392, 318, 407, 772, 3

In [None]:
import shutil
from google.colab import drive
from transformers import pipeline

# Fine-tune for NUM_EPOCHS single-epoch rounds. After every round the model is
# saved locally, zipped, copied to Google Drive, and sampled once so progress
# survives a Colab disconnect and can be eyeballed in the cell output.
drive.mount('/content/drive')

NUM_EPOCHS = 10
MODEL_DIR = "./gpt2-finetuned-movie-dialog"
DRIVE_DIR = "/content/drive/MyDrive/"

# The arguments are identical for every round, so build them once.
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=1,  # one epoch per Trainer run; the outer loop repeats it
    per_device_train_batch_size=32,  # Further reduced batch size if needed
    save_steps=300,
    save_total_limit=2,
    logging_dir="./logs",
    eval_strategy="steps",  # Evaluate every 'eval_steps'
    eval_steps=300,  # Evaluate every __ steps
    logging_steps=300,  # Log training and validation loss every __ steps
)

for epoch in range(NUM_EPOCHS):
    # A fresh Trainer per round (as before), so the optimizer and learning-rate
    # schedule restart each epoch while `model` keeps its updated weights.
    trainer = Trainer(
        model=model,
        args=training_args,
        # Train on the training split only. Passing the full `dataset` here
        # would include the validation examples and make the reported
        # validation loss meaningless.
        train_dataset=train_dataset,
        eval_dataset=val_dataset,  # Validation dataset
    )

    # Start training
    trainer.train()

    model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)

    # Zip the directory and copy the archive to Drive. shutil.copy raises on
    # failure, so the success message below is only printed if the copy worked.
    archive_path = shutil.make_archive('gpt2-finetuned-movie-dialog', 'zip', MODEL_DIR)
    shutil.copy(archive_path, DRIVE_DIR)
    print(f"Epoch: {epoch} saved to drive")

    # Load the fine-tuned model back from disk (also checks the saved files load).
    generator = pipeline("text-generation", model=MODEL_DIR, tokenizer=tokenizer)

    # Test the model
    response = generator(
        "You are a friendly chatbot specializing in movie dialog. \nUser: What's your favorite movie? \nChatBot: ",
        max_length=200,
        num_return_sequences=1,
        min_new_tokens=5,
    )
    print(response[0]['generated_text'])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Step,Training Loss,Validation Loss
300,2.3861,2.1826
600,2.2228,2.099441
900,2.1585,2.04674
1200,2.1366,2.012127
1500,2.1063,1.98236
1800,2.0928,1.964738
2100,2.0797,1.95573


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 0 saved to drive
You are a friendly chatbot specializing in movie dialog. 
User: What's your favorite movie? 
ChatBot: ive watched a few i think there are some movies that i would definitely feel good about watching but youve already answered this question your choices are pretty obvious anyway gt i thought the movie was about a man who has a difficult time getting past his mental issues by having the support of his wife and kid gtbut it ends up seeming like a pretty shallow movie especially


Step,Training Loss,Validation Loss
300,1.9824,1.940645
600,1.9429,1.908252
900,1.9274,1.878151
1200,1.9439,1.854815
1500,1.9476,1.836862
1800,1.9618,1.824348
2100,1.9743,1.817998


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 1 saved to drive
You are a friendly chatbot specializing in movie dialog. 
User: What's your favorite movie? 
ChatBot: ive watched a lot i think there are some movies that i would definitely feel very comfortable with seeing when i am not a big movie fan but none of them would feel the same with my wife over!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


Step,Training Loss,Validation Loss
300,1.7397,1.820507
600,1.7258,1.800269
900,1.7378,1.769617
1200,1.7847,1.747262
1500,1.8188,1.729788
1800,1.8629,1.719302
2100,1.903,1.715846


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 2 saved to drive
You are a friendly chatbot specializing in movie dialog. 
User: What's your favorite movie? 
ChatBot: ive watched a lot i think there are some movies that i would personally feel very uncomfortable with seeing when i am not a fan of the subject matter also i dont think the same could be said for most of the medium nowadays people are too busy talking to look for any sort of entertainment while consuming the entertainment that they are consuming this is not a big problem in the digital


Step,Training Loss,Validation Loss
300,1.4855,1.746999
600,1.4984,1.742814
900,1.5427,1.697344
1200,1.6269,1.668061
1500,1.6978,1.646585
1800,1.7772,1.636982
2100,1.8483,1.637164


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 3 saved to drive
You are a friendly chatbot specializing in movie dialog. 
User: What's your favorite movie? 
ChatBot: ive never been a big movie fiend and i wasnt raised to do so movies with strong female leads and female characters that arent forced to be relatable to the same amount as white men!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


Step,Training Loss,Validation Loss
300,1.2218,1.731229
600,1.2648,1.742235
900,1.339,1.671264
1200,1.4631,1.618021
1500,1.5774,1.584759
1800,1.6978,1.572832
