## FINETUNE GPT-02 on Shakespear Dataset

#### Install necessary packages

In [1]:
!pip install transformers
!pip install -U datasets==2.17.1

Collecting datasets==2.17.1
  Downloading datasets-2.17.1-py3-none-any.whl.metadata (20 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.17.1)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading fsspec-2023.10.0-py3-none-any.whl (166 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.2.0
    Uninstalling fsspec-2024.2.0:
      Successfully uninstalled fsspec-2024.2.0
  Attempting uninstall: datasets
    Found existing installation: datasets 2.18.0
    Uninstalling datasets-2.18.0:
      Successfully uninstalled dataset

#### Download shakespeare dataset / upload already split dataset on colab

In [None]:
#import gpt_2_simple as gpt2
import os
import requests

In [6]:
# download the data and manually split it into two train and test sets  or other wise use directly

file_name = "shakespeare.txt"
# with open("demofile.txt", "r"):
#     print(f.read())

if not os.path.isfile(file_name):
	url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
	data = requests.get(url)

	with open(file_name, 'w') as f:
		f.write(data.text)


#### Fine-Tune and Train GPT-02

In [4]:
import torch
from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

2024-04-09 20:15:36.574947: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-09 20:15:36.575100: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-09 20:15:36.742615: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# Load the GPT-2 model and tokenizer
model_name = "gpt2"
config = GPT2Config.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Define the dataset for training
train_file = "/content/shakespeare_test_data.txt"
test_file = "/content/shakespeare_train_data.txt"

def load_dataset(train_file, test_file, tokenizer):
    train_dataset = TextDataset(tokenizer=tokenizer, file_path=train_file, block_size=128)
    test_dataset = TextDataset(tokenizer=tokenizer, file_path=test_file, block_size=128)
    return train_dataset, test_dataset

In [None]:
train_dataset, test_dataset = load_dataset(train_file, test_file, tokenizer)

# Define the data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Set up the training arguments
training_args = TrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Train the model
trainer.train()

# Save the fine-tuned model
#trainer.save_model("fine_tuned_gpt2")


Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]






Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,No log,3.921725
2,No log,3.941945
3,No log,3.966987


#### Save the model as well as te tokenizer

In [None]:
trainer.save_model("fine_tuned_gpt2_shakespeare")

tokenizer.save_pretrained("fine_tuned_gpt2_shakespeare")

('fine_tuned_gpt2_shakespeare/tokenizer_config.json',
 'fine_tuned_gpt2_shakespeare/special_tokens_map.json',
 'fine_tuned_gpt2_shakespeare/vocab.json',
 'fine_tuned_gpt2_shakespeare/merges.txt',
 'fine_tuned_gpt2_shakespeare/added_tokens.json')

#### Test the fine tuned model

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned GPT-2 model and tokenizer
model_name = "fine_tuned_gpt2_shakespeare"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Function to generate responses
def generate_response(prompt_text, model, tokenizer, max_length=50, num_return_sequences=1):
    input_ids = tokenizer.encode(prompt_text, return_tensors="pt")

    # Generate response
    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
        temperature=0.7,
        top_p=0.9,
    )

    # Decode the generated responses
    responses = []
    for response_id in output_sequences:
        response = tokenizer.decode(response_id, skip_special_tokens=True)
        responses.append(response)

    return responses



In [None]:
# Test the model with a prompt
prompt_text = "Her sister Katharina welcomed you withal?"
responses = generate_response(prompt_text, model, tokenizer)

for response in responses:
    print(response)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Her sister Katharina welcomed you withal?



KATHARINA:

I am glad to hear you.

HORTENSIO: (To Katharine, who is in the company of the Duke of York, and is a
