In [1]:
# Import necessary libraries
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling

1. Data Collection

In [2]:
# Define training data
train_data = [
    {"input": "How is the weather today?", "output": "The weather today is sunny."},
    {"input": "Tell me a joke.", "output": "Why did the chicken cross the road?"},
    {"input": "What's the latest news?", "output": "I'm not sure, would you like me to look it up for you?"},
    {"input": "Recommend a good book.", "output": "I recommend 'The Great Gatsby' by F. Scott Fitzgerald."},
    {"input": "What's the latest news?", "output": "I'm not sure, let me check..."},
    {"input": "Give me a fun fact.", "output": "Did you know that honey never spoils?"},
    # Add more examples for better accuracy
]

2. Data Preprocessing

In [3]:
# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [4]:
# Tokenize training data
tokenized_data = []
for example in train_data:
    input_tokens = tokenizer.encode(example["input"], add_special_tokens=False)
    output_tokens = tokenizer.encode(example["output"], add_special_tokens=False)
    tokenized_data.append({"input_ids": input_tokens, "output_ids": output_tokens})

3. Model Setup

In [5]:
# Load pre-trained model
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [6]:
# Training configuration
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=1000,
    save_total_limit=2,
    logging_dir="./logs",
)

In [7]:
# Set up data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


4. Fine-Tuning Loop

In [8]:
# Create a Trainer for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_data,
)

In [9]:
# Start fine-tuning
tokenizer.pad_token = tokenizer.eos_token  # Set padding token
trainer.train()

Step,Training Loss


TrainOutput(global_step=10, training_loss=2.2405675888061523, metrics={'train_runtime': 106.614, 'train_samples_per_second': 0.281, 'train_steps_per_second': 0.094, 'total_flos': 90839808000.0, 'train_loss': 2.2405675888061523, 'epoch': 5.0})

5. Evaluation

In [10]:
# Define validation data
validation_data = [
    {"input": "How are you?", "ground_truth": "I'm doing well, thank you!"},
    {"input": "Tell me a joke.", "ground_truth": "Why did the chicken cross the road?"},
    # Add more examples
]

In [11]:
# Evaluate the fine-tuned model
eval_results = []
for example in validation_data:
    input_text = example["input"]
    ground_truth = example["ground_truth"]

    # Generate text using the fine-tuned model
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    generated_ids = model.generate(input_ids, max_length=50, num_return_sequences=1)
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # Calculate match score using a more sophisticated metric if needed
    match_score = int(generated_text == ground_truth)

    eval_results.append({"input": input_text, "generated_text": generated_text, "match_score": match_score})

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [12]:
# Calculate and print evaluation metrics
total_examples = len(eval_results)
total_matches = sum(result["match_score"] for result in eval_results)
accuracy = total_matches / total_examples
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 0.00%


6. Deployment

In [14]:
# Deployment code (simplified)
while True:
    user_input = input("User: ")

    # Tokenize user input
    input_ids = tokenizer.encode(user_input, return_tensors="pt")

    # Generate a response using the fine-tuned model
    generated_response = model.generate(input_ids, max_length=50, num_return_sequences=1)
    generated_response = tokenizer.decode(generated_response[0], skip_special_tokens=True)

    print(f"Bot: {generated_response}")

User: Hi there


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot: Hi there, I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry
User: why?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot: why? I'm not sure. I'm not sure. I'm not sure. I'm not sure. I'm not sure. I'm not sure. I'm not sure. I'm not sure. I'm not sure. I'm not
User: Hi


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot: Hi, I'm a little bit of a fan of the "I'm a little bit of a fan of the "I'm a little bit of a fan of the "I'm a little bit of a fan of the "I'm a little


KeyboardInterrupt: Interrupted by user