# TRANSFER LEARNING ON TRANSFORMERS TO GENERATE INFORMATION ABOUT COSTA RICAN DISHES

## Fine-Tuning GPT-2 for Recipe Generation: Training and Analysis

## Dataset version 1

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import random 

# Set the device: prefer a CUDA GPU, then Apple Silicon (MPS), otherwise CPU.
# The original comment promised "GPU" but only MPS was probed, so CUDA
# machines silently fell back to CPU for the explicit model placement below.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Function to split the dataset
def split_dataset(filename, train_ratio=0.8, seed=None):
    """Split a recipe file into shuffled training and validation records.

    The file is read as UTF-8 and cut on the ``---end-of-recipe---``
    delimiter. Whitespace-only chunks (such as the one produced by a trailing
    delimiter) are discarded so they do not count towards either split.

    Args:
        filename: Path of the delimiter-separated recipe file.
        train_ratio: Fraction of the records placed in the training split.
        seed: Optional seed for a reproducible shuffle. ``None`` (the
            default) shuffles with the global ``random`` state.

    Returns:
        A ``(train_data, validation_data)`` tuple of lists of record strings.
    """
    # Local import keeps the function usable when its cell is run on its own.
    import random

    with open(filename, 'r', encoding='utf-8') as file:
        chunks = file.read().split('---end-of-recipe---')

    # Empty chunks are not recipes; keeping them would skew the split sizes.
    content = [chunk for chunk in chunks if chunk.strip()]

    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(content)

    train_size = int(len(content) * train_ratio)
    train_data = content[:train_size]
    validation_data = content[train_size:]

    return train_data, validation_data

# Shuffle/split dataset v1 and persist each split to its own file so the
# dataset class can read them back by path.
train_filename = 'train_dataset_v1.txt'
validation_filename = 'validation_dataset_v1.txt'
train_data, validation_data = split_dataset('../dataset-transformers/dishes_train_v1.txt')

for split_path, split_records in (
    (train_filename, train_data),
    (validation_filename, validation_data),
):
    with open(split_path, 'w', encoding='utf-8') as f:
        # Re-insert the delimiter so the file can be split again on load.
        f.write('---end-of-recipe---'.join(split_records))

# Load the pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained GPT-2 language model and place it on the chosen device.
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device)

# Custom Dataset class for recipes
class RecipeDataset(Dataset):
    """Dataset of tokenized recipes for causal language-model fine-tuning.

    The source file is split on ``---end-of-recipe---``; every non-empty
    record is tokenized, truncated/padded to ``block_size`` tokens and stored.

    Args:
        tokenizer: Tokenizer exposing ``encode_plus``; it must already have a
            padding token configured because records are padded to
            ``block_size``.
        filename: Path of the delimiter-separated recipe file (UTF-8).
        block_size: Fixed token length of every example.
    """

    def __init__(self, tokenizer, filename, block_size=128):
        self.tokenizer = tokenizer
        self.examples = []

        # Read and split the dataset file
        with open(filename, 'r', encoding='utf-8') as f:
            recipes = f.read().split('---end-of-recipe---')

        # Encode recipes and add to examples
        for recipe in recipes:
            # Strip surrounding whitespace (as the other RecipeDataset
            # variants in this notebook do) so stray newlines around the
            # delimiter do not consume tokens of the block.
            recipe = recipe.strip()
            if not recipe:
                continue

            tokens = tokenizer.encode_plus(recipe,
                                           add_special_tokens=True,
                                           max_length=block_size,
                                           padding='max_length',
                                           truncation=True,
                                           return_tensors='pt')

            self.examples.append(tokens)

    def __len__(self):
        """Return the number of tokenized recipes."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``i``."""
        # encode_plus returned tensors with a leading batch axis of size 1.
        input_ids = self.examples[i]['input_ids'][0]
        attention_mask = self.examples[i]['attention_mask'][0]

        # Labels for language modeling: a copy of the inputs, with padded
        # positions set to -100 so the loss ignores them. Without this the
        # model is trained to predict the padding token over the padded tail.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Initialize the training dataset from the TRAINING split only.
# The previous version loaded the full source file here, so every validation
# recipe was also part of the training data and the validation loss did not
# measure generalisation.
dataset = RecipeDataset(tokenizer, train_filename)

# Create a DataLoader. It is not passed to the Trainer below (the Trainer is
# given the dataset directly); it is kept for ad-hoc inspection.
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)


# Define training arguments for fine-tuning: log and evaluate every 50 steps,
# checkpoint every 500 steps and keep at most two checkpoints.
training_args = TrainingArguments(
    output_dir='./gpt2_finetuned_recipes_v1',
    num_train_epochs=20,
    per_device_train_batch_size=2,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=50,
)

# Load validation dataset
validation_dataset = RecipeDataset(tokenizer, validation_filename)

# Initialize trainer for model fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=validation_dataset
)

# Train the model
trainer.train()

# Print log history
print(trainer.state.log_history)

# Extract and print the training and validation loss series; log entries
# carry either a 'loss' key or an 'eval_loss' key, so filter on each.
training_loss_run1 = [log['loss'] for log in trainer.state.log_history if 'loss' in log]
validation_loss_v1 = [log['eval_loss'] for log in trainer.state.log_history if 'eval_loss' in log]
print(training_loss_run1)
print(validation_loss_v1)


### Performance metrics dataset version 1

In [None]:
# Loss curves recorded while fine-tuning GPT-2 on dataset version 1:
# (series, legend label, line colour).
loss_curves_v1 = (
    (training_loss_run1, 'Training Loss GPT-2 dataset 1', '#696969'),
    (validation_loss_v1, 'Validation Loss GPT-2 dataset 1', '#C0C0C0'),
)

plt.figure(figsize=(10, 6))
for series, legend_label, line_color in loss_curves_v1:
    plt.plot(series, label=legend_label, color=line_color, linewidth=3.5)
plt.title('Training and Validation Loss Over Time')
plt.xlabel('Logging Steps')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# clean gpu cache
# The device selection in this notebook can fall back to a non-MPS device, so
# only touch the MPS allocator when the MPS backend is actually available.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

## Dataset version 2

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

# Set the device: prefer a CUDA GPU, then Apple Silicon (MPS), otherwise CPU.
# The original comment promised "GPU" but only MPS was probed, so CUDA
# machines silently fell back to CPU for the explicit model placement below.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Function to split the dataset
def split_dataset(filename, train_ratio=0.8, seed=None):
    """Split a recipe file into shuffled training and validation records.

    The file is read as UTF-8 and cut on the ``---end-of-recipe---``
    delimiter. Whitespace-only chunks (such as the one produced by a trailing
    delimiter) are discarded so they do not count towards either split.

    Args:
        filename: Path of the delimiter-separated recipe file.
        train_ratio: Fraction of the records placed in the training split.
        seed: Optional seed for a reproducible shuffle. ``None`` (the
            default) shuffles with the global ``random`` state.

    Returns:
        A ``(train_data, validation_data)`` tuple of lists of record strings.
    """
    # This cell does not import ``random`` itself; importing it here keeps
    # the function usable when the cell is run on its own.
    import random

    with open(filename, 'r', encoding='utf-8') as file:
        chunks = file.read().split('---end-of-recipe---')

    # Empty chunks are not recipes; keeping them would skew the split sizes.
    content = [chunk for chunk in chunks if chunk.strip()]

    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(content)

    train_size = int(len(content) * train_ratio)
    train_data = content[:train_size]
    validation_data = content[train_size:]

    return train_data, validation_data

# Shuffle/split dataset v2 and persist each split to its own file so the
# dataset class can read them back by path.
train_filename = 'train_dataset_v2.txt'
validation_filename = 'validation_dataset_v2.txt'
train_data, validation_data = split_dataset('../dataset-transformers/dishes_train_v2.txt')

for split_path, split_records in (
    (train_filename, train_data),
    (validation_filename, validation_data),
):
    with open(split_path, 'w', encoding='utf-8') as f:
        # Re-insert the delimiter so the file can be split again on load.
        f.write('---end-of-recipe---'.join(split_records))

# Load the pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained GPT-2 language model and place it on the chosen device.
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device)

# Custom Dataset class for recipes
class RecipeDataset(Dataset):
    """Dataset of tokenized input/output recipe pairs for causal LM fine-tuning.

    The source file is split on ``---end-of-recipe---``. Each non-empty record
    must contain exactly one ``'\\nOutput: '`` separator; the text before and
    after it is joined with a single space, tokenized and truncated/padded to
    ``block_size`` tokens. Records that do not match this format are skipped.

    Args:
        tokenizer: Tokenizer exposing ``encode_plus``; it must already have a
            padding token configured because records are padded to
            ``block_size``.
        filename: Path of the delimiter-separated recipe file (UTF-8).
        block_size: Fixed token length of every example.
    """

    def __init__(self, tokenizer, filename, block_size=128):
        self.tokenizer = tokenizer
        self.examples = []

        # Read and split the dataset file
        with open(filename, 'r', encoding='utf-8') as f:
            recipes = f.read().split('---end-of-recipe---')

        # Process each recipe
        for recipe in recipes:
            recipe = recipe.strip()
            if not recipe:
                continue

            # Splitting input and output
            parts = recipe.split('\nOutput: ')
            if len(parts) != 2:
                continue  # Skip if the format is not correct

            input_text, output_text = parts
            full_text = input_text + " " + output_text  # Combine input and output

            tokens = tokenizer.encode_plus(full_text,
                                           add_special_tokens=True,
                                           max_length=block_size,
                                           padding='max_length',
                                           truncation=True,
                                           return_tensors='pt')

            self.examples.append(tokens)

    def __len__(self):
        """Return the number of tokenized recipes."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``i``."""
        # encode_plus returned tensors with a leading batch axis of size 1.
        input_ids = self.examples[i]['input_ids'][0]
        attention_mask = self.examples[i]['attention_mask'][0]

        # Labels for language modeling: a copy of the inputs, with padded
        # positions set to -100 so the loss ignores them. Without this the
        # model is trained to predict the padding token over the padded tail.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Initialize the training dataset from the TRAINING split only.
# The previous version loaded the full source file here, so every validation
# recipe was also part of the training data and the validation loss did not
# measure generalisation.
dataset = RecipeDataset(tokenizer, train_filename)

# Create a DataLoader. It is not passed to the Trainer below (the Trainer is
# given the dataset directly); it is kept for ad-hoc inspection.
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Define training arguments for fine-tuning: log and evaluate every 50 steps,
# checkpoint every 500 steps and keep at most two checkpoints.
training_args = TrainingArguments(
    output_dir='./gpt2_finetuned_recipes_v2',
    num_train_epochs=20,
    per_device_train_batch_size=2,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=50,
)

# Load validation dataset
validation_dataset = RecipeDataset(tokenizer, validation_filename)

# Initialize trainer for model fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=validation_dataset
)

# Train the model
trainer.train()

# Print log history
print(trainer.state.log_history)

# Extract and print the training and validation loss series; log entries
# carry either a 'loss' key or an 'eval_loss' key, so filter on each.
training_loss_run2 = [log['loss'] for log in trainer.state.log_history if 'loss' in log]
validation_loss_v2 = [log['eval_loss'] for log in trainer.state.log_history if 'eval_loss' in log]
print(training_loss_run2)
print(validation_loss_v2)


### Performance metrics dataset version 2

In [None]:
# Loss curves recorded while fine-tuning GPT-2 on dataset version 2:
# (series, legend label, line colour).
loss_curves_v2 = (
    (training_loss_run2, 'Training Loss GPT-2 dataset 2', '#696969'),
    (validation_loss_v2, 'Validation Loss GPT-2 dataset 2', '#C0C0C0'),
)

plt.figure(figsize=(10, 6))
for series, legend_label, line_color in loss_curves_v2:
    plt.plot(series, label=legend_label, color=line_color, linewidth=3.5)
plt.title('Training and Validation Loss Over Time')
plt.xlabel('Logging Steps')
plt.ylabel('Loss')
plt.legend()
plt.show()

### Combination of the performance of both datasets

In [None]:
import matplotlib.pyplot as plt

# Overlay the GPT-2 training and validation loss curves of both dataset
# versions. The title and legend labels were corrected: the figure also shows
# validation curves, and the labels now use consistent capitalisation.
plt.figure(figsize=(10, 6))
plt.plot(training_loss_run1, label='Training Loss GPT-2 dataset 1', linewidth=3.5)
plt.plot(training_loss_run2, label='Training Loss GPT-2 dataset 2', linewidth=3.5)
plt.plot(validation_loss_v1, label='Validation Loss GPT-2 dataset 1', linewidth=3.5)
plt.plot(validation_loss_v2, label='Validation Loss GPT-2 dataset 2', linewidth=3.5)
plt.title('Comparison of Training and Validation Losses')
plt.xlabel('Logging Steps')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# clean gpu cache
# The device selection in this notebook can fall back to a non-MPS device, so
# only touch the MPS allocator when the MPS backend is actually available.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

# TRANSFER LEARNING ON TRANSFORMERS TO GENERATE INFORMATION ABOUT COSTA RICAN DISHES

## Fine-Tuning GPT-neo-125M for Recipe Generation: Training and Analysis

## Dataset version 1

In [None]:
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer
import random
import matplotlib.pyplot as plt

# Set the device: prefer a CUDA GPU, then Apple Silicon (MPS), otherwise CPU.
# The original comment promised "GPU" but only MPS was probed, so CUDA
# machines silently fell back to CPU for the explicit model placement below.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Function to split the dataset
def split_dataset(filename, train_ratio=0.8, seed=None):
    """Split a recipe file into shuffled training and validation records.

    The file is read as UTF-8 and cut on the ``---end-of-recipe---``
    delimiter. Whitespace-only chunks (such as the one produced by a trailing
    delimiter) are discarded so they do not count towards either split.

    Args:
        filename: Path of the delimiter-separated recipe file.
        train_ratio: Fraction of the records placed in the training split.
        seed: Optional seed for a reproducible shuffle. ``None`` (the
            default) shuffles with the global ``random`` state.

    Returns:
        A ``(train_data, validation_data)`` tuple of lists of record strings.
    """
    # Local import keeps the function usable when its cell is run on its own.
    import random

    with open(filename, 'r', encoding='utf-8') as file:
        chunks = file.read().split('---end-of-recipe---')

    # Empty chunks are not recipes; keeping them would skew the split sizes.
    content = [chunk for chunk in chunks if chunk.strip()]

    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(content)

    train_size = int(len(content) * train_ratio)
    train_data = content[:train_size]
    validation_data = content[train_size:]

    return train_data, validation_data

# Shuffle/split dataset v1 and persist each split to its own file so the
# dataset class can read them back by path.
train_filename = 'train_dataset-gptneo-v1.txt'
validation_filename = 'validation_dataset-gptneo-v1.txt'
train_data, validation_data = split_dataset('../dataset-transformers/dishes_train_v1.txt')

for split_path, split_records in (
    (train_filename, train_data),
    (validation_filename, validation_data),
):
    with open(split_path, 'w', encoding='utf-8') as f:
        # Re-insert the delimiter so the file can be split again on load.
        f.write('---end-of-recipe---'.join(split_records))

# GPT-Neo is paired with the GPT-2 tokenizer here; the padding token is
# configured later, inside RecipeDataset.__init__.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Load the pretrained GPT-Neo 125M model and place it on the chosen device.
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')
model.to(device)

# Custom Dataset class for recipes
class RecipeDataset(Dataset):
    """Dataset of tokenized recipes for causal language-model fine-tuning.

    The source file is split on ``---end-of-recipe---``; every non-empty
    record is stripped, tokenized, truncated/padded to ``block_size`` tokens
    and stored.

    Note:
        Constructing the dataset sets ``tokenizer.pad_token`` to
        ``tokenizer.eos_token`` on the tokenizer object that was passed in.

    Args:
        tokenizer: Tokenizer exposing ``encode_plus``, ``eos_token`` and a
            writable ``pad_token``.
        filename: Path of the delimiter-separated recipe file (UTF-8).
        block_size: Fixed token length of every example.
    """

    def __init__(self, tokenizer, filename, block_size=128):
        self.tokenizer = tokenizer
        self.tokenizer.pad_token = self.tokenizer.eos_token  # Set padding token
        self.examples = []

        with open(filename, 'r', encoding='utf-8') as f:
            recipes = f.read().split('---end-of-recipe---')

        for recipe in recipes:
            recipe = recipe.strip()
            if not recipe:
                continue

            tokens = tokenizer.encode_plus(recipe,
                                           add_special_tokens=True,
                                           max_length=block_size,
                                           padding='max_length',
                                           truncation=True,
                                           return_tensors='pt')

            self.examples.append(tokens)

    def __len__(self):
        """Return the number of tokenized recipes."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``i``."""
        # encode_plus returned tensors with a leading batch axis of size 1.
        input_ids = self.examples[i]['input_ids'][0]
        attention_mask = self.examples[i]['attention_mask'][0]

        # Labels for language modeling: a copy of the inputs, with padded
        # positions set to -100 so the loss ignores them. Without this the
        # model is trained to predict the padding token over the padded tail.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Tokenize the training and validation split files.
train_dataset = RecipeDataset(tokenizer, train_filename)
validation_dataset = RecipeDataset(tokenizer, validation_filename)

# Fine-tuning configuration. An earlier variant (10 epochs with per-epoch
# logging/evaluation) was replaced by the step-based settings below: log and
# evaluate every 50 steps, checkpoint every 500 steps, keep two checkpoints.
trainer_settings = dict(
    output_dir='./gpt_neo_finetuned_recipes_v1',
    num_train_epochs=20,
    per_device_train_batch_size=2,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=50,
)
training_args = TrainingArguments(**trainer_settings)

# Build the trainer and run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)
trainer.train()

# Collect the loss series from the trainer's log history: entries with a
# 'loss' key are training logs, entries with 'eval_loss' are evaluations.
training_loss = []
validation_loss = []
for log_entry in trainer.state.log_history:
    if 'loss' in log_entry:
        training_loss.append(log_entry['loss'])
    if 'eval_loss' in log_entry:
        validation_loss.append(log_entry['eval_loss'])

print("Training Loss:", training_loss)
print("Validation Loss:", validation_loss)

### Performance metrics EleutherAI/gpt-neo-125M dataset version 1

In [None]:

import matplotlib.pyplot as plt

# Training vs. validation loss for GPT-Neo 125M on dataset version 1.
# The title was corrected: this figure shows one run's training AND
# validation curves, matching the equivalent GPT-2 per-dataset plots.
plt.figure(figsize=(10, 6))
plt.plot(training_loss, label='Training Loss GPT-Neo 125M dataset 1', linewidth=3.5)
plt.plot(validation_loss, label='Validation Loss GPT-Neo 125M dataset 1', linewidth=3.5)
plt.title('Training and Validation Loss Over Time')
plt.xlabel('Logging Steps')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# clean gpu cache
# The device selection in this notebook can fall back to a non-MPS device, so
# only touch the MPS allocator when the MPS backend is actually available.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

## Dataset version 2

In [None]:
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import random
import matplotlib.pyplot as plt

# Set the device: prefer a CUDA GPU, then Apple Silicon (MPS), otherwise CPU.
# The original comment promised "GPU" but only MPS was probed, so CUDA
# machines silently fell back to CPU for the explicit model placement below.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Function to split the dataset
def split_dataset(filename, train_ratio=0.8, seed=None):
    """Split a recipe file into shuffled training and validation records.

    The file is read as UTF-8 and cut on the ``---end-of-recipe---``
    delimiter. Whitespace-only chunks (such as the one produced by a trailing
    delimiter) are discarded so they do not count towards either split.

    Args:
        filename: Path of the delimiter-separated recipe file.
        train_ratio: Fraction of the records placed in the training split.
        seed: Optional seed for a reproducible shuffle. ``None`` (the
            default) shuffles with the global ``random`` state.

    Returns:
        A ``(train_data, validation_data)`` tuple of lists of record strings.
    """
    # Local import keeps the function usable when its cell is run on its own.
    import random

    with open(filename, 'r', encoding='utf-8') as file:
        chunks = file.read().split('---end-of-recipe---')

    # Empty chunks are not recipes; keeping them would skew the split sizes.
    content = [chunk for chunk in chunks if chunk.strip()]

    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(content)

    train_size = int(len(content) * train_ratio)
    train_data = content[:train_size]
    validation_data = content[train_size:]

    return train_data, validation_data

# Shuffle/split dataset v2 and persist each split to its own file so the
# dataset class can read them back by path.
train_filename = 'train_dataset-gptneo-v2.txt'
validation_filename = 'validation_dataset-gptneo-v2.txt'
train_data, validation_data = split_dataset('../dataset-transformers/dishes_train_v2.txt')

for split_path, split_records in (
    (train_filename, train_data),
    (validation_filename, validation_data),
):
    with open(split_path, 'w', encoding='utf-8') as f:
        # Re-insert the delimiter so the file can be split again on load.
        f.write('---end-of-recipe---'.join(split_records))

# GPT-Neo is paired with the GPT-2 tokenizer here.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 ships without a native padding token, so reuse end-of-sequence.
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained GPT-Neo 125M model and place it on the chosen device.
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')
model.to(device)

# Custom Dataset class for recipes
class RecipeDataset(Dataset):
    """Dataset of tokenized input/output recipe pairs for causal LM fine-tuning.

    The source file is split on ``---end-of-recipe---``. Each non-empty record
    must contain exactly one ``'\\nOutput: '`` separator; the text before and
    after it is joined with a single space, tokenized and truncated/padded to
    ``block_size`` tokens. Records that do not match this format are skipped.

    Args:
        tokenizer: Tokenizer exposing ``encode_plus``; it must already have a
            padding token configured because records are padded to
            ``block_size``.
        filename: Path of the delimiter-separated recipe file (UTF-8).
        block_size: Fixed token length of every example.
    """

    def __init__(self, tokenizer, filename, block_size=128):
        self.tokenizer = tokenizer
        self.examples = []

        # Read and split the dataset file
        with open(filename, 'r', encoding='utf-8') as f:
            recipes = f.read().split('---end-of-recipe---')

        # Process each recipe
        for recipe in recipes:
            recipe = recipe.strip()
            if not recipe:
                continue

            # Splitting input and output
            parts = recipe.split('\nOutput: ')
            if len(parts) != 2:
                continue  # Skip if the format is not correct

            input_text, output_text = parts
            full_text = input_text + " " + output_text  # Combine input and output

            tokens = tokenizer.encode_plus(full_text,
                                           add_special_tokens=True,
                                           max_length=block_size,
                                           padding='max_length',
                                           truncation=True,
                                           return_tensors='pt')

            self.examples.append(tokens)

    def __len__(self):
        """Return the number of tokenized recipes."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``i``."""
        # encode_plus returned tensors with a leading batch axis of size 1.
        input_ids = self.examples[i]['input_ids'][0]
        attention_mask = self.examples[i]['attention_mask'][0]

        # Labels for language modeling: a copy of the inputs, with padded
        # positions set to -100 so the loss ignores them. Without this the
        # model is trained to predict the padding token over the padded tail.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }


# Tokenize the training and validation split files.
train_dataset = RecipeDataset(tokenizer, train_filename)
validation_dataset = RecipeDataset(tokenizer, validation_filename)

# Fine-tuning configuration: log and evaluate every 50 steps, checkpoint
# every 500 steps and keep at most two checkpoints.
trainer_settings = dict(
    output_dir='./gpt_neo_finetuned_recipes_v2',
    num_train_epochs=20,
    per_device_train_batch_size=2,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=50,
)
training_args = TrainingArguments(**trainer_settings)

# Build the trainer and run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)
trainer.train()

# Collect the loss series from the trainer's log history: entries with a
# 'loss' key are training logs, entries with 'eval_loss' are evaluations.
training_loss_gptneo_v2 = []
validation_loss_gptneo_v2 = []
for log_entry in trainer.state.log_history:
    if 'loss' in log_entry:
        training_loss_gptneo_v2.append(log_entry['loss'])
    if 'eval_loss' in log_entry:
        validation_loss_gptneo_v2.append(log_entry['eval_loss'])

print("Training Loss:", training_loss_gptneo_v2)
print("Validation Loss:", validation_loss_gptneo_v2)

### Combination of the performance of both datasets

In [None]:
import matplotlib.pyplot as plt

# Overlay the GPT-Neo 125M training and validation loss curves of both
# dataset versions. The title was corrected: the figure also shows the
# validation curves, not only the training losses.
plt.figure(figsize=(10, 6))
plt.plot(training_loss_gptneo_v2, label='Training Loss GPT-Neo 125M dataset 2', linewidth=3.5)
plt.plot(validation_loss_gptneo_v2, label='Validation Loss GPT-Neo 125M dataset 2', linewidth=3.5)
plt.plot(training_loss, label='Training Loss GPT-Neo 125M dataset 1', linewidth=3.5)
plt.plot(validation_loss, label='Validation Loss GPT-Neo 125M dataset 1', linewidth=3.5)
plt.title('Comparison of Training and Validation Losses')
plt.xlabel('Logging Steps')
plt.ylabel('Loss')
plt.legend()
plt.show()