#### Fine-Tuning the GPT-2 model

In [2]:
import re

import pandas as pd

In [3]:
# Read the raw recipes table from the CSV file into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='../dataset/recipes.csv')

In [6]:
# Converting from R-style list to Python List
def to_python_list(r_list_string):
    """Parse an R character vector rendered as text into a list of strings.

    Typical input looks like ``c("flour", "sugar")``. Elements are read from
    the double-quoted strings themselves, so a comma inside an element (common
    in instruction sentences) does not split that element.

    Args:
        r_list_string: The value to parse. Anything that is not a ``str``
            (e.g. a NaN from a missing cell) yields an empty list.

    Returns:
        list[str]: The parsed elements, in order. Empty-vector and missing
        markers (``character(0)``, ``c()``, ``NA``, ``NULL``, empty string)
        give ``[]``; unquoted ``NA`` elements are dropped.
    """
    if not isinstance(r_list_string, str):
        return []

    text = r_list_string.strip()

    # Grab every double-quoted element. The `\\.` alternative keeps an escaped
    # quote (\") inside its element instead of ending the match early.
    quoted_items = re.findall(r'"((?:[^"\\]|\\.)*)"', text, flags=re.DOTALL)
    if quoted_items:
        return [item.replace('\\"', '"') for item in quoted_items]

    # Nothing quoted: recognise the textual forms of "no elements".
    if text in ('', 'NA', 'NULL', 'c()', 'character(0)'):
        return []

    # Unquoted vector such as c(1, 2, 3): remove the c( ... ) wrapper as a
    # prefix/suffix (not as a character set) and split on commas.
    if text.startswith('c(') and text.endswith(')'):
        parts = (part.strip() for part in text[2:-1].split(','))
        return [part for part in parts if part and part != 'NA']

    # A single bare value.
    return [text]

In [7]:
# Parse the two R-style list columns into Python lists. Values that are already
# lists pass through untouched, so re-running this cell is harmless; without the
# guard a second run would hand lists to to_python_list, which returns [] for
# any non-string value and would wipe both columns.
for list_column in ("RecipeIngredientParts", "RecipeInstructions"):
    dataset[list_column] = dataset[list_column].apply(
        lambda value: value if isinstance(value, list) else to_python_list(value)
    )

> GPT-2 needs to learn where a sample starts and ends.  
> If we format every recipe consistently in this way, the model can learn that structure, and we can prompt it with the same structure again later on.

In [9]:
# Render a single recipe row as one delimited training sample
def format_sample(row):
    """Build the training text for one recipe.

    Args:
        row: Mapping-like record providing 'Name', 'RecipeIngredientParts'
            (iterable of strings) and 'RecipeInstructions' (iterable of
            strings).

    Returns:
        str: The sample wrapped in <|startoftext|> / <|endoftext|> markers,
        with the ingredients joined by ', ' on one line and one instruction
        per line. A missing name is rendered as 'Unnamed Recipe'.
    """
    if pd.notna(row['Name']):
        title = row['Name']
    else:
        title = 'Unnamed Recipe'

    ingredient_line = ', '.join(row['RecipeIngredientParts'])
    instruction_block = '\n'.join(row['RecipeInstructions'])

    sample_lines = [
        "<|startoftext|>",
        f"Name: {title}",
        f"Ingredients: {ingredient_line}",
        "Instructions:",
        instruction_block,
        "<|endoftext|>",
    ]
    # Every line, including the closing marker, ends with a newline.
    return '\n'.join(sample_lines) + '\n'

In [10]:
# Creating a new dataset for fine-tuning
# Best-effort export: a row that cannot be formatted or written is skipped so
# one bad record does not abort the whole file, but every skip is recorded and
# reported instead of being dropped silently.
written_count = 0
skipped_rows = []  # (row index, error description) for each skipped row
with open('../dataset/gpt2_recipes.txt', 'w', encoding='utf-8') as f:
    for row_index, row in dataset.iterrows():
        try:
            text = format_sample(row)
            f.write(text)
        except Exception as exc:
            skipped_rows.append((row_index, repr(exc)))
            continue
        written_count += 1

print(f"Wrote {written_count} samples; skipped {len(skipped_rows)} rows.")
# Show a few failures so systematic problems are visible without flooding the output.
for row_index, error in skipped_rows[:5]:
    print(f"  row {row_index}: {error}")