In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the recipe CSV into a DataFrame; later cells read its "input_text"
# and "target_text" columns.
df = pd.read_csv('reduced_recipe.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (rows, columns) of the frame as loaded, before the cleaning cell runs.
df.shape

In [None]:
import pandas as pd

def keep_only_strings(value):
    """Return ``value`` unchanged when it is a ``str``; otherwise return ``None``.

    Used cell-by-cell on the text columns so that non-string entries become
    missing values that a later ``dropna`` can remove.
    """
    if isinstance(value, str):
        return value
    return None

# Null out every non-string cell in the two text columns.
text_columns = ["input_text", "target_text"]
for column in text_columns:
    df[column] = df[column].apply(keep_only_strings)

# Drop only the rows whose input or target text is missing. A bare dropna()
# would also discard rows that merely have a NaN in some unrelated column,
# even though only these two columns are used downstream.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run and gives
# the cleaned frame a fresh 0..n-1 index.
df = df.dropna(subset=text_columns).reset_index(drop=True)

In [None]:
from datasets import Dataset

In [None]:
# Build a Hugging Face Dataset from just the input/target text columns.
dataset = Dataset.from_pandas(df.loc[:, ["input_text", "target_text"]])

In [None]:
# Show the dataset summary (its printed representation).
print(dataset)

In [None]:
# Hold out 10% of the rows as the test split. The fixed seed makes the split
# identical on every run, so the test rows stay the same after a kernel restart.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Carve the validation split out of the remaining 90%: 0.1111 of 90% is
# roughly 10% of all rows. Seeded so the train/validation boundary is
# reproducible across runs.
train_valid_split = train_test_split["train"].train_test_split(test_size=0.1111, seed=42)

In [None]:
# Collect the three splits under their conventional names
# (roughly 80% / 10% / 10% of the cleaned rows).
dataset = dict(
    train=train_valid_split["train"],
    validation=train_valid_split["test"],
    test=train_test_split["test"],
)


In [None]:
# Report how many examples ended up in each split, one line per split.
print(
    f"Train size: {len(dataset['train'])}",
    f"Validation size: {len(dataset['validation'])}",
    f"Test size: {len(dataset['test'])}",
    sep="\n",
)

In [None]:
from transformers import T5Tokenizer

# Load the pretrained tokenizer published under the "t5-small" checkpoint name;
# preprocess_function below reads it as a module-level global.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    """Tokenize a batch of input/target text pairs for seq2seq training.

    Args:
        examples: Batch mapping with ``"input_text"`` and ``"target_text"``
            entries, as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch, with its ``"labels"`` entry
        rewritten so that every padding position holds ``-100``.
    """
    model_inputs = tokenizer(
        examples["input_text"],
        text_target=examples["target_text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

    # Targets are padded out to 512 tokens. Left as pad ids, those positions
    # would be scored by the loss like real tokens and dominate it; -100 is
    # the label value the loss ignores.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs

# Ensure dataset is a Hugging Face DatasetDict
from datasets import DatasetDict

# Wrap the three splits in a DatasetDict so a single .map call covers them all
dataset = DatasetDict(
    {split: dataset[split] for split in ("train", "validation", "test")}
)

# Tokenize every split in batches with preprocess_function
tokenized_datasets = dataset.map(preprocess_function, batched=True)


In [None]:
from transformers import T5ForConditionalGeneration, TrainingArguments, Trainer

import torch

# Use the GPU when one is present and fall back to the CPU otherwise, so the
# notebook does not crash on machines without CUDA.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Start from the pretrained "t5-small" checkpoint.
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./t5_recipe_model",  # Directory to save model checkpoints
    evaluation_strategy="epoch",      # Evaluate at the end of each epoch
    save_strategy="epoch",            # Save a checkpoint at the end of each epoch
    logging_dir="./logs",             # Directory for logging

    per_device_train_batch_size=4,    # Small batch (author targeted 6GB of VRAM)
    per_device_eval_batch_size=4,     # Match train batch size for consistency

    num_train_epochs=5,               # Number of passes over the training split
    learning_rate=3e-5,               # Optimizer learning rate
    weight_decay=0.01,                # Regularization to avoid overfitting

    # Mixed precision is only requested when CUDA is available.
    # NOTE(review): T5 checkpoints are reported to overflow under fp16; watch
    # the training loss for NaN/0 values -- confirm.
    fp16=use_cuda,
    gradient_accumulation_steps=2,    # 4 x 2 = effective batch of 8 per device

    save_total_limit=3,               # Keep 3 latest checkpoints
    load_best_model_at_end=True,      # Load best model based on evaluation metric
)


In [None]:
# Place the model on the selected device and hand it to the Trainer together
# with the training arguments and the tokenized train/validation splits.
trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)
def evaluate_on_cpu(model, eval_dataloader):
    """Run ``trainer.evaluate()`` with ``model`` temporarily moved to the CPU.

    Args:
        model: Object exposing ``.to(...)``; moved to ``"cpu"`` before the
            evaluation and back to the module-level ``device`` afterwards.
        eval_dataloader: Unused; kept so existing call sites keep working.

    Returns:
        Whatever the module-level ``trainer.evaluate()`` returns.

    NOTE(review): the evaluation is driven by the global ``trainer``, which
    holds its own model reference and device settings -- confirm that moving
    ``model`` here actually makes the evaluation run on the CPU.
    """
    model.to("cpu")
    try:
        return trainer.evaluate()
    finally:
        # Always put the model back on the training device, even when the
        # evaluation raises; otherwise a failed evaluation would leave the
        # model on the CPU.
        model.to(device)


In [None]:
train_output = trainer.train()

# Training only writes checkpoint-* subfolders under the output directory.
# Save the final model and the tokenizer at its root as well, so the reload
# cell can call from_pretrained("./t5_recipe_model") on a fresh run.
trainer.save_model(trainer.args.output_dir)
tokenizer.save_pretrained(trainer.args.output_dir)

# Keep the training summary as the cell's displayed output.
train_output

In [None]:
# Evaluate with the Trainer's configured eval dataset (the validation split
# passed at construction). The held-out test split is not evaluated in the
# visible cells.
trainer.evaluate()

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Directory the fine-tuned model and tokenizer files are read from
model_path = "./t5_recipe_model"

# Restore the tokenizer and the fine-tuned model from that directory
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

print("Model and tokenizer loaded successfully!")

In [None]:
# Example input
# NOTE(review): the prompt prefix should match the format of the training
# "input_text" values -- confirm against the CSV.
input_text = "generate recipe: tomatoes, onions, garlic, pasta"

# Tokenize with the same 512-token limit used for training, and place the
# tensors on whichever device the model currently lives on.
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
inputs = inputs.to(model.device)
input_ids = inputs.input_ids

# Generate response. The default generation length is very short and cuts
# the recipe off after a few words, so give it an explicit token budget
# (256 is a tunable choice), and pass the attention mask explicitly.
output_ids = model.generate(
    input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=256,
)

# Decode output
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Generated Recipe:", generated_text)
