In [27]:
import pandas as pd
from datasets import Dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

In [28]:
# Load the data
train_df = pd.read_csv('NLP_Recipe_train.csv')
test_df = pd.read_csv('NLP_Recipe_test.csv')

In [29]:
# Convert the data into Hugging Face's Dataset object
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)


In [30]:
# Initialize the tokenizer
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [31]:
# Function to preprocess the data
def preprocess_data(examples):
    inputs = ["summarize: " + ner for ner in examples["ner"]]
    targets = [ingredients + " " + steps for ingredients, steps in zip(examples["ingredients"], examples["steps"])]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")["input_ids"]
    model_inputs["labels"] = labels
    return model_inputs

In [32]:
# Apply the preprocessing function
train_dataset = train_dataset.map(preprocess_data, batched=True)
test_dataset = test_dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/6118 [00:00<?, ? examples/s]

Map:   0%|          | 0/1080 [00:00<?, ? examples/s]

In [33]:
# Initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [34]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [35]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,2.4352,1.343952
2,1.4532,1.289999
3,1.4228,1.276826




TrainOutput(global_step=2295, training_loss=1.6700227093332993, metrics={'train_runtime': 861.6943, 'train_samples_per_second': 21.3, 'train_steps_per_second': 2.663, 'total_flos': 621015856054272.0, 'train_loss': 1.6700227093332993, 'epoch': 3.0})

In [36]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_metric, Dataset
import torch

In [37]:
# Load the dataset
test_df = pd.read_csv('NLP_Recipe_test.csv')

# Load BLEU metric
bleu_metric = load_metric('bleu')

# Function to prepare data for T5 input
def prepare_data(row):
    ner_input = row['ner']  # Key ingredients input
    prompt = f"generate full recipe steps and ingredients for: {ner_input}"
    return prompt

# Function to generate recipes and compute BLEU scores
def generate_and_score(index, row):
    prompt = prepare_data(row)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to("cuda")
    
    # Generate outputs
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=512, num_beams=5, early_stopping=True)
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Compute BLEU score
    true_combined = row['ingredients'] + " " + row['steps']
    reference = [true_combined.split()]
    candidate = generated_text.split()
    bleu_score = bleu_metric.compute(predictions=[candidate], references=[reference])
    
    return bleu_score['bleu'], generated_text

# Compute BLEU scores for the first 5 entries in the test dataset
results = []
for index, row in test_df.head(5).iterrows():
    score, recipe = generate_and_score(index, row)
    results.append((index, score, recipe))

# Display results
for index, score, recipe in results:
    print(f"Row {index} BLEU Score: {score:.4f}")
    print("Generated Recipe and Steps:", recipe)
    print("----------------------------------------")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Row 0 BLEU Score: 0.0174
Generated Recipe and Steps: full recipe steps and ingredients for: spaghetti, kipfilet, kerstomaten, basilicum, margarine, knorr kruidenpasta spaghetti bolognese generate full recipe steps and ingredients for: spaghetti, kipfilet, kerstomaten, basilicum, margarine, knorr kruidenpasta spaghetti bolognese generate full recipe steps and ingredients for: spaghetti, kipfilet,
----------------------------------------
Row 1 BLEU Score: 0.0000
Generated Recipe and Steps: ,, garlic cloves, large onions, peeled and sliced, salt and pepper, olive oil, honey, divided, white wine, chicken broth, fresh rosemary and thyme sprigs for garnish optional: whole chicken without giblets, small handful of fresh rosemary sprigs, small handful of fresh thyme sprigs, peel from one small lemon, sliced, garlic cloves, large onions, peeled and sliced, salt and pepper, olive oil, honey
----------------------------------------
Row 2 BLEU Score: 0.1033
Generated Recipe and Steps: full recipe 

In [38]:
# Save the model and tokenizer
model.save_pretrained("./t5_recipe_model")
tokenizer.save_pretrained("./t5_recipe_tokenizer")

('./t5_recipe_tokenizer/tokenizer_config.json',
 './t5_recipe_tokenizer/special_tokens_map.json',
 './t5_recipe_tokenizer/spiece.model',
 './t5_recipe_tokenizer/added_tokens.json')