**Import Libraries**

In [None]:
from google.colab import files
from google.colab import drive
import json
import pandas as pd

**Download the dataset from the kaggle**

In [None]:
# ! pip install -q kaggle
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
#!/bin/bash
!kaggle datasets download -d paultimothymooney/recipenlg
!unzip recipenlg.zip

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/paultimothymooney/recipenlg
License(s): other
Downloading recipenlg.zip to /content
 97% 625M/643M [00:08<00:00, 74.9MB/s]
100% 643M/643M [00:08<00:00, 80.1MB/s]
Archive:  recipenlg.zip
  inflating: RecipeNLG_code/README.md  
  inflating: RecipeNLG_code/eval/evaluation.ipynb  
  inflating: RecipeNLG_code/generation/README.md  
  inflating: RecipeNLG_code/generation/SAMPLE.md  
  inflating: RecipeNLG_code/generation/dataset2text.ipynb  
  inflating: RecipeNLG_code/generation/preparation.py  
  inflating: RecipeNLG_code/generation/run_generation.py  
  inflating: RecipeNLG_code/generation/run_lm_finetuning.py  
  inflating: RecipeNLG_code/generation/run_lm_finetuning_new.py  
  inflating: RecipeNLG_code/generation/run_lm_finetuning_tpu.py  
  inflating: RecipeNLG_code/generation/tokenization.py  
  inflating: RecipeNLG_code/ner/Language2_0.ipynb  
  inflating: RecipeNLG_code/ner/NER.ipynb  
  inflating: Recipe

**Read the dataset**

In [None]:
dataset = pd.read_csv('RecipeNLG_dataset.csv')

In [None]:
len(dataset)

2231142

**Since the dataset is huge we have taken only 25000 samples so it will be easy to train.**

In [None]:
dataset = dataset[:25000]

In [None]:
len(dataset)

25000

In [None]:
dataset.columns

Index(['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source',
       'NER'],
      dtype='object')

**Removing Unnecessary columns from the dataset**

In [None]:
dataset = dataset.drop(["Unnamed: 0", "link", "source"], axis=1)

In [None]:
dataset.isnull().any()

Unnamed: 0,0
title,False
ingredients,False
directions,False
NER,False


**Remove the data with same recipe title**

In [None]:
duplicate_titles = dataset[dataset['title'].duplicated()]
if not duplicate_titles.empty:
    print(f"Found {len(duplicate_titles)} duplicate titles:")
    print(duplicate_titles)
else:
    print("All titles are unique!")

Found 9130 duplicate titles:
                            title  \
113                   Cheese Ball   
122                   Cheese Ball   
131             Chicken Casserole   
132                 Artichoke Dip   
154                      Taco Dip   
...                           ...   
24990  Linda'S Broccoli Casserole   
24992                Turkey Salad   
24993                   Date Loaf   
24995                 Oatmeal Pie   
24996    Sweet And Sour Meatballs   

                                             ingredients  \
113    ["2 (8 oz.) cream cheese", "2/3 can chopped bl...   
122    ["2 (8 oz.) cream cheese", "4 c. shredded shar...   
131    ["1/2 c. raw rice", "1 can French onion soup",...   
132    ["1 c. mayonnaise", "1 c. grated Parmesan or R...   
154    ["1 small can refried beans", "1 small carton ...   
...                                                  ...   
24990  ["1 stick margarine", "1 c. chopped onion", "1...   
24992  ["4 lb. (4 to 5 c.) cooked, cubed turke

In [None]:
dataset = dataset.drop_duplicates(subset='title', keep='first')

print(f"Dataset now has {len(dataset)} rows.")


Dataset now has 15870 rows.


In [None]:
dataset.to_csv('dataset.csv', index=False)

**Partition the dataset on training, validation and test dataset**

In [None]:
from sklearn.model_selection import train_test_split

train_dataset, temp_data = train_test_split(dataset, test_size=0.2, random_state=42)
val_dataset, test_dataset = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Training Set Size: {len(train_dataset)}")
print(f"Validation Set Size: {len(val_dataset)}")
print(f"Test Set Size: {len(test_dataset)}")


Training Set Size: 12696
Validation Set Size: 1587
Test Set Size: 1587


**Text Tokenization and Preprocessing with BART Tokenizer**

In [None]:
from transformers import BartTokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [None]:
def tokenizer_function(data):
    inputs = tokenizer(data['ingredients'], padding='max_length', truncation=True, return_tensors="pt", max_length=512)
    targets = tokenizer(data['title'], padding='max_length', truncation=True, return_tensors="pt", max_length=512)

    return {
        'input_ids': inputs['input_ids'].squeeze(),
        'attention_mask': inputs['attention_mask'].squeeze(),
        'labels': targets['input_ids'].squeeze()
    }

In [None]:
train_tokenized = train_dataset.apply(tokenizer_function, axis=1)
val_tokenized = val_dataset.apply(tokenizer_function, axis=1)
test_tokenized = test_dataset.apply(tokenizer_function, axis=1)

print("Sample tokenized train data:")
print(train_tokenized.head())

print("Sample tokenized validation data:")
print(val_tokenized.head())

print("Sample tokenized test data:")
print(test_tokenized.head())


Sample tokenized train data:
1041    {'input_ids': [tensor(0), tensor(49329), tenso...
2602    {'input_ids': [tensor(0), tensor(49329), tenso...
7814    {'input_ids': [tensor(0), tensor(49329), tenso...
5886    {'input_ids': [tensor(0), tensor(49329), tenso...
2808    {'input_ids': [tensor(0), tensor(49329), tenso...
dtype: object
Sample tokenized validation data:
18904    {'input_ids': [tensor(0), tensor(49329), tenso...
8245     {'input_ids': [tensor(0), tensor(49329), tenso...
13330    {'input_ids': [tensor(0), tensor(49329), tenso...
2952     {'input_ids': [tensor(0), tensor(49329), tenso...
19897    {'input_ids': [tensor(0), tensor(49329), tenso...
dtype: object
Sample tokenized test data:
16209    {'input_ids': [tensor(0), tensor(49329), tenso...
21738    {'input_ids': [tensor(0), tensor(49329), tenso...
22504    {'input_ids': [tensor(0), tensor(49329), tenso...
20519    {'input_ids': [tensor(0), tensor(49329), tenso...
15819    {'input_ids': [tensor(0), tensor(49329), tenso...
d

**Checking if the original recipe ingredients and title matches the tokenized text**

In [None]:
original_sample = train_dataset.iloc[1]

original_ingredients = original_sample['ingredients']
original_title = original_sample['title']

tokenized_sample = tokenizer_function(original_sample)

decoded_ingredients = tokenizer.decode(tokenized_sample['input_ids'], skip_special_tokens=True)
decoded_title = tokenizer.decode(tokenized_sample['labels'], skip_special_tokens=True)

print(f"Original Title: {original_title}")
print(f"Decoded Title: {decoded_title}")
print(f"Original Ingredients: {original_ingredients}")
print(f"Decoded Ingredients: {decoded_ingredients}")
print("="*50)

title_match = original_title == decoded_title
ingredients_match = original_ingredients == decoded_ingredients

print(f"Title Match: {title_match}")
print(f"Ingredients Match: {ingredients_match}")


Original Title: Goose Or Duck And Sweet Dressing
Decoded Title: Goose Or Duck And Sweet Dressing
Original Ingredients: ["3 lb. dressed duck", "1 1/2 c. apples, diced", "1/3 c. warm water", "salt to taste", "1 c. prunes, soaked but not cooked", "3 Tbsp. butter or fat", "3/4 c. tart pie cherries", "3 c. bread cubes", "juice of 1 orange"]
Decoded Ingredients: ["3 lb. dressed duck", "1 1/2 c. apples, diced", "1/3 c. warm water", "salt to taste", "1 c. prunes, soaked but not cooked", "3 Tbsp. butter or fat", "3/4 c. tart pie cherries", "3 c. bread cubes", "juice of 1 orange"]
Title Match: True
Ingredients Match: True


In [None]:
import torch
from torch.utils.data import Dataset

class RecipeDataset(Dataset):
    def __init__(self, tokenized_data):
        self.tokenized_data = tokenized_data

    def __len__(self):
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        item = self.tokenized_data.iloc[idx]
        return {
            'input_ids': torch.tensor(item['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(item['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(item['labels'], dtype=torch.long)
        }

# Create datasets for training, validation, and test
train_dataset_torch = RecipeDataset(train_tokenized)
val_dataset_torch = RecipeDataset(val_tokenized)
test_dataset_torch = RecipeDataset(test_tokenized)


**Create dataloaders to train the model**

In [None]:
from torch.utils.data import DataLoader

# Initialize DataLoader for training, validation, and testing
train_dataloader = DataLoader(train_dataset_torch, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset_torch, batch_size=16)
test_dataloader = DataLoader(test_dataset_torch, batch_size=16)


### Model Training

We load the pre-trained BART model for conditional generation using the BartForConditionalGeneration class. The model is then moved to the appropriate device (GPU or CPU). We use the AdamW optimizer with a learning rate of 5e-5 to fine-tune the model during training.



In [None]:
from transformers import BartForConditionalGeneration, AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)


model.safetensors:  60%|######    | 336M/558M [00:00<?, ?B/s]



Train the model on 3 epochs

In [None]:
from tqdm import tqdm

# Function to train and validate in a single loop
def train_and_validate(model, train_dataloader, val_dataloader, optimizer, device, num_epochs):
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Training phase
        model.train()
        total_train_loss = 0
        for batch in tqdm(train_dataloader, desc="Training"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_train_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f"Training Loss: {avg_train_loss}")

        # Validation phase
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc="Validating"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_dataloader)
        print(f"Validation Loss: {avg_val_loss}")
        print("=" * 50)

num_epochs = 3
train_and_validate(model, train_dataloader, val_dataloader, optimizer, device, num_epochs)

Epoch 1/3


  'input_ids': torch.tensor(item['input_ids'], dtype=torch.long),
  'attention_mask': torch.tensor(item['attention_mask'], dtype=torch.long),
  'labels': torch.tensor(item['labels'], dtype=torch.long)
Training: 100%|██████████| 794/794 [32:51<00:00,  2.48s/it]


Training Loss: 0.4699180361296353


Validating: 100%|██████████| 100/100 [01:17<00:00,  1.29it/s]


Validation Loss: 0.03586749782785773
Epoch 2/3


Training: 100%|██████████| 794/794 [33:05<00:00,  2.50s/it]


Training Loss: 0.03861607745834052


Validating: 100%|██████████| 100/100 [01:17<00:00,  1.29it/s]


Validation Loss: 0.03222719900310039
Epoch 3/3


Training: 100%|██████████| 794/794 [33:05<00:00,  2.50s/it]


Training Loss: 0.03367285645081279


Validating: 100%|██████████| 100/100 [01:17<00:00,  1.29it/s]

Validation Loss: 0.0309591961838305





In [None]:
from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/drive/My Drive/nlp/final_project_nlp_model'
model.save_pretrained(file_path)
tokenizer.save_pretrained(file_path)

Mounted at /content/drive


### Model Evaluation

**Load the model from google drive to evaluate it.**

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
import torch

file_path = '/content/drive/My Drive/nlp/final_project_nlp_model'

model = BartForConditionalGeneration.from_pretrained(file_path)
tokenizer = BartTokenizer.from_pretrained(file_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


Check the predicted recipe title based on ingredients from the test dataset

In [None]:
sample_count = 0

for batch in test_dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)


    generated_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=50,
        num_beams=5,
        early_stopping=True
    )

    input_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
    generated_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]

    for input_text, generated_text in zip(input_texts, generated_texts):
        print(f"Sample {sample_count + 1} - Input: {input_text}")
        print(f"Sample {sample_count + 1} - Generated Recipe: {generated_text}")
        print("-" * 80)

        sample_count += 1

        if sample_count == 10:
            break
    if sample_count == 10:
        break


  'input_ids': torch.tensor(item['input_ids'], dtype=torch.long),
  'attention_mask': torch.tensor(item['attention_mask'], dtype=torch.long),
  'labels': torch.tensor(item['labels'], dtype=torch.long)


Sample 1 - Input: ["1 c. finely ground walnuts", "1/2 c. finely ground sunflower seeds", "1/2 c. finely ground cashews", "1/2 c. raw wheat germ", "1 large onion", "1 c. cooked brown rice"]
Sample 1 - Generated Recipe: Brown Rice And Rice
--------------------------------------------------------------------------------
Sample 2 - Input: ["1 (9-inch) pie crust, baked and cooled", "1 c. sugar", "3 Tbsp. cornstarch", "1/2 c. butter or oleo", "1 Tbsp. grated lemon rind", "1/4 c. lemon juice", "3 unbeaten egg yolks", "1 c. milk", "1 c. sour cream", "whipped dessert topping", "chopped walnuts"]
Sample 2 - Generated Recipe: Lemon Pie
--------------------------------------------------------------------------------
Sample 3 - Input: ["1/4 c. lemon juice", "2 tsp. cornstarch", "1/4 tsp. salt", "1/2 lb. each: scallops and large raw shrimp, peeled and deveined", "1/4 c. margarine", "8 oz. fresh mushrooms, sliced", "1 c. thinly sliced carrots", "3 cloves garlic, finely chopped", "1/2 tsp. thyme leave

Calculating f1 score and exact match

In [None]:
from sklearn.metrics import f1_score
from tqdm import tqdm

def evaluate_model(model, test_dataloader, tokenizer, device):
    model.eval()  # Set model to evaluation mode
    all_predictions = []
    all_references = []

    with torch.no_grad():  # Disable gradient computation
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            # Move data to the selected device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Generate predictions
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=512,
                num_beams=5,
                early_stopping=True
            )

            # Decode predictions and references
            predictions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]
            references = [tokenizer.decode(ids, skip_special_tokens=True) for ids in labels]

            # Store predictions and references
            all_predictions.extend(predictions)
            all_references.extend(references)

    return all_predictions, all_references


In [None]:
all_predictions, all_references = evaluate_model(model, test_dataloader, tokenizer, device)

# Compute metrics
exact_match_count = sum([1 if pred == ref else 0 for pred, ref in zip(all_predictions, all_references)])
exact_match_score = exact_match_count / len(all_predictions) * 100
print(f"Exact Match Score: {exact_match_score:.2f}%")

# Tokenized F1 Score
tokenized_predictions = [pred.split() for pred in all_predictions]
tokenized_references = [ref.split() for ref in all_references]
flattened_predictions = [token for sublist in tokenized_predictions for token in sublist]
flattened_references = [token for sublist in tokenized_references for token in sublist]
f1 = f1_score(flattened_references, flattened_predictions, average='macro')
print(f"F1 Score: {f1:.2f}")


  'input_ids': torch.tensor(item['input_ids'], dtype=torch.long),
  'attention_mask': torch.tensor(item['attention_mask'], dtype=torch.long),
  'labels': torch.tensor(item['labels'], dtype=torch.long)
Evaluating: 100%|██████████| 100/100 [01:30<00:00,  1.10it/s]

Exact Match Score: 64.20%
F1 Score: 66.33





The model is performing reasonably well with an Exact Match Score of 64.20%, indicating that more than half of the predictions are exactly correct. The F1 Score of 66.33 suggests that the model has a decent balance between precision and recall, but there is still room for improvement.

### Telegram Bot

In [None]:
!pip install python-telegram-bot transformers torch




In [None]:
import nest_asyncio
from telegram import Update
from telegram.ext import Application, CommandHandler, MessageHandler, filters
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
from google.colab import drive

nest_asyncio.apply()

drive.mount('/content/drive')
file_path = '/content/drive/My Drive/nlp/final_project_nlp_model'

model = BartForConditionalGeneration.from_pretrained(file_path)
tokenizer = BartTokenizer.from_pretrained(file_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

TOKEN = "7615250432:AAGiQ6ruWbEUvFtdhAyZU_Xzt0spYQkBYu8"

application = Application.builder().token(TOKEN).build()

async def start(update: Update, context):
    await update.message.reply_text("Hello! I'm your recipe bot. Send me a list of ingredients and I'll give you a recipe title.")

def generate_recipe(ingredients_input):
    """Generate a recipe using the model."""
    inputs = tokenizer(ingredients_input, return_tensors="pt").to(device)
    generated_ids = model.generate(inputs['input_ids'], max_length=50, num_beams=5, early_stopping=True)
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

async def handle_message(update: Update, context):
    ingredients_input = update.message.text
    recipe_title = generate_recipe(ingredients_input)
    await update.message.reply_text(f"Recipe for your ingredients ({ingredients_input})\n Recipe Title For Given Ingredients: {recipe_title}")

application.add_handler(CommandHandler('start', start))
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))

application.run_polling()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
