In [None]:
# Cell 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Cell 2: Install required libraries (Use pyttsx3 as an alternative for gTTS)
!pip install pyttsx3 flask




In [None]:
# Cell 3: Import required libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import pandas as pd
import numpy as np
import re
import os
import pyttsx3
import flask
from flask import Flask, request, jsonify


In [None]:
# Cell 4: Load and preprocess the dataset, including splitting into train, test, and validation sets
def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path)
    data['Ingredients_List'] = data['Ingredients_List'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))
    data['Preparation_Steps'] = data['Preparation_Steps'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))
    data['combined'] = data['Ingredients_List'] + " " + data['Preparation_Steps']

    # Split the dataset into train, test, and validation sets (70%, 15%, 15%)
    train_data = data.sample(frac=0.7, random_state=42)
    temp_data = data.drop(train_data.index)
    test_data = temp_data.sample(frac=0.5, random_state=42)
    val_data = temp_data.drop(test_data.index)

    return train_data, test_data, val_data, data

file_path = "/content/drive/MyDrive/recipe_data.csv"
train_data, test_data, val_data, data = load_and_preprocess_data(file_path)


In [None]:
# Cell 5: Tokenize and prepare sequences
def prepare_sequences(data, train_data, test_data, val_data):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data['combined'])
    total_words = len(tokenizer.word_index) + 1
    def create_sequences(data):
        input_sequences = []
        for recipe in data['combined']:
            token_list = tokenizer.texts_to_sequences([recipe])[0]
            for i in range(1, len(token_list)):
                n_gram_sequence = token_list[:i+1]
                input_sequences.append(n_gram_sequence)
        return input_sequences

    train_sequences = create_sequences(train_data)
    test_sequences = create_sequences(test_data)
    val_sequences = create_sequences(val_data)
    max_sequence_len = max([len(x) for x in train_sequences + test_sequences + val_sequences])
    def pad_and_split(sequences):
        sequences = np.array(pad_sequences(sequences, maxlen=max_sequence_len, padding='pre'))
        X = sequences[:, :-1]
        y = sequences[:, -1]
        y = tf.keras.utils.to_categorical(y, num_classes=total_words)
        return X, y

    X_train, y_train = pad_and_split(train_sequences)
    X_test, y_test = pad_and_split(test_sequences)
    X_val, y_val = pad_and_split(val_sequences)

    return X_train, y_train, X_test, y_test, X_val, y_val, tokenizer, max_sequence_len, total_words

X_train, y_train, X_test, y_test, X_val, y_val, tokenizer, max_sequence_len, total_words = prepare_sequences(data, train_data, test_data, val_data)


In [None]:
# Cell 6: Build and train the model with measures to prevent overfitting
from tensorflow.keras.regularizers import l1_l2

def build_and_train_model(X_train, y_train, X_val, y_val, max_sequence_len, total_words):
    model = Sequential([
        Embedding(total_words, 128, input_length=max_sequence_len-1),
        BatchNormalization(),
        LSTM(256, return_sequences=True, kernel_regularizer=l1_l2(l1=0.01, l2=0.01)),
        Dropout(0.5),
        LSTM(128, kernel_regularizer=l1_l2(l1=0.01, l2=0.01)),
        Dense(128, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01)),
        Dropout(0.5),
        Dense(total_words, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
        ModelCheckpoint("/content/drive/MyDrive/best_model.h5", monitor='val_loss', save_best_only=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3)
    ]
    history = model.fit(X_train, y_train, epochs=7, batch_size=64, validation_data=(X_val, y_val), callbacks=callbacks, verbose=1)
    return model

model = build_and_train_model(X_train, y_train, X_val, y_val, max_sequence_len, total_words)


Epoch 1/7




[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 0.2342 - loss: 24.7717



[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 145ms/step - accuracy: 0.2343 - loss: 24.7486 - val_accuracy: 0.6494 - val_loss: 2.3539 - learning_rate: 0.0010
Epoch 2/7
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - accuracy: 0.5655 - loss: 2.3643



[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 140ms/step - accuracy: 0.5656 - loss: 2.3641 - val_accuracy: 0.7434 - val_loss: 1.7420 - learning_rate: 0.0010
Epoch 3/7
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step - accuracy: 0.7101 - loss: 1.9309



[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 141ms/step - accuracy: 0.7102 - loss: 1.9308 - val_accuracy: 0.8874 - val_loss: 1.3852 - learning_rate: 0.0010
Epoch 4/7
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step - accuracy: 0.8167 - loss: 1.5694



[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 147ms/step - accuracy: 0.8167 - loss: 1.5693 - val_accuracy: 0.9362 - val_loss: 1.2152 - learning_rate: 0.0010
Epoch 5/7
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - accuracy: 0.8834 - loss: 1.4063



[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 148ms/step - accuracy: 0.8834 - loss: 1.4062 - val_accuracy: 0.9846 - val_loss: 1.0579 - learning_rate: 0.0010
Epoch 6/7
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - accuracy: 0.9172 - loss: 1.2183



[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 144ms/step - accuracy: 0.9172 - loss: 1.2183 - val_accuracy: 1.0000 - val_loss: 0.9745 - learning_rate: 0.0010
Epoch 7/7
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step - accuracy: 0.9317 - loss: 1.1342



[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 142ms/step - accuracy: 0.9317 - loss: 1.1342 - val_accuracy: 0.9839 - val_loss: 0.8919 - learning_rate: 0.0010


In [None]:
# Cell 7: Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
model.save("/content/drive/MyDrive/final_recipe_generator.h5")
print("Model saved as final_recipe_generator.h5")


[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 32ms/step - accuracy: 0.9825 - loss: 0.8907




Test Accuracy: 98.31%
Model saved as final_recipe_generator.h5


In [None]:
# Cell 8: Enhanced Generate recipes function with temperature-based sampling and correct parameters
def generate_recipe(prompt, model, tokenizer, max_length=100, temperature=0.7, top_k=50, top_p=0.95):
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    attention_mask = torch.ones(inputs.shape, dtype=torch.long)  # Set attention mask
    outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1, temperature=temperature, top_k=top_k, top_p=top_p, pad_token_id=tokenizer.eos_token_id, attention_mask=attention_mask)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
# Cell 9: Skipping Voice assistant function using pyttsx3
# def speak(text):
#     engine = pyttsx3.init()
#     engine.say(text)
#     engine.runAndWait()


In [None]:
!pip install transformers datasets


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments


In [None]:
# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Tokenize the dataset
def load_dataset(file_path, tokenizer):
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=128
    )
    return dataset


In [None]:
# Prepare the dataset and data collator
file_path = "/content/drive/MyDrive/recipe_data.csv"  # Update the path as needed
train_dataset = load_dataset(file_path, tokenizer)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)




In [18]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install transformers torch pandas


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [40]:
# List files in the specified directory to verify paths
import os

directory = '/content/drive/MyDrive/'
files = os.listdir(directory)
print(files)

# Load the recipe datasets with corrected paths and specified encoding
recipe_nlg_path = '/content/drive/MyDrive/RAW_recipes.csv'
all_recipes_path = '/content/drive/MyDrive/allrecipes.csv'
epicurious_recipes_path = '/content/drive/MyDrive/PP_recipes.csv'

recipe_nlg = pd.read_csv(recipe_nlg_path, encoding='ISO-8859-1')
all_recipes = pd.read_csv(all_recipes_path, encoding='ISO-8859-1')
epicurious_recipes = pd.read_csv(epicurious_recipes_path, encoding='ISO-8859-1')

# Inspect columns
print("Columns in recipe_nlg:")
print(recipe_nlg.columns)
print("Columns in all_recipes:")
print(all_recipes.columns)
print("Columns in epicurious_recipes:")
print(epicurious_recipes.columns)


['IMG_20220309_094140_copy_808x960_1.jpg', 'WhatsApp Image 2024-07-19 at 10.17.39 PM.jpeg', 'IMG-20240924-WA0005.jpg', 'RAW_recipes.csv', 'archive', 'Colab Notebooks', 'best_model.h5', 'fine_tuned_gpt2', 'allrecipes (1).csv', 'allrecipes.csv', 'PP_recipes.csv', 'recipe_data.csv', 'Copy of best_model.h5', 'model.safetensors']
Columns in recipe_nlg:
Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')
Columns in all_recipes:
Index(['group', 'name', 'rating', 'n_rater', 'n_reviewer', 'summary',
       'process', 'ingredient'],
      dtype='object')
Columns in epicurious_recipes:
Index(['id', 'i', 'name_tokens', 'ingredient_tokens', 'steps_tokens',
       'techniques', 'calorie_level', 'ingredient_ids'],
      dtype='object')


In [44]:
# Combine the datasets into a unified DataFrame
# Ensure the columns are renamed or adjusted for consistency
recipe_nlg.rename(columns={"name": "title"}, inplace=True)
all_recipes.rename(columns={"name": "title", "steps": "instructions"}, inplace=True)
epicurious_recipes.rename(columns={"name": "title", "steps": "instructions"}, inplace=True)

# Ensure columns exist before concatenating
for col in ['title', 'ingredients', 'instructions']:
    if col not in recipe_nlg.columns:
        recipe_nlg[col] = None
    if col not in all_recipes.columns:
        all_recipes[col] = None
    if col not in epicurious_recipes.columns:
        epicurious_recipes[col] = None

combined_recipes = pd.concat([recipe_nlg, all_recipes, epicurious_recipes], ignore_index=True)
combined_recipes.dropna(subset=['title', 'ingredients', 'instructions'], inplace=True)


In [45]:
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2TokenizerFast

# Define model name or path
model_name = "gpt2"  # Default model
model_path = "/content/drive/MyDrive/fine_tuned_gpt2"  # Update if necessary

# Check if model directory exists and contains necessary files
model_files = ["pytorch_model.bin", "model.safetensors", "tf_model.h5", "model.ckpt.index", "flax_model.msgpack"]
model_exists = os.path.exists(model_path) and any(os.path.exists(os.path.join(model_path, f)) for f in model_files)

# Load model and tokenizer
if model_exists:
    print("Loading fine-tuned model...")
    model = GPT2LMHeadModel.from_pretrained(model_path)
    # Use GPT2TokenizerFast for better performance if available
    try:
        tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
    except:
        tokenizer = GPT2Tokenizer.from_pretrained(model_path)
else:
    print(f"Warning: Model not found at '{model_path}'. Using default GPT-2 model.")
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name) # Use GPT2Tokenizer if Fast isn't working



In [46]:
# Enhanced Generate recipes function using GPT-2's generate method with refined prompts
def generate_recipe(prompt, model, tokenizer, max_length=200, temperature=0.7, top_k=50, top_p=0.95):
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    attention_mask = torch.ones(inputs.shape, dtype=torch.long)  # Set attention mask
    outputs = model.generate(
        inputs,
        max_length=max_length,
        max_new_tokens=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
        attention_mask=attention_mask,
        do_sample=True  # Enable sampling
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extracting ingredients and instructions from combined dataset with fallback mechanism
def get_recipe_from_dataset(dish_name):
    try:
        recipe = combined_recipes[combined_recipes['title'].str.contains(dish_name, case=False, na=False)].sample(n=1)
    except ValueError:
        recipe = combined_recipes.sample(n=1)  # Fallback to a random recipe if no match is found
    ingredients = recipe['ingredients'].values[0] if 'ingredients' in recipe.columns else ["No ingredients available"]
    instructions = recipe['instructions'].values[0] if 'instructions' in recipe.columns else ["No instructions available"]
    return ingredients, instructions

# Refined prompt for recipe generation with dynamic ingredients
def get_refined_prompt(dish_name):
    ingredients, instructions = get_recipe_from_dataset(dish_name)

    prompt = f"Recipe for {dish_name}:\n\nIngredients:\n" + "\n".join(ingredients) + "\n\nInstructions:\n" + "\n".join(instructions)
    return prompt


In [None]:
# Improved Chatbot Function with better recipe handling and input matching
def chatbot(model, tokenizer):
    print("Welcome to Flavour Fusion! How can I help you today? Here are some recipes you might like:")
    available_recipes = combined_recipes['Recipe_Name'].dropna().unique().tolist()
    print(", ".join(available_recipes[:10]))  # Show the first 10 recipes

    while True:
        user_input = input("You: ").lower()  # Convert input to lower case for consistent comparison
        if "list recipes" in user_input:
            print("Here are some recipes you might like:")
            print(", ".join(available_recipes[:10]))  # Show the first 10 recipes
        else:
            # Find the best match for the user's input in the available recipes
            best_match = None
            for recipe in available_recipes:
                if isinstance(recipe, str) and user_input in recipe.lower():
                    best_match = recipe
                    break

            if best_match:
                refined_prompt = get_refined_prompt(best_match)
                recipe = generate_recipe(refined_prompt, model, tokenizer)
                print(f"Bot: Here's a recipe for {best_match}:\n{recipe}")
            elif "exit" in user_input:
                print("Bot: Goodbye!")
                break
            else:
                print("Bot: I didn't understand that. Can you rephrase? You can also ask to 'list recipes' to see available options.")

chatbot(model, tokenizer)


Welcome to Flavour Fusion! How can I help you today? Here are some recipes you might like:
Beef Tacos, Spaghetti Bolognese, Chicken Curry, Vegetable Stir Fry, Margherita Pizza
