# Preparing the dataset

In [2]:
import py7zr
import pandas as pd
import os


# Function to categorize diets based on ingredients
def categorize_diet(ingredients):
    ingredients = ingredients.lower()
    if any(word in ingredients for word in ["meat", "chicken", "beef", "pork"]):
        return 'non-vegetarian'
    if "fish" in ingredients or "seafood" in ingredients:
        return 'pescatarian'
    if "cheese" in ingredients or "egg" in ingredients or "dairy" in ingredients:
        return 'vegetarian'
    if all(word not in ingredients for word in ["meat", "egg", "cheese", "dairy"]):
        return 'vegan'
    return 'unknown'

# Function to categorize allergies based on ingredients
def check_allergies(ingredients):
    allergies = []
    if any(word in ingredients for word in ["nut", "almond", "peanut"]):
        allergies.append("nut-free")
    if "dairy" in ingredients or "milk" in ingredients:
        allergies.append("dairy-free")
    if "gluten" in ingredients or "wheat" in ingredients:
        allergies.append("gluten-free")
    return allergies

# Load and clean the data
def load_and_clean_data(file_path):
    with py7zr.SevenZipFile(file_path, mode='r') as archive:
        file_names = archive.getnames()
        csv_file_name = file_names[0]
        csv_content = archive.read([csv_file_name])[csv_file_name]

        # Directly use csv_content, which is a BytesIO object
        df = pd.read_csv(csv_content)
    
    # Categorize diet and check for allergies
    df['diet'] = df['ingredients'].apply(categorize_diet)
    df['allergies'] = df['ingredients'].apply(check_allergies)

    # Taste preferences (basic rule-based assignment)
    def assign_taste(ingredients):
        ingredients = ingredients.lower()
        if any(word in ingredients for word in ["chili"]):
            return 'spicy'
        if any(word in ingredients for word in ["sugar", "honey", "sweet"]):
            return 'sweet'
        if any(word in ingredients for word in ["lemon", "vinegar", "sour"]):
            return 'sour'
        return 'savory'
    
    # Add taste column
    df['taste'] = df['ingredients'].apply(assign_taste)

    return df

# Absolute path to the .7z file in the 'dataset' directory
seven_z_file = r'c:\Users\bpretet\Documents\cookBot\dataset\RecipeNLG_dataset.7z'

# Check if the file exists
if not os.path.exists(seven_z_file):
    print(f"File not found: {seven_z_file}")
else:
    print(f"Found the file at: {seven_z_file}")
    
    # Load and clean the dataset
    cleaned_data = load_and_clean_data(seven_z_file)
    
    # Display the first few rows of the cleaned data

    cleaned_data = cleaned_data.drop(cleaned_data.columns[0], axis=1)



Found the file at: c:\Users\bpretet\Documents\cookBot\dataset\RecipeNLG_dataset.7z


# Preprocessing the Dataset

In [3]:
#vectorising the recipes using TF-ID
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Combine ingredients into a single string for vectorization
cleaned_data['ingredients_joined'] = cleaned_data['ingredients'].apply(lambda x: ' '.join(x.lower().split(',')))

# Create the TF-IDF vectorizer and fit it on the ingredients
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cleaned_data['ingredients_joined'])



Ingredient Matching (convert input into vector and match with preprocessed TF-ID)

In [4]:
def get_recipe_recommendations(user_ingredients, top_n=5):
    user_ingredients_str = ' '.join([ing.lower().strip() for ing in user_ingredients])
    user_vec = vectorizer.transform([user_ingredients_str])
    
    # Calculate cosine similarity between user input and recipe ingredients
    cosine_similarities = cosine_similarity(user_vec, tfidf_matrix).flatten()
    
    # Get top N recipe indices
    top_recipes_idx = cosine_similarities.argsort()[-top_n:][::-1]
    return cleaned_data.iloc[top_recipes_idx]


                             title  \
1316616     Tomato Basil Sandwich    
655088                    Eggplant   
1598590  Tomato Pesto Garden Eggs    
1465924    Tomato-Basil Mayonnaise   
1527188          Eggplant Parmesan   

                                               ingredients        diet  \
1316616  ["Cream Cheese", "Sliced Tomato", "Salt, peppe...  vegetarian   
655088   ["eggplant", "flour", "eggs", "grated cheese",...  vegetarian   
1598590  ["eggs", "cottage cheese", "tomato", "basil", ...  vegetarian   
1465924  ["1 1/2 cups mayonnaise", "1/2 cup fresh basil...       vegan   
1527188  ["2 tablespoons olive oil", "1 garlic clove, m...  vegetarian   

        allergies   taste  
1316616        []   sweet  
655088         []  savory  
1598590        []  savory  
1465924        []  savory  
1527188        []  savory  


In [None]:
# Example usage
user_ingredients = ['tomato', 'basil', 'cheese']
user_diet = 'vegetarian'
user_allergies = ['nut-free', 'gluten-free']

recommended_recipes = recommend_recipes(user_ingredients, user_diet=user_diet, user_allergies=user_allergies)
print(recommended_recipes[['title', 'ingredients', 'diet', 'allergies']])
