# Preparing the dataset

In [3]:
import py7zr
import pandas as pd
import os


# Function to categorize diets based on ingredients
def categorize_diet(ingredients):
    ingredients = ingredients.lower()
    if any(word in ingredients for word in ["meat", "chicken", "beef", "pork"]):
        return 'non-vegetarian'
    if "fish" in ingredients or "seafood" in ingredients:
        return 'pescatarian'
    if "cheese" in ingredients or "egg" in ingredients or "dairy" in ingredients:
        return 'vegetarian'
    if all(word not in ingredients for word in ["meat", "egg", "cheese", "dairy"]):
        return 'vegan'
    return 'unknown'

# Function to categorize allergies based on ingredients
def check_allergies(ingredients):
    allergies = []
    if any(word in ingredients for word in ["nut", "almond", "peanut"]):
        allergies.append("nut-free")
    if "dairy" in ingredients or "milk" in ingredients:
        allergies.append("dairy-free")
    if "gluten" in ingredients or "wheat" in ingredients:
        allergies.append("gluten-free")
    return allergies

# Load and clean the data
def load_and_clean_data(file_path):
    with py7zr.SevenZipFile(file_path, mode='r') as archive:
        file_names = archive.getnames()
        csv_file_name = file_names[0]
        csv_content = archive.read([csv_file_name])[csv_file_name]

        # Directly use csv_content, which is a BytesIO object
        df = pd.read_csv(csv_content)
    
    # Categorize diet and check for allergies
    df['diet'] = df['ingredients'].apply(categorize_diet)
    df['allergies'] = df['ingredients'].apply(check_allergies)

    # Taste preferences (basic rule-based assignment)
    def assign_taste(ingredients):
        ingredients = ingredients.lower()
        if any(word in ingredients for word in ["chili"]):
            return 'spicy'
        if any(word in ingredients for word in ["sugar", "honey", "sweet"]):
            return 'sweet'
        if any(word in ingredients for word in ["lemon", "vinegar", "sour"]):
            return 'sour'
        return 'savory'
    
    # Add taste column
    df['taste'] = df['ingredients'].apply(assign_taste)

    return df

# Absolute path to the .7z file in the 'dataset' directory
seven_z_file = r'c:\Users\bpretet\Documents\cookBot\dataset\RecipeNLG_dataset.7z'

# Check if the file exists
if not os.path.exists(seven_z_file):
    print(f"File not found: {seven_z_file}")
else:
    print(f"Found the file at: {seven_z_file}")
    
    # Load and clean the dataset
    cleaned_data = load_and_clean_data(seven_z_file)
    
    # Display the first few rows of the cleaned data

    cleaned_data = cleaned_data.drop(cleaned_data.columns[0], axis=1)



Found the file at: c:\Users\bpretet\Documents\cookBot\dataset\RecipeNLG_dataset.7z


In [8]:
columns_to_drop = ['ingredients', 'directions', 'link', 'source']
data_for_model = cleaned_data.drop(columns=columns_to_drop, errors='ignore')

Unnamed: 0,title,ingredients,directions,link,source,NER,diet,allergies,taste
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...",vegan,"[nut-free, dairy-free]",sweet
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom...",non-vegetarian,[],sour
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar...",vegetarian,[],savory
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo...",non-vegetarian,[],savory
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu...",vegan,[nut-free],sweet
...,...,...,...,...,...,...,...,...,...
2231137,Sunny's Fake Crepes,"[""1/2 cup chocolate hazelnut spread (recommend...","[""Spread hazelnut spread on 1 side of each tor...",www.foodnetwork.com/recipes/sunny-anderson/sun...,Recipes1M,"[""chocolate hazelnut spread"", ""tortillas"", ""bu...",vegan,"[nut-free, gluten-free]",savory
2231138,Devil Eggs,"[""1 dozen eggs"", ""1 paprika"", ""1 salt and pepp...","[""Boil eggs on medium for 30mins."", ""Then cool...",cookpad.com/us/recipes/355411-devil-eggs,Recipes1M,"[""eggs"", ""paprika"", ""salt"", ""choice"", ""miracle...",vegetarian,[],savory
2231139,Extremely Easy and Quick - Namul Daikon Salad,"[""150 grams Daikon radish"", ""1 tbsp Sesame oil...","[""Julienne the daikon and squeeze out the exce...",cookpad.com/us/recipes/153324-extremely-easy-a...,Recipes1M,"[""radish"", ""Sesame oil"", ""White sesame seeds"",...",vegan,[],savory
2231140,Pan-Roasted Pork Chops With Apple Fritters,"[""1 cup apple cider"", ""6 tablespoons sugar"", ""...","[""In a large bowl, mix the apple cider with 4 ...",cooking.nytimes.com/recipes/1015164,Recipes1M,"[""apple cider"", ""sugar"", ""kosher salt"", ""bay l...",non-vegetarian,[],sweet


# Analysing the Dataset

In [10]:
from collections import Counter
import ast
import difflib  # For comparing word similarity

# Step 1: Extract the 'NER' column from the dataset
ner_column = data_for_model['NER']

# Step 2: Tokenize and count word occurrences
word_list = []

# Convert string representation of lists into actual lists and extend word_list
for entry in ner_column:
    words = ast.literal_eval(entry)
    word_list.extend(words)

# Step 3: Count frequency of each word
word_counts = Counter(word_list)

# Step 4: Get the number of unique words
unique_words = list(word_counts.keys())
num_unique_words = len(unique_words)
print(f"Number of different words in the 'NER' column: {num_unique_words}")

# Step 5: Print the 30 most frequent words
top_30_words = word_counts.most_common(30)
print("\nTop 30 most used words:")
for word, count in top_30_words:
    print(f"{word}: {count}")


Number of different words in the 'NER' column: 234059

Top 30 most used words:
salt: 890741
sugar: 620027
butter: 493823
flour: 466110
eggs: 401276
onion: 372469
garlic: 358364
milk: 346769
water: 326092
vanilla: 270381
olive oil: 197877
pepper: 179305
brown sugar: 174447
tomatoes: 163933
egg: 160507
baking powder: 148277
lemon juice: 146414
Salt: 122558
cinnamon: 117927
sour cream: 116682
cream cheese: 114423
margarine: 112742
celery: 112676
baking soda: 110690
parsley: 102151
chicken: 101505
onions: 98903
vegetable oil: 91395
oil: 85600
mayonnaise: 84822

Words that are at least 80% similar:
'brown sugar' and 'brown Sugar' are at least 80% similar.
'brown sugar' and 'Brown Sugar' are at least 80% similar.
'brown sugar' and 'bowl sugar' are at least 80% similar.
'brown sugar' and 'dark brown sugar' are at least 80% similar.
'brown sugar' and 'Brown sugar' are at least 80% similar.
'brown sugar' and 'low sugar' are at least 80% similar.
'brown sugar' and 'burnt sugar' are at least 80% 

KeyboardInterrupt: 

In [7]:
import matplotlib.pyplot as plt
from collections import Counter
import ast

ner_column = cleaned_data['NER']

# Step 3: Tokenize and count word occurrences
word_list = []

# Assuming the entries in 'NER' column are lists in string format
for entry in ner_column:
    words = ast.literal_eval(entry)  # Convert string representation of list to actual list
    word_list.extend(words)

# Step 4: Count frequency of each word
word_counts = Counter(word_list)

# Step 5: Plot the word frequency
words, counts = zip(*word_counts.most_common())

plt.figure(figsize=(10, 6))
plt.bar(words, counts, color='skyblue')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Word Frequency in NER Column')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Display the plot
plt.show()