In [99]:
import ast
import json
import random
import re

import nltk
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

## Pre-process the data

In [77]:
# Load the dataset: the raw recipe table is read from a path that is relative
# to the working directory the notebook is run from.
raw_recipe_path = "../../Dataset/RAW_recipes.csv"
df_raw_recipe = pd.read_csv(raw_recipe_path)

# Drop incomplete rows, keep the recipe columns, and de-duplicate recipes
def preprocess_data(df):
    """Return a cleaned copy of the raw recipe table.

    The input frame is left untouched, so the cell that calls this function
    can be re-run and always starts from the same raw data.

    Steps, in order:
      1. Drop every row that has a missing value in ANY column (the check runs
         before the column selection, so a missing value in a column that is
         discarded afterwards still removes the row).
      2. Keep only the ``name``, ``id`` and ``ingredients`` columns.
      3. Drop recipes that share both ``name`` and ``ingredients`` with an
         earlier row (the first occurrence is kept).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw recipe table; must contain ``name``, ``id`` and ``ingredients``.

    Returns
    -------
    pandas.DataFrame
        New frame with the three retained columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    return (
        df.dropna()  # returns a new frame instead of mutating the caller's
        .loc[:, ["name", "id", "ingredients"]]
        .drop_duplicates(subset=["name", "ingredients"])
    )

# Working frame: rows without missing values, reduced to name / id / ingredients,
# with recipes that repeat the same name and ingredients removed.
df = preprocess_data(df_raw_recipe)

In [78]:
def _parse_ingredient_list(line):
    """Turn one raw ``ingredients`` cell into a list of ingredient strings.

    The cell text is parsed as a Python list literal first. This handles items
    that contain an apostrophe and are therefore wrapped in double quotes
    (e.g. ``"hershey's kisses"``); a plain ``'(.*?)'`` regex pairs the
    apostrophe with the next item's opening quote, which yields a garbage
    fragment and silently loses the following ingredient.

    If the text is not a valid list literal, the original quote-based regex is
    used as a best-effort fallback, so this never raises on a string input.
    """
    try:
        parsed = ast.literal_eval(line)
    except (ValueError, SyntaxError, TypeError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]

    return re.findall(r"'(.*?)'", line)


# Lowercase word tokens of each recipe (only referenced by the commented-out
# TF-IDF / KMeans experiment further down).
ingredients_array_values = df["ingredients"].apply(lambda line : re.findall(r"\b[a-z]\w+\b", line)).values

# One list of raw ingredient strings per recipe.
ingredients_array = df["ingredients"].apply(_parse_ingredient_list)

### Check word frequency

In [79]:
# vocabulary = nltk.FreqDist()

# for ingredients in ingredients_array.values:
#     # ingredients = ingredients.split()
#     ingredients = ingredient_parse(ingredients)
#     vocabulary.update(ingredients)

# for word, frequency in vocabulary.most_common():
#     print(f'{word} : {frequency}')

In [80]:
# Descriptor words (colour, state, preparation, filler) removed from every
# ingredient after lemmatisation.
_WORDS_TO_REMOVE = frozenset({
    "&", "fresh", "ground", "red", "black", "green", "white", "and", "all", "purpose",
    "of", "dry", "frozen", "light", "skinless", "yellow", "half", "new", "whole", "clove",
    "prepared", "hard-cooked", "extract", "semi-sweet", "in", "fillet", "purple", "s", "hot", "yolk", "freshly",
    "table", "boiling", "warm", "cold",
})

# Ordinary ingredients: an ingredient that reduces to one of these names is
# left out of the cleaned list.
_ORDINARY_INGREDIENTS = frozenset({
    "salt", "pepper", "water", "sugar", "salt pepper", "seasoning salt", "ice water", "tap water",
})


def ingredient_parse(ingredients_array):
    """Normalise the raw ingredient strings of one recipe.

    Each ingredient is split on spaces, lowercased, stripped of surrounding
    quotes / commas / spaces, lemmatised word by word with NLTK's WordNet
    lemmatizer, and cleared of descriptor words. The remaining words are joined
    with single spaces.

    Empty tokens (from repeated spaces or from tokens that consist only of
    stripped punctuation) are discarded, so the result never contains an empty
    ingredient or a name with doubled spaces.

    Parameters
    ----------
    ingredients_array : iterable of str
        Raw ingredient names of a single recipe.

    Returns
    -------
    list of str
        Cleaned ingredient names in their original order. Ingredients that
        reduce to nothing, or to one of the ordinary ingredients, are omitted.
    """
    lemmatizer = nltk.WordNetLemmatizer()

    ingredient_list = []
    for item in ingredients_array:
        # Lowercase and strip symbols; skip tokens that end up empty.
        tokens = (word.lower().strip('", ') for word in item.split(" "))
        # Lemmatise first so that plural forms match the singular stop words.
        lemmas = (lemmatizer.lemmatize(token) for token in tokens if token)
        kept = [lemma for lemma in lemmas if lemma and lemma not in _WORDS_TO_REMOVE]

        if not kept:
            continue

        ingredient = " ".join(kept)
        if ingredient not in _ORDINARY_INGREDIENTS:
            ingredient_list.append(ingredient)

    return ingredient_list


In [81]:
# Recipe ids and cleaned ingredient lists, kept as two arrays in the same row order
labels = df["id"].to_numpy()
ingredients_clean = ingredients_array.apply(ingredient_parse).to_numpy()

In [82]:
# Peek at the cleaned data: one list of normalised ingredient names per recipe
ingredients_clean

array([list(['winter squash', 'mexican seasoning', 'mixed spice', 'honey', 'butter', 'olive oil']),
       list(['pizza crust', 'sausage patty', 'egg', 'milk', 'cheese']),
       list(['beef', 'onion', 'diced tomato', 'tomato paste', 'tomato soup', 'rotel tomato', 'kidney bean', 'chili powder', 'cumin', 'lettuce', 'cheddar cheese']),
       ...,
       list(['egg', 'mayonnaise', 'dijon mustard', 'salt-free cajun seasoning', 'tabasco sauce', 'italian parsley']),
       list(['butter', 'eagle brand condensed milk', 'brown sugar', 'sour cream', 'egg', 'nutmeg', 'self-rising flour', 'bisquick', 'wooden popsicle stick']),
       list(['granulated sugar', 'shortening', 'egg', 'flour', 'cream tartar', 'baking soda', 'vanilla'])],
      dtype=object)

### KMeans

In [83]:
# Convert ingredients to TF-IDF vectors
# combined = []
# for sublist in ingredients_array_values:
#     combined += sublist

# combined = ingredients_array_values

# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(ingredients_array_values)

# # Apply K-means clustering
# k = 10
# kmeans = KMeans(n_clusters=k)
# kmeans.fit(X)

# # Assign each ingredient to a cluster
# ingredient_clusters = kmeans.predict(X)

# # Print ingredient clusters
# for cluster_id in range(k):
#     cluster_ingredients = [ingredient for i, ingredient in enumerate(ingredients) if ingredient_clusters[i] == cluster_id]
#     print(f"Cluster {cluster_id + 1}: {cluster_ingredients}")

In [84]:
def get_input():
    """Prompt for the available ingredients and return them as a list.

    The answer is split on commas. Each entry is trimmed and lowercased, and
    empty entries are dropped, so ``"Egg,water , butter,"`` and
    ``"egg, water, butter"`` give the same result. An empty answer returns an
    empty list.

    Returns
    -------
    list of str
        Ingredient names in the order they were typed.
    """
    user_input = input("Masukkan bahan-bahan pisah dengan (, ) : ")
    entries = (entry.strip().lower() for entry in user_input.split(","))
    return [entry for entry in entries if entry]

In [85]:
def filter_recipe(user_input, recipes=None, recipe_ids=None):
    """Return the ids of all recipes that need nothing beyond ``user_input``.

    A recipe matches when every one of its cleaned ingredients is present in
    ``user_input``. A recipe whose cleaned ingredient list is empty therefore
    matches any input.

    Parameters
    ----------
    user_input : iterable of str
        Ingredient names the user has available (a collection of names, not a
        single string).
    recipes : iterable of list of str, optional
        Cleaned ingredient list per recipe. Defaults to the notebook-level
        ``ingredients_clean``.
    recipe_ids : iterable, optional
        Recipe id per entry of ``recipes``, in the same order. Defaults to the
        notebook-level ``labels``.

    Returns
    -------
    list
        Ids of the matching recipes, in dataset order.
    """
    if recipes is None:
        recipes = ingredients_clean
    if recipe_ids is None:
        recipe_ids = labels

    # Set lookup avoids rescanning the user's list for every single ingredient.
    available = set(user_input)

    return [
        recipe_id
        for ingredients, recipe_id in zip(recipes, recipe_ids)
        if available.issuperset(ingredients)
    ]

In [86]:
# print(list(i in ingredients_clean[] for i in user_input))
# print(user_input)
# print(ingredients_clean[107699])
# tuple(zip(ingredients_clean, labels))

In [106]:
# Ask for the available ingredients and collect every recipe they can cover
user_input = get_input()
reccomendation = filter_recipe(user_input)

# Random order, so each run shows a different selection
random.shuffle(reccomendation)

if not reccomendation:
    print("Resep tidak ditemukan")
else:
    print("Resep rekomendasi :")
    # Show at most ten recipes
    for recipe_id in reccomendation[:10]:
        # Look the recipe up once and read both fields from that row
        row = df.loc[df["id"] == int(recipe_id)]
        recipe_name = row["name"].item()
        ingre = row["ingredients"].item()
        print(f"{recipe_name}, {recipe_id}, {ingre}")

Resep rekomendasi :
easy peel boiled eggs, 531398, ['egg', 'water']
boiled eggs, 160167, ['eggs', 'water', 'salt']
souffle omelet  puffy omelet, 89596, ['egg whites', 'water', 'salt', 'egg yolks', 'butter']
calming cedar tea, 352667, ['greens', 'water', 'white sugar']
caramelized simple syrup, 179600, ['sugar', 'boiling water']
hard boiled eggs  easy to peel, 382101, ['eggs', 'water', 'salt']
oeufs au plat, 470634, ['eggs', 'butter', 'salt', 'pepper']
easter hard boiled eggs, 354371, ['egg']
please ignore, 409347, ['egg', 'water']
high altitude hard boiled eggs, 408065, ['eggs', 'water']


### Save recipes as JSON

In [107]:
# res = []

# for ingre, label in zip(ingredients_clean, labels):
#     temp = {
#         "recipes_id" : str(label),
#         "ingredients" : ingre
#     }
#     res.append(temp)

# with open("clean_recipes.json", "w") as outfile:
#     json.dump(res, outfile)