In [60]:
import pandas as pd
import nltk
import json
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

## Pre-process the data

In [61]:
# Load the dataset
raw_recipe_path = "../../Dataset/RAW_recipes.csv"
df_raw_recipe = pd.read_csv(raw_recipe_path)

# Remove Na datas
def preprocess_data(df):
    df.dropna(inplace=True)

    # Filtering attribute
    df = df[["name", "id", "ingredients"]]
    
    # Remove recipes duplicate
    df = df.drop_duplicates(subset=["name", "ingredients"])

    return df

df = preprocess_data(df_raw_recipe)

In [62]:
ingredients_array_values = df["ingredients"].apply(lambda line : re.findall(r"\b[a-z]\w+\b", line)).values
ingredients_array    = df["ingredients"].apply(lambda line : re.findall(r"'(.*?)'", line))

### Check word frequency

In [63]:
vocabulary = nltk.FreqDist()

for ingredients in ingredients_array.values:
    # ingredients = ingredients.split()
    ingredients = ingredient_parse(ingredients)
    vocabulary.update(ingredients)

for word, frequency in vocabulary.most_common():
    print(f'{word} : {frequency}')

onion : 53494
garlic : 39989
butter : 33465
egg : 26220
olive oil : 24024
flour : 14387
milk : 13980
lemon juice : 12274
all-purpose flour : 10371
carrot : 9374
bell pepper : 9207
  : 8761
tomato : 8690
parmesan cheese : 8377
potato : 7911
parsley : 7464
mayonnaise : 7271
vegetable oil : 7179
cream cheese : 7175
baking powder : 6941
celery : 6893
sour cream : 6710
boneless chicken breast : 6646
ginger : 6344
cumin : 6298
soy sauce : 6292
honey : 6161
cinnamon : 6070
beef : 5588
extra virgin olive oil : 5516
garlic powder : 5422
chicken broth : 5360
cheddar cheese : 5317
bacon : 5300
oil : 5218
wine : 5163
unsalted butter : 5001
vanilla : 4969
mushroom : 4915
lime juice : 4912
baking soda : 4558
cilantro : 4434
chili powder : 4221
worcestershire sauce : 4146
kosher salt : 4087
bean : 3995
dijon mustard : 3860
chicken breast : 3776
granulated sugar : 3769
basil : 3743
paprika : 3710
orange juice : 3450
zucchini : 3301
mustard : 3279
banana : 3139
wine vinegar : 3132
shallot : 3102
mozzar

In [64]:
def ingredient_parse(ingredients_array):

    lemmatizer = nltk.WordNetLemmatizer()

    words_to_remove = [
        ',', ' ', "&", 'fresh', 'ground', 'red', 'black', 'green', 'brown', 'white','and', 'all', 'purpose',
        'of', 'dry', 'frozen', 'light', 'skinless', 'yellow', 'half', 'new', 'whole', 'and', 'of', 'clove',
        'prepared', "hard-cooked", "extract", "semi-sweet", "in", "fillet", "purple", "s", "hot", "yolk", "freshly",
        "table", "boiling", "warm", "cold"
    ]

    ordinary = [
        "salt", "pepper", "water", "sugar", "salt pepper", "seasoning salt", "ice water", "tap water"
    ]
    

    ingredient_list = []
    for items in ingredients_array:
        # print(items)
        
        # Split by word
        items = items.split(" ")
        # Lowercase
        items = [word.lower() for word in items]
        # Clean symbols
        items = [word.strip('", ') for word in items]
        # Lemmatize words so we can compare words to measuring words
        items = [lemmatizer.lemmatize(word) for word in items]
        # Get rid of common easy words
        items = [word for word in items if word not in words_to_remove]

        # items = [word for word in items if word not in ordinary]

        if items:
            items = ' '.join(items)
            
            # Remove ordinary ingredients
            if items in ordinary:
                break

            ingredient_list.append(items)
            

    return ingredient_list


In [65]:
ingredients_clean = ingredients_array.apply(ingredient_parse).values
labels = df["id"].values

In [74]:
ingredients_clean

array([list(['winter squash', 'mexican seasoning', 'mixed spice', 'honey', 'butter', 'olive oil']),
       list(['pizza crust', 'sausage patty', 'egg', 'milk']),
       list(['beef', 'onion', 'diced tomato', 'tomato paste', 'tomato soup', 'rotel tomato', 'kidney bean']),
       ...,
       list(['egg', 'mayonnaise', 'dijon mustard', 'salt-free cajun seasoning', 'tabasco sauce']),
       list(['butter', 'eagle brand condensed milk']),
       list(['granulated sugar', 'shortening', 'egg', 'flour', 'cream tartar', 'baking soda', 'vanilla'])],
      dtype=object)

### KMeans

In [67]:
# Convert ingredients to TF-IDF vectors
# combined = []
# for sublist in ingredients_array_values:
#     combined += sublist

# combined = ingredients_array_values

# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(ingredients_array_values)

# # Apply K-means clustering
# k = 10
# kmeans = KMeans(n_clusters=k)
# kmeans.fit(X)

# # Assign each ingredient to a cluster
# ingredient_clusters = kmeans.predict(X)

# # Print ingredient clusters
# for cluster_id in range(k):
#     cluster_ingredients = [ingredient for i, ingredient in enumerate(ingredients) if ingredient_clusters[i] == cluster_id]
#     print(f"Cluster {cluster_id + 1}: {cluster_ingredients}")

In [68]:
def get_input():
    user_input = input("Masukkan bahan-bahan pisah dengan (, ) : ")
    return user_input.split(", ")

In [69]:
def filter_recipe(user_input):
    matched_recipes = []

    for ingre, label in zip(ingredients_clean, labels):
        if all(i in user_input for i in ingre):
            matched_recipes.append(label)

    return matched_recipes

In [70]:
# print(list(i in ingredients_clean[] for i in user_input))
# print(user_input)
# print(ingredients_clean[107699])
# tuple(zip(ingredients_clean, labels))

In [71]:
user_input = get_input()
reccomendation = filter_recipe(user_input)

if reccomendation:
    print("Resep rekomendasi :")
    for i in reccomendation[0:10]:
        recipe_name = df.loc[df["id"] == int(i)]["name"].item()
        print(f"{recipe_name}, {i}")

else:
    print("Resep tidak ditemukan")
    print(reccomendation)

Resep rekomendasi :
beat this  banana bread, 75452
cream  of spinach soup, 76808
emotional balance  spice mixture, 48156
get the sensation  brownies, 27087
the best  banana bread  or muffins, 39363
1 in canada chocolate chip cookies, 453467
millionaire pound cake, 286009
250 chocolate chip cookies recipe, 177187
250 00 chocolate chip cookies, 118843
denauseating  with ginger tea, 30473


### Save recipes as JSON

In [72]:
res = []

for ingre, label in zip(ingredients_clean, labels):
    temp = {
        "recipes_id" : str(label),
        "ingredients" : ingre
    }
    res.append(temp)

with open("clean_recipes.json", "w") as outfile:
    json.dump(res, outfile)