In [13]:
import pandas as pd
import nltk
import json
import re

## Pre-process the data

In [2]:
# Load the dataset
raw_recipe_path = "../../Dataset/RAW_recipes.csv"
df_raw_recipe = pd.read_csv(raw_recipe_path)

# Remove Na datas
def preprocess_data(df):
    df.dropna(inplace=True)

    # Filtering attribute
    df = df[["name", "id", "ingredients"]]
    
    # Remove recipes duplicate
    df = df.drop_duplicates(subset=["name", "ingredients"])

    return df

df = preprocess_data(df_raw_recipe)

In [3]:
ingredients_array_values = df["ingredients"].apply(lambda line : re.findall(r"\b[a-z]\w+\b", line)).values
ingredients_array    = df["ingredients"].apply(lambda line : re.findall(r"'(.*?)'", line))

### Check word frequency

In [4]:
vocabulary = nltk.FreqDist()

for ingredients in ingredients_array.values:
    # ingredients = ingredients.split()
    # ingredients = ingredient_parse(ingredients)
    vocabulary.update(ingredients)

for word, frequency in vocabulary.most_common():
    print(f'{word} : {frequency}')

salt : 83311
butter : 53387
sugar : 43211
onion : 38062
water : 33690
eggs : 32689
olive oil : 32184
garlic cloves : 25587
flour : 25509
milk : 24771
pepper : 21616
brown sugar : 18185
garlic : 17143
all-purpose flour : 17089
baking powder : 17020
egg : 16781
salt and pepper : 15063
parmesan cheese : 14469
baking soda : 13705
lemon juice : 13647
vegetable oil : 13458
black pepper : 12697
vanilla : 12650
cinnamon : 12184
tomatoes : 11577
sour cream : 11381
,  : 10964
garlic powder : 10549
vanilla extract : 9703
garlic clove : 9682
honey : 9655
oil : 9640
onions : 9583
cream cheese : 9487
celery : 9219
cheddar cheese : 8735
unsalted butter : 8674
soy sauce : 8643
mayonnaise : 8561
chicken broth : 7797
paprika : 7710
worcestershire sauce : 7605
extra virgin olive oil : 7563
fresh parsley : 7456
cornstarch : 7209
fresh ground black pepper : 7054
carrots : 6846
bacon : 6802
chili powder : 6798
parsley : 6733
ground cinnamon : 6691
carrot : 6540
potatoes : 6318
nutmeg : 6098
cayenne pepper :

In [5]:
def ingredient_parse(ingredients_array):

    lemmatizer = nltk.WordNetLemmatizer()

    words_to_remove = [
        ',', ' ', "&", 'fresh', 'ground', 'red', 'black', 'green', 'brown', 'and', 'all', 'purpose',
        'of', 'dry', 'frozen', 'light', 'skinless', 'yellow', 'half', 'new', 'whole', 'and', 'of', 'clove',
        'prepared', "hard-cooked", "extract", "semi-sweet", "in", "water", "fillet", "fillets", "purple", "s"
    ]

    ordinary = [
        "salt", "pepper", "ice", "cube", "water"
    ]
    

    ingredient_list = []
    for items in ingredients_array:
        # Split by word
        items = items.split(" ")
        # Lowercase
        items = [word.lower() for word in items]
        # Clean symbols
        items = [word.strip('",') for word in items]
        # Lemmatize words so we can compare words to measuring words
        items = [lemmatizer.lemmatize(word) for word in items]
        # Get rid of common easy words
        items = [word for word in items if word not in words_to_remove]

        items = [word for word in items if word not in ordinary]

        if items:
            # print(items)
            ingredient_list.append(' '.join(items))
            

    return ingredient_list


In [6]:
ingredients_clean = ingredients_array.apply(ingredient_parse).values
labels = df["id"].values

In [7]:
ingredients_clean

array([list(['winter squash', 'mexican seasoning', 'mixed spice', 'honey', 'butter', 'olive oil']),
       list(['pizza crust', 'sausage patty', 'egg', 'milk', 'cheese']),
       list(['beef', 'onion', 'diced tomato', 'tomato paste', 'tomato soup', 'rotel tomato', 'kidney bean', 'chili powder', 'cumin', 'lettuce', 'cheddar cheese']),
       ...,
       list(['egg', 'mayonnaise', 'dijon mustard', 'salt-free cajun seasoning', 'tabasco sauce', 'italian parsley']),
       list(['butter', 'eagle brand condensed milk', 'sugar', 'sour cream', 'egg', 'nutmeg', 'self-rising flour', 'bisquick', 'wooden popsicle stick']),
       list(['granulated sugar', 'shortening', 'egg', 'flour', 'cream tartar', 'baking soda', 'vanilla'])],
      dtype=object)

In [8]:
def get_input():
    user_input = input("Masukkan bahan-bahan pisah dengan (, ) : ")
    return user_input.split(", ")

In [9]:
def filter_recipe(user_input):
    matched_recipes = []

    for ingre, label in zip(ingredients_clean, labels):
        if all(i in user_input for i in ingre):
            matched_recipes.append(label)

    return matched_recipes

In [10]:
# print(list(i in ingredients_clean[] for i in user_input))
# print(user_input)
# print(ingredients_clean[107699])
# tuple(zip(ingredients_clean, labels))

In [11]:
# user_input = get_input()
# reccomendation = filter_recipe(user_input)

# if reccomendation:
#     print("Resep rekomendasi :")
#     for i in reccomendation[0:10]:
#         recipe_name = df.loc[df["id"] == int(i)]["name"].item()
#         print(f"{recipe_name}, {i}")

# else:
#     print("Resep tidak ditemukan")
#     print(reccomendation)

In [15]:
json_dict = {}

for ingre, label in zip(ingredients_clean, labels):
    json_dict[str(label)] = ingre

with open("clean_recipes.json", "w") as outfile:
    json.dump(json_dict, outfile)