In [None]:
import pandas as pd 
from itertools import combinations
from collections import Counter
import numpy as np

In [None]:
df = pd.read_pickle("Data/food.pkl")
df.head()



In [None]:
testing_df = df
testing_df = testing_df[['id', 'ingredients']]
testing_df

Unnamed: 0,id,ingredients
0,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,"[water, vegetable oil, wheat, salt]"
4,13162,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...
2231137,2231137,"[chocolate hazelnut spread Nutella, round whol..."
2231138,2231138,"[eggs, paprika, salt pepper liking, mustard, m..."
2231139,2231139,"[Daikon radish, Sesame oil, White sesame seeds..."
2231140,2231140,"[apple cider, sugar, kosher salt plus seasonin..."


In [43]:
def create_pairs(items):
    return list(combinations(items, 2))

In [44]:
df['ingredients'].dtype

dtype('O')

In [45]:
testing_df['pairs'] = testing_df['ingredients'].apply(create_pairs)
testing_df

Unnamed: 0,id,ingredients,pairs
0,10259,"[romaine lettuce, black olives, grape tomatoes...","[(romaine lettuce, black olives), (romaine let..."
1,25693,"[plain flour, ground pepper, salt, tomatoes, g...","[(plain flour, ground pepper), (plain flour, s..."
2,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[(eggs, pepper), (eggs, salt), (eggs, mayonais..."
3,22213,"[water, vegetable oil, wheat, salt]","[(water, vegetable oil), (water, wheat), (wate..."
4,13162,"[black pepper, shallots, cornflour, cayenne pe...","[(black pepper, shallots), (black pepper, corn..."
...,...,...,...
2231137,2231137,"[chocolate hazelnut spread Nutella, round whol...","[(chocolate hazelnut spread Nutella, round who..."
2231138,2231138,"[eggs, paprika, salt pepper liking, mustard, m...","[(eggs, paprika), (eggs, salt pepper liking), ..."
2231139,2231139,"[Daikon radish, Sesame oil, White sesame seeds...","[(Daikon radish, Sesame oil), (Daikon radish, ..."
2231140,2231140,"[apple cider, sugar, kosher salt plus seasonin...","[(apple cider, sugar), (apple cider, kosher sa..."


In [46]:
pair_counts = Counter()
for pairs_list in testing_df['pairs']:
    pair_counts.update(pairs_list)
pair_counts

Counter({('sugar', 'salt'): 246718,
         ('salt', 'pepper'): 171238,
         ('flour', 'salt'): 168107,
         ('eggs', 'salt'): 144116,
         ('sugar', 'vanilla'): 143907,
         ('sugar', 'eggs'): 143819,
         ('onion', 'salt'): 140498,
         ('butter', 'salt'): 128095,
         ('sugar', 'flour'): 109925,
         ('salt', 'sugar'): 105723,
         ('water', 'salt'): 104639,
         ('salt', 'butter'): 97115,
         ('eggs', 'vanilla'): 95328,
         ('sugar', 'milk'): 94064,
         ('salt', 'eggs'): 90703,
         ('salt', 'milk'): 90367,
         ('baking soda', 'salt'): 88217,
         ('sugar', 'butter'): 87845,
         ('baking', 'salt'): 86815,
         ('olive oil', 'salt'): 82661,
         ('flour', 'sugar'): 82242,
         ('salt', 'water'): 79608,
         ('butter', 'sugar'): 79241,
         ('milk', 'salt'): 78539,
         ('eggs', 'milk'): 77996,
         ('eggs', 'flour'): 77462,
         ('eggs', 'sugar'): 76521,
         ('sugar', 'wate

In [47]:
unique_ingredients = set()
for ingredient in testing_df['ingredients']:
    unique_ingredients.update(ingredient)
unique_ingredients = sorted(list(unique_ingredients))

cooccurrence_matrix = np.zeros((len(unique_ingredients), len(unique_ingredients)))
ingredient_to_index = {ingredient: i for i, ingredient in enumerate(unique_ingredients)}
for ingredients in testing_df['ingredients']:
    for i, ingredient1 in enumerate(ingredients):
        for j, ingredient2 in enumerate(ingredients):
            if i != j:
                index1 = ingredient_to_index[ingredient1]
                index2 = ingredient_to_index[ingredient2]
                cooccurrence_matrix[index1, index2] += 1

cooccurrence_df = pd.DataFrame(cooccurrence_matrix, index=unique_ingredients, columns=unique_ingredients)
cooccurrence_df


MemoryError: Unable to allocate 16.4 TiB for an array with shape (1501671, 1501671) and data type float64

In [48]:
def recommend_ingredients(ingredient, pair_counts, top_n=5):
    recommendations = []
    for pair, count in pair_counts.items():
        if ingredient in pair:
            other_ingredient = pair[0] if pair[1] == ingredient else pair[1]
            recommendations.append((other_ingredient, count))
    recommendations.sort(key = lambda x: x[1], reverse = True)
    return [item[0] for item in recommendations[:top_n]]


In [52]:
recommend_ingredients("sugar", pair_counts)

['salt', 'vanilla', 'eggs', 'flour', 'salt']

In [None]:
words_to_drop = ["sugar", "salt", "flour", "water",]

In [53]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [55]:
te = TransactionEncoder()
te_ary = te.fit(df['ingredients']).transform(df['ingredients'], sparse = True)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)
frequent_itemsets = apriori(df_encoded, min_support=0.5, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)

print(f"Frequent Itemsets:\n{frequent_itemsets}")
print(f"Association Rules:\n{rules}")


ValueError: Shape of passed values is (3064844, 1), indices imply (3064844, 1501671)

Counter({'salt': 997407,
         'sugar': 703192,
         'butter': 486953,
         'onion': 463288,
         'eggs': 452623,
         'water': 420109,
         'milk': 352706,
         'flour': 346610,
         'olive oil': 302906,
         'pepper': 232612,
         'vanilla': 228524,
         'brown sugar': 213072,
         'lemon juice': 196126,
         'garlic cloves': 194618,
         'egg': 189083,
         'garlic': 182238,
         'baking soda': 158679,
         'baking': 149975,
         'cinnamon': 145161,
         'sour cream': 135869,
         'tomatoes': 131511,
         'allpurpose flour': 127142,
         'vanilla extract': 124680,
         'onions': 123994,
         'parsley': 112052,
         'vegetable oil': 106161,
         'black pepper': 104262,
         'mayonnaise': 98573,
         'celery': 97849,
         'honey': 90633,
         'cream cheese': 89218,
         'ground black pepper': 89059,
         'oil': 88121,
         'pecans': 86547,
         'margar