In [1]:
import json

import numpy as np
import pandas as pd
import pymongo
import scipy
from bson import ObjectId, json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient as Connection
from scipy import sparse, spatial
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# scientific notation disabled form smaller numbers
pd.options.display.float_format = '{:.5f}'.format

# alles resultate anzeigen und nicht nur das letzte
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [2]:
# display multiple outputs in one row
import pandas as pd
import numpy as np
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [3]:
connection = Connection()
db = connection.recipe_db
input_data = db.recipes_test_copy

data = json.loads(json_util.dumps(input_data.find()))

In [4]:
ing = pd.DataFrame(json_normalize(data, record_path='ingredients',
                             meta='id', record_prefix='ingredients_', errors='ignore'))


nutritions = pd.DataFrame(json_normalize(data, record_path='nutritions',
                            meta=['id', 'prep_time', 'rating', 'rating_count', 'ready_in_time', 'review_count']))


In [5]:
#ingredients
#ingredients.loc[ingredients['id'] == 9380]

In [6]:
#ingredients.loc[ingredients['ingredients_id'] == 2972]

In [7]:
#  ------  erstellung und data cleansing - Ingredients

# schmeiss alle zutaten raus, die weniger als 5 mal verwendet werden
# setze multiinde auf 'id' und 'ingredients_id'
ingredients = ing.set_index(['id', 'ingredients_id'])

# filtere alle Zutaten samt ihrer rezepte id, die weniger gleich 5 mal vorkommen
ingredients_eqles_5_ing = ingredients.groupby(
    'ingredients_id').filter(lambda x: len(x) <= 5)

# droppe alle rezepte, die eine Zutate besitzen, die weniger gleich 5 Mal vorkommen
ingredients_filt = ingredients.drop(ingredients_eqles_5_ing.index.get_level_values('id').values, level=0)

# drop alls rows with ingredients_id == 0
ingredients_eqal_zero = ingredients_filt[ingredients_filt.index.get_level_values('ingredients_id') == 0]
ingredients_filt = ingredients_filt.drop(ingredients_eqal_zero.index.get_level_values('id').values, level=0)


In [8]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(ingredients_filt.index.get_level_values('id').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_db = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_db.columns = nutrition_db.columns.droplevel(0)

# entferne alle NA
nutrition_db = nutrition_db.dropna()

# gleiche nochmals die ids der beiden dataframe nutrition und ingredients ab, da der nutrition dataframe noch NA Werte hatt
id_overlap_mask = ingredients_filt.index.get_level_values('id').isin(nutrition_db.index)
ingredients_db = ingredients_filt[id_overlap_mask]

# abgleich ob anzahl der indizes von nutrition und zutaten dataframe gleich sind


In [9]:
nutrition_db.index.nunique()
ingredients_db.index.get_level_values('id').nunique()


4061

4061

In [10]:


ingredients_db.reset_index(inplace=True)

recipe_db = pd.get_dummies(ingredients_db['ingredients_id']).groupby(
    ingredients_db['id']).apply(max)



In [11]:
new_ingredients = ingredients_db.copy()
#new_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index()
gramms_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index().copy()

Q1 = gramms_ingredients.quantile(0.25)
Q3 = gramms_ingredients.quantile(0.75)
IQR = Q3 - Q1

#Filter out all recipes which are outlier by their weight (gramms)
df = gramms_ingredients[~((gramms_ingredients < (Q1 - 1.5 * IQR)) |(gramms_ingredients > (Q3 + 1.5 * IQR))).any(axis=1)].copy()

#filter out recipes by weight which are not in the range 500 - 2373.59 gramms
df_start_at_fivehundret = df[df['ingredients_grams'].between(500, 2373.58225, inclusive=False)].copy()

df_start_at_fivehundret.set_index('id', inplace=True)
id_overlap_mask = nutritions['id'].isin(df_start_at_fivehundret.index.get_level_values('id').values)


# erstelle datenframe auf basis der overlapliste
nutritions_filt_gramm = nutritions.loc[id_overlap_mask]

nutrition_db2 = nutritions_filt_gramm.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

#create new nutrition db based on the above filtering
nutrition_db2.set_index('id', inplace=True)
nutrition_db2.columns = nutrition_db2.columns.droplevel(0)



In [12]:
#select 20 random recipes which are between the Fat-range of 35-40 gramms
subset_fat = nutrition_db2[(nutrition_db2['Fat'] >= 35) & (nutrition_db2['Fat'] <= 40)].sample(n=20).copy()

subset_fat_20 = subset_fat.index.to_numpy()
id_list = subset_fat_20.copy()

In [13]:
subset_random = nutrition_db2.sample(n=10).copy()
subset_random = subset_random.index.to_numpy()

In [14]:
# remove recipes which are missing in nutrition df from the ingredients tecipes dataframe
mask = recipe_db.index.isin(nutrition_db2.index.values)
recipe_db = recipe_db.loc[mask]


In [15]:
user_recipes = recipe_db[recipe_db.index.isin(id_list)]

In [16]:
drop_id_list = [4342, 6307, 6494, 16157, 16278, 16421, 4397]

#4342, 6307, 6494, 16157, 16278, 16421, 4397
#garlic, olive oil, skinless bone less chicken breast, butter, milk, salt and pepper, onion
#>= 5 mal vorkommen in der top 10 rezept liste

In [17]:
#filter out ingredients which are too common and the recipes in the user profile from the recipes database
new_recipe_db = recipe_db.drop(axis=1, labels=drop_id_list)
new_recipe_db_wo_userrecipes = new_recipe_db.drop(axis=0, labels=subset_fat_20)


In [18]:
#remove basic ingredients like garlic, olive oil, skinless bone less chicken breast, butter, milk, salt and pepper, onion
new_user_recipes = user_recipes.drop(axis=1, labels=drop_id_list)

In [19]:
## Jaccard - recipe to recipe similarity without any removal of basis ingredients and duplicate userrecipes

result_array = cdist(user_recipes, recipe_db,'jaccard')
result_wo_filter = pd.DataFrame(result_array, columns=recipe_db.index.values, index=user_recipes.index.values)
# result_2

result_wo_filter_10 = pd.DataFrame(result_wo_filter.sum(), columns=['jaccard_distance_sum'])
result_wo_filter_10 = result_wo_filter_10.sort_values(by='jaccard_distance_sum')
result_wo_filter_10['jaccard_distance_sum'] = result_wo_filter_10['jaccard_distance_sum'].div(20)
result_wo_filter_10[0:10]


Unnamed: 0,jaccard_distance_sum
8757,0.88032
8547,0.88032
8611,0.88048
132511,0.88148
102235,0.88192
70522,0.88408
16563,0.88486
64893,0.88709
35469,0.88801
216080,0.88881


In [20]:
pd.Series(np.intersect1d(id_list,result_wo_filter_10[0:10].index.to_numpy()))


0     8547
1    16563
2    35469
dtype: int64

In [21]:
## Jaccard - recipe to recipe similarity without any removal of basis ingredients 

result_array = cdist(new_user_recipes, new_recipe_db,'jaccard')
result_wo_basis_filter = pd.DataFrame(result_array, columns=new_recipe_db.index.values, index=new_user_recipes.index.values)
# result_2

result_wo_basis_filter_10 = pd.DataFrame(result_wo_basis_filter.sum(), columns=['jaccard_distance_sum'])
result_wo_basis_filter_10 = result_wo_basis_filter_10.sort_values(by='jaccard_distance_sum')
result_wo_basis_filter_10['jaccard_distance_sum'] = result_wo_basis_filter_10['jaccard_distance_sum'].div(20)
result_wo_basis_filter_10[0:10]


Unnamed: 0,jaccard_distance_sum
103489,0.89679
64893,0.90098
70051,0.90516
16563,0.90734
141125,0.90738
18798,0.90919
8547,0.91075
8757,0.91075
22158,0.91148
70680,0.91228


In [22]:
result_wo_basis_filter_10[0:10].index.to_numpy()

array([103489,  64893,  70051,  16563, 141125,  18798,   8547,   8757,
        22158,  70680])

In [23]:
id_list

array([ 69538, 103489,  59661, 235171,  16563,  31072,  84774, 141125,
        18397, 239896,   8547,  15127,  35469,  18093,  70051,  68484,
        18798,  14592, 221294, 212734])

In [24]:
pd.Series(np.intersect1d(id_list,result_wo_basis_filter_10[0:10].index.to_numpy()))


0      8547
1     16563
2     18798
3     70051
4    103489
5    141125
dtype: int64

In [25]:
## Jaccard - recipe to recipe similarity with removal of basis ingredients and duplicate userrecipes

result_array = cdist(new_user_recipes, new_recipe_db_wo_userrecipes,'jaccard')
result_w_filter = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes.index.values, index=new_user_recipes.index.values)
# result_2

result_w_filter_10 = pd.DataFrame(result_w_filter.sum(), columns=['jaccard_distance_sum'])
result_w_filter_10 = result_w_filter_10.sort_values(by='jaccard_distance_sum')
result_w_filter_10['jaccard_distance_sum'] = result_w_filter_10['jaccard_distance_sum'].div(20)
result_w_filter_10[0:10]


Unnamed: 0,jaccard_distance_sum
64893,0.90098
8757,0.91075
22158,0.91148
70680,0.91228
216080,0.91316
13883,0.91469
8975,0.91602
13981,0.91691
23611,0.91799
199688,0.91801


In [26]:
result_w_filter_10[0:10].index.to_numpy()

array([ 64893,   8757,  22158,  70680, 216080,  13883,   8975,  13981,
        23611, 199688])

In [34]:
id_list

array([ 69538, 103489,  59661, 235171,  16563,  31072,  84774, 141125,
        18397, 239896,   8547,  15127,  35469,  18093,  70051,  68484,
        18798,  14592, 221294, 212734])

In [27]:
pd.Series(np.intersect1d(id_list,result_w_filter_10[0:10].index.to_numpy()))


Series([], dtype: int64)

In [28]:
#create one dimensional ingredients vector from user recipes profile
user_one_vector = new_user_recipes.copy()
user_one_vector.loc['sum',:]= user_one_vector.sum(axis=0)
user_one_vector = user_one_vector.drop(axis=0, labels=subset_fat_20)
user_one_vector[user_one_vector > 0] = 1

In [29]:
#show number of ingredients
user_one_vector.apply(pd.value_counts).count(axis=1)

0.00000    626
1.00000     97
dtype: int64

In [30]:
## Jaccard - user vector to recipe similarity with removal of basis ingredients and duplicate userrecipes

result_array = cdist(user_one_vector, new_recipe_db_wo_userrecipes,'jaccard')
result_w_filter_vector = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes.index.values, index=user_one_vector.index.values)


result_w_filter_vector_10 = pd.DataFrame(result_w_filter_vector.sum(), columns=['jaccard_distance_sum'])
result_w_filter_vector_10 = result_w_filter_vector_10.sort_values(by='jaccard_distance_sum')

result_w_filter_vector_10[0:10]


Unnamed: 0,jaccard_distance_sum
231396,0.86869
14601,0.88776
199688,0.88776
236805,0.89216
223498,0.89796
87627,0.89899
11729,0.89899
31988,0.90099
229289,0.90196
24085,0.90196


In [31]:
pd.Series(np.intersect1d(id_list, result_w_filter_vector_10[0:10].index.to_numpy()))


Series([], dtype: int64)

In [32]:
result_w_filter_vector_10[0:10].index.to_numpy()

array([231396,  14601, 199688, 236805, 223498,  87627,  11729,  31988,
       229289,  24085])

In [33]:
id_list

array([ 69538, 103489,  59661, 235171,  16563,  31072,  84774, 141125,
        18397, 239896,   8547,  15127,  35469,  18093,  70051,  68484,
        18798,  14592, 221294, 212734])