In [32]:
import json

import numpy as np
import pandas as pd
import pymongo
import scipy
from bson import ObjectId, json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient as Connection
from scipy import sparse, spatial
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# scientific notation disabled form smaller numbers
pd.options.display.float_format = '{:.5f}'.format

# alles resultate anzeigen und nicht nur das letzte
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [33]:
# display multiple outputs in one row
import pandas as pd
import numpy as np
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [34]:
connection = Connection()
db = connection.recipe_db
input_data = db.recipes_test_copy

data = json.loads(json_util.dumps(input_data.find()))

In [35]:
ing = pd.DataFrame(json_normalize(data, record_path='ingredients',
                             meta='id', record_prefix='ingredients_', errors='ignore'))


nutritions = pd.DataFrame(json_normalize(data, record_path='nutritions',
                            meta=['id', 'prep_time', 'rating', 'rating_count', 'ready_in_time', 'review_count']))


In [36]:
#ingredients
#ingredients.loc[ingredients['id'] == 9380]

In [37]:
#ingredients.loc[ingredients['ingredients_id'] == 2972]

In [38]:
#  ------  erstellung und data cleansing - Ingredients

# schmeiss alle zutaten raus, die weniger als 5 mal verwendet werden
# setze multiinde auf 'id' und 'ingredients_id'
ingredients = ing.set_index(['id', 'ingredients_id'])

# filtere alle Zutaten samt ihrer rezepte id, die weniger gleich 5 mal vorkommen
ingredients_eqles_5_ing = ingredients.groupby(
    'ingredients_id').filter(lambda x: len(x) <= 5)

# droppe alle rezepte, die eine Zutate besitzen, die weniger gleich 5 Mal vorkommen
ingredients_filt = ingredients.drop(ingredients_eqles_5_ing.index.get_level_values('id').values, level=0)

# drop alls rows with ingredients_id == 0
ingredients_eqal_zero = ingredients_filt[ingredients_filt.index.get_level_values('ingredients_id') == 0]
ingredients_filt = ingredients_filt.drop(ingredients_eqal_zero.index.get_level_values('id').values, level=0)


In [39]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(ingredients_filt.index.get_level_values('id').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_db = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_db.columns = nutrition_db.columns.droplevel(0)

# entferne alle NA
nutrition_db = nutrition_db.dropna()

# gleiche nochmals die ids der beiden dataframe nutrition und ingredients ab, da der nutrition dataframe noch NA Werte hatt
id_overlap_mask = ingredients_filt.index.get_level_values('id').isin(nutrition_db.index)
ingredients_db = ingredients_filt[id_overlap_mask]

# abgleich ob anzahl der indizes von nutrition und zutaten dataframe gleich sind


In [40]:
nutrition_db.index.nunique()
ingredients_db.index.get_level_values('id').nunique()


4061

4061

In [41]:


ingredients_db.reset_index(inplace=True)

recipe_db = pd.get_dummies(ingredients_db['ingredients_id']).groupby(
    ingredients_db['id']).apply(max)



In [42]:
new_ingredients = ingredients_db.copy()
#new_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index()

In [43]:
gramms_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index().copy()

In [44]:
Q1 = gramms_ingredients.quantile(0.25)
Q3 = gramms_ingredients.quantile(0.75)
IQR = Q3 - Q1

df = gramms_ingredients[~((gramms_ingredients < (Q1 - 1.5 * IQR)) |(gramms_ingredients > (Q3 + 1.5 * IQR))).any(axis=1)].copy()

In [45]:
df_start_at_fivehundret = df[df['ingredients_grams'].between(500, 2373.58225, inclusive=False)].copy()

In [46]:
df_start_at_fivehundret.set_index('id', inplace=True)

In [47]:
id_overlap_mask = nutritions['id'].isin(df_start_at_fivehundret.index.get_level_values('id').values)


In [48]:

# erstelle datenframe auf basis der overlapliste
nutritions_filt_gramm = nutritions.loc[id_overlap_mask]

nutrition_db2 = nutritions_filt_gramm.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db2.set_index('id', inplace=True)
nutrition_db2.columns = nutrition_db2.columns.droplevel(0)

In [136]:
subset_fat = nutrition_db2[(nutrition_db2['Fat'] >= 35) & (nutrition_db2['Fat'] <= 40)].sample(n=20).copy()

In [50]:
subset_random = nutrition_db2.sample(n=10).copy()
subset_random = subset_random.index.to_numpy()

In [62]:
subset_random

array([161869,  16474, 111905,  69660, 132351,  61341,  47519,  14497,
       128968,  16581])

In [137]:
subset_fat_20 = subset_fat.index.to_numpy()

In [52]:
# remove recipes which are missing in nutrition df from the ingredients tecipes dataframe
mask = recipe_db.index.isin(nutrition_db2.index.values)
recipe_db = recipe_db.loc[mask]


In [163]:
id_list = subset_fat_20.copy()

user_recipes = recipe_db[recipe_db.index.isin(id_list)]

In [54]:
drop_id_list = [4342, 6307, 6494, 16157, 16278, 16421, 4397]

#4342, 6307, 6494, 16157, 16278, 16421, 4397
#garlic, olive oil, skinless bone less chicken breast, butter, milk, salt and pepper, onion
#>= 5 mal vorkommen in der top 10 rezept liste

In [139]:
#remove basic ingredients like garlic, olive oil, skinless bone less chicken breast, butter, milk, salt and pepper, onion
new_user_recipes = user_recipes.drop(axis=1, labels=drop_id_list)

In [140]:
new_recipe_db = recipe_db.drop(axis=1, labels=drop_id_list)
new_recipe_db_wo_userrecipes = new_recipe_db.drop(axis=0, labels=subset_fat_20)


In [109]:
## Jaccard - rezept zu rezept vergelich ohne basiszutaten und ohne userzutaten in der rezept datenbank

result_array = cdist(new_user_recipes, new_recipe_db,'jaccard')
result_2 = pd.DataFrame(result_array, columns=new_recipe_db.index.values, index=new_user_recipes.index.values)
# result_2

new_result_10 = pd.DataFrame(result_2.sum(), columns=['jaccard_distance_sum'])
new_result_10 = new_result_10.sort_values(by='jaccard_distance_sum')

new_result_10[0:20]


Unnamed: 0,jaccard_distance_sum
19856,8.26955
167120,8.40714
213140,8.50021
16429,8.54398
87137,8.67106
17184,8.68784
180150,8.69188
70298,8.72045
216026,8.80833
42172,8.85151


In [110]:
new_result_10[0:10].index.to_numpy()

array([ 19856, 167120, 213140,  16429,  87137,  17184, 180150,  70298,
       216026,  42172])

In [130]:
id_list

array([ 16756,  14610, 132511,  75543, 151997, 237320, 147305,   8761,
        48873, 221294,  17184, 221304,  64311,   8679,   8639,  19484,
        35469,  74698,  38028,  82487])

In [129]:
new_user_recipes.index.to_numpy()

array([  8639,   8679,   8761,  14610,  16756,  17184,  19484,  35469,
        38028,  48873,  64311,  74698,  75543,  82487, 132511, 147305,
       151997, 221294, 221304, 237320])

In [141]:
## Jaccard - rezept zu rezept vergelich ohne basiszutaten und ohne userzutaten in der rezept datenbank

result_array = cdist(new_user_recipes, new_recipe_db_wo_userrecipes,'jaccard')
result_2 = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes.index.values, index=new_user_recipes.index.values)
# result_2

new_result_10 = pd.DataFrame(result_2.sum(), columns=['jaccard_distance_sum'])
new_result_10 = new_result_10.sort_values(by='jaccard_distance_sum')

new_result_10[0:10]


Unnamed: 0,jaccard_distance_sum
102235,18.22305
16801,18.3994
64893,18.3996
11760,18.46004
77276,18.46188
86047,18.48608
16563,18.50056
231248,18.50099
12807,18.5122
223360,18.52259


In [142]:
new_result_10[0:10].index.to_numpy()

array([102235,  16801,  64893,  11760,  77276,  86047,  16563, 231248,
        12807, 223360])

In [143]:
id_list

array([ 86415, 229277,  30007,  17184,  15024,  62459,  11688,  20312,
       127500,   8693, 147305,  16429,  17253,  14668, 142220,  64539,
       221304, 219766,   7198,  17022])

In [147]:
new_user_recipes

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8693,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11688,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14668,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15024,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16429,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17022,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17184,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17253,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20312,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [164]:
user_recipes.loc['Total',:]= user_recipes.sum(axis=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [168]:
user_recipes = user_recipes.reset_index()

In [177]:
user_recipes = user_recipes.set_index('id')

KeyError: "None of ['id'] are in the columns"

In [170]:
user_recipes.drop(columns='id')

Unnamed: 0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [179]:
subset_fat_20

array([ 86415, 229277,  30007,  17184,  15024,  62459,  11688,  20312,
       127500,   8693, 147305,  16429,  17253,  14668, 142220,  64539,
       221304, 219766,   7198,  17022])

In [180]:
user_recipes = user_recipes.drop(axis=0, labels=subset_fat_20)

In [188]:
user_recipes[user_recipes > 0] = 1

In [201]:
user_recipes.apply(pd.value_counts).count(axis=1)

0.00000    614
1.00000    116
dtype: int64

In [189]:
## Jaccard - rezept zu rezept vergelich

result_array = cdist(user_recipes, recipe_db,'jaccard')
result_2 = pd.DataFrame(result_array, columns=recipe_db.index.values, index=user_recipes.index.values)
# result_2

result_10 = pd.DataFrame(result_2.sum(), columns=['jaccard_distance_sum'])
result_10 = result_10.sort_values(by='jaccard_distance_sum')

result_10[0:20]


Unnamed: 0,jaccard_distance_sum
180905,0.85714
9005,0.87395
229289,0.87395
231396,0.87395
11688,0.87931
7198,0.87931
20312,0.87931
132511,0.88034
245362,0.88136
19856,0.88136


In [202]:
result_2

Unnamed: 0,7198,8493,8494,8495,8496,8497,8498,8500,8503,8506,...,254874,254940,255038,255263,255545,255936,257312,260193,261124,263813
Total,0.87931,0.93966,0.95902,0.95726,1.0,0.94828,0.9569,0.93277,0.96667,0.95798,...,0.95798,0.97479,0.89831,0.95763,0.91525,0.95041,0.95798,0.9916,0.93277,0.94118


In [190]:
result_10[:10].index.to_numpy()

array([180905,   9005, 229289, 231396,  11688,   7198,  20312, 132511,
       245362,  19856])

In [191]:
subset_fat.index.to_numpy()

array([ 86415, 229277,  30007,  17184,  15024,  62459,  11688,  20312,
       127500,   8693, 147305,  16429,  17253,  14668, 142220,  64539,
       221304, 219766,   7198,  17022])