In [1]:
import json

import numpy as np
import pandas as pd
import pymongo
import scipy
from bson import ObjectId, json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient as Connection
from scipy import sparse, spatial
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# scientific notation disabled form smaller numbers
pd.options.display.float_format = '{:.5f}'.format

# alles resultate anzeigen und nicht nur das letzte
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [2]:
# display multiple outputs in one row
import pandas as pd
import numpy as np
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [3]:
connection = Connection()
db = connection.recipe_db
input_data = db.recipes_test_copy

data = json.loads(json_util.dumps(input_data.find()))

In [4]:
ing = pd.DataFrame(json_normalize(data, record_path='ingredients',
                             meta='id', record_prefix='ingredients_', errors='ignore'))


nutritions = pd.DataFrame(json_normalize(data, record_path='nutritions',
                            meta=['id', 'prep_time', 'rating', 'rating_count', 'ready_in_time', 'review_count']))


In [5]:
#ingredients
#ingredients.loc[ingredients['id'] == 9380]

In [6]:
#ingredients.loc[ingredients['ingredients_id'] == 2972]

In [7]:
#  ------  erstellung und data cleansing - Ingredients

# schmeiss alle zutaten raus, die weniger als 5 mal verwendet werden
# setze multiinde auf 'id' und 'ingredients_id'
ingredients = ing.set_index(['id', 'ingredients_id'])

# filtere alle Zutaten samt ihrer rezepte id, die weniger gleich 5 mal vorkommen
ingredients_eqles_5_ing = ingredients.groupby(
    'ingredients_id').filter(lambda x: len(x) <= 5)

# droppe alle rezepte, die eine Zutate besitzen, die weniger gleich 5 Mal vorkommen
ingredients_filt = ingredients.drop(ingredients_eqles_5_ing.index.get_level_values('id').values, level=0)

# drop alls rows with ingredients_id == 0
ingredients_eqal_zero = ingredients_filt[ingredients_filt.index.get_level_values('ingredients_id') == 0]
ingredients_filt = ingredients_filt.drop(ingredients_eqal_zero.index.get_level_values('id').values, level=0)


In [8]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(ingredients_filt.index.get_level_values('id').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_db = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_db.columns = nutrition_db.columns.droplevel(0)

# entferne alle NA
nutrition_db = nutrition_db.dropna()

# gleiche nochmals die ids der beiden dataframe nutrition und ingredients ab, da der nutrition dataframe noch NA Werte hatt
id_overlap_mask = ingredients_filt.index.get_level_values('id').isin(nutrition_db.index)
ingredients_db = ingredients_filt[id_overlap_mask]

# abgleich ob anzahl der indizes von nutrition und zutaten dataframe gleich sind


In [9]:
nutrition_db.index.nunique()
ingredients_db.index.get_level_values('id').nunique()


4061

4061

In [10]:
nutritions['id']

0          59661
1          59661
2          59661
3          59661
4          59661
           ...  
126075    244188
126076    244188
126077    244188
126078    244188
126079    244188
Name: id, Length: 126080, dtype: object

In [11]:


ingredients_db.reset_index(inplace=True)

recipe_db = pd.get_dummies(ingredients_db['ingredients_id']).groupby(
    ingredients_db['id']).apply(max)



In [12]:
new_ingredients = ingredients_db.copy()
#new_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index()
gramms_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index().copy()

Q1 = gramms_ingredients.quantile(0.25)
Q3 = gramms_ingredients.quantile(0.75)
IQR = Q3 - Q1

#Filter out all recipes which are outlier by their weight (gramms)
df = gramms_ingredients[~((gramms_ingredients < (Q1 - 1.5 * IQR)) |(gramms_ingredients > (Q3 + 1.5 * IQR))).any(axis=1)].copy()

#filter out recipes by weight which are not in the range 500 - 2373.59 gramms
df_start_at_fivehundret = df[df['ingredients_grams'].between(500, 2373.58225, inclusive=False)].copy()

df_start_at_fivehundret.set_index('id', inplace=True)
id_overlap_mask = nutritions['id'].isin(df_start_at_fivehundret.index.get_level_values('id').values)


# erstelle datenframe auf basis der overlapliste
nutritions_filt_gramm = nutritions.loc[id_overlap_mask]

nutrition_db2 = nutritions_filt_gramm.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

#create new nutrition db based on the above filtering
nutrition_db2.set_index('id', inplace=True)
nutrition_db2.columns = nutrition_db2.columns.droplevel(0)



In [199]:
#select 20 random recipes which are between the Fat-range of 35-40 gramms
subset_fat = nutrition_db2[(nutrition_db2['Fat'] >= 35) & (nutrition_db2['Fat'] <= 40)].sample(n=20).copy()

subset_fat_id = subset_fat.index.to_numpy()


In [200]:
subset_fat.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,217.48873,609.54474,330.09607,33.57968,137.72264,2.26831,36.67734,51.14367,3.01907,56.94197,15.69682,591.20307,35.80749,15.49787,1154.64603,11.38103,0.45136,1139.96369,0.64947,13.59643
std,193.58192,140.28993,8.78791,32.25752,49.3274,2.51251,0.97643,53.40865,1.99419,23.62967,7.98455,257.21866,12.21169,4.86165,657.63432,23.52717,0.49112,1333.9958,0.33071,19.10575
min,37.82,461.0381,316.3209,2.21908,60.04,0.0,35.14676,6.08,0.76233,29.78105,7.12813,273.2179,16.83116,6.11953,222.9983,0.09294,0.01793,193.2933,0.21252,0.07625
25%,69.79853,492.83783,322.35107,7.55662,100.19257,0.51576,35.81679,17.08777,1.74555,40.19062,10.46881,368.56598,26.99913,11.99563,755.47642,1.19676,0.11392,360.975,0.40588,2.41172
50%,127.44295,571.6166,331.6438,26.60258,153.47325,1.4591,36.84932,28.87129,2.85558,52.94833,13.57014,523.55045,34.1674,14.99472,1125.4505,2.80224,0.16894,795.4429,0.62013,7.30208
75%,312.28233,700.44155,336.81705,48.91836,171.81152,2.91781,37.42411,71.77647,3.71495,64.29415,19.16338,747.8693,43.95483,18.89772,1528.65325,9.71889,0.70234,1294.3105,0.74875,14.66976
max,708.432,905.355,345.8915,107.0733,212.2207,9.47687,38.43239,207.9386,9.3898,134.5539,42.04708,1193.162,69.14301,23.50516,3117.706,104.8181,1.68185,6331.443,1.4808,77.86107


In [202]:
subset_normal = nutrition_db2.sample(n=20).copy()
subset_normal_id = subset_normal.index.to_numpy()

In [204]:
subset_normal.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,132.85827,385.95645,181.60761,23.69738,87.05815,1.97484,20.17862,43.98828,2.57912,49.2625,11.67466,505.01168,26.41286,7.33201,768.06263,5.24918,0.23131,1087.17806,0.46982,11.43423
std,116.35727,136.70025,101.96627,18.64182,46.63282,2.16963,11.32959,33.96273,1.28129,35.04222,5.54857,229.59552,10.14185,5.36463,415.56784,5.65418,0.21594,2089.23129,0.30043,13.94517
min,13.66963,150.1325,15.39,0.15506,0.0,0.0,1.71,5.14774,0.58782,12.79541,3.13676,177.8604,6.32874,0.342,142.6859,0.0,0.00055,3.43,0.10839,0.0
25%,44.95076,305.74935,130.19088,6.40123,50.04688,0.27648,14.46565,20.16587,1.61308,28.8357,7.75324,350.44062,18.77309,3.54748,398.08855,1.14867,0.06176,242.37103,0.30834,1.93228
50%,96.7511,399.5815,176.23205,23.20832,83.14183,1.33563,19.58134,32.21345,2.20039,41.13745,10.38527,468.1594,26.68554,7.13062,832.16775,3.29664,0.17167,402.27205,0.36199,4.37123
75%,202.83993,493.99887,218.97532,34.36651,118.775,2.95659,24.33059,61.33469,3.31753,57.03867,14.34324,597.54925,34.0194,10.01424,1069.1205,7.25609,0.2955,894.43508,0.6075,17.65577
max,456.7527,673.8499,399.3261,67.42516,180.5,7.22278,44.36957,126.1644,5.43513,168.1483,23.27805,993.4792,42.24757,20.5533,1722.049,21.40838,0.70105,9392.679,1.41892,41.84249


In [15]:
# remove recipes which are missing in nutrition df from the ingredients tecipes dataframe
mask = recipe_db.index.isin(nutrition_db2.index.values)
recipe_db = recipe_db.loc[mask]


In [205]:
user_recipes_fat = recipe_db[recipe_db.index.isin(subset_fat_id)]
user_recipes_normal = recipe_db[recipe_db.index.isin(subset_normal_id)]

In [17]:
#top10 most common ingredients
#16421    2125 salt
#4342     1506 garlic
#4397     1412 onion
#16406    1185 ground black pepper
#16157    1016 butter
#6307      944 olive oil
#6494      786 skinless bone less chicken breast
#2496      749 water
#16238     574 grated Parmesan cheese
#16317     538 eggs


drop_id_list = [16421, 4342, 4397, 16406, 16157, 6307, 6494, 2496, 16238, 16317]





In [206]:
#filter out ingredients which are too common and the recipes in the user profile from the recipes database
new_recipe_db = recipe_db.drop(axis=1, labels=drop_id_list)
new_recipe_db_wo_userrecipes_fat = new_recipe_db.drop(axis=0, labels=subset_fat_id)
new_recipe_db_wo_userrecipes_normal = new_recipe_db.drop(axis=0, labels=subset_normal_id)

In [19]:
#remove basic ingredients like garlic, olive oil, skinless bone less chicken breast, butter, milk, salt and pepper, onion
new_user_recipes_fat = user_recipes_fat.drop(axis=1, labels=drop_id_list)
new_user_recipes_normal = user_recipes_normal.drop(axis=1, labels=drop_id_list)

In [20]:
## Jaccard - recipe to recipe similarity without any removal of basis ingredients and duplicate userrecipes

result_array = cdist(user_recipes_fat, recipe_db,'jaccard')
result_wo_filter = pd.DataFrame(result_array, columns=recipe_db.index.values, index=user_recipes_fat.index.values)
# result_2

result_wo_filter_10 = pd.DataFrame(result_wo_filter.sum(), columns=['jaccard_distance_sum'])
result_wo_filter_10 = result_wo_filter_10.sort_values(by='jaccard_distance_sum')
result_wo_filter_10['jaccard_distance_sum'] = result_wo_filter_10['jaccard_distance_sum'].div(20)
result_wo_filter_10[0:10]


Unnamed: 0,jaccard_distance_sum
158587,0.85511
71291,0.85877
70522,0.85896
11707,0.85911
11832,0.86056
17311,0.86069
114351,0.86194
52464,0.86364
132511,0.8643
34437,0.86482


In [21]:
pd.Series(np.intersect1d(id_list,result_wo_filter_10[0:10].index.to_numpy()))


0    114351
1    158587
dtype: int64

In [22]:
## Jaccard - recipe to recipe similarity without any removal of basis ingredients 

result_array = cdist(new_user_recipes_fat, new_recipe_db,'jaccard')
result_wo_basis_filter = pd.DataFrame(result_array, columns=new_recipe_db.index.values, index=new_user_recipes_fat.index.values)
# result_2

result_wo_basis_filter_10 = pd.DataFrame(result_wo_basis_filter.sum(), columns=['jaccard_distance_sum'])
result_wo_basis_filter_10 = result_wo_basis_filter_10.sort_values(by='jaccard_distance_sum')
result_wo_basis_filter_10['jaccard_distance_sum'] = result_wo_basis_filter_10['jaccard_distance_sum'].div(20)
result_wo_basis_filter_10[0:10]


Unnamed: 0,jaccard_distance_sum
16304,0.90398
158587,0.91643
177777,0.92115
42967,0.92134
114351,0.92276
216026,0.92367
234797,0.92391
35469,0.92466
8757,0.92613
8547,0.92613


In [23]:
result_wo_basis_filter_10[0:10].index.to_numpy()

array([ 16304, 158587, 177777,  42967, 114351, 216026, 234797,  35469,
         8757,   8547])

In [24]:
id_list

array([169974, 114351, 216026,  42967, 215616, 164208,  22931, 158587,
        26299,  16304, 169322,  38028, 177777, 222509, 234797,  86813,
        14668,  16682,  35469,   8757])

In [25]:
pd.Series(np.intersect1d(id_list,result_wo_basis_filter_10[0:10].index.to_numpy()))


0      8757
1     16304
2     35469
3     42967
4    114351
5    158587
6    177777
7    216026
8    234797
dtype: int64

In [191]:
## Jaccard - recipe to recipe similarity with removal of basis ingredients and duplicate userrecipes

result_array = cdist(new_user_recipes_fat, new_recipe_db_wo_userrecipes_fat,'jaccard')
result_w_filter = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_fat.index.values, index=new_user_recipes_fat.index.values)
# result_2

result_w_filter_10 = pd.DataFrame(result_w_filter.sum(), columns=['jaccard_distance_sum'])
result_w_filter_10 = result_w_filter_10.sort_values(by='jaccard_distance_sum')
result_w_filter_10['jaccard_distance_sum'] = result_w_filter_10['jaccard_distance_sum'].div(20)
result_w_filter_10[0:10]


Unnamed: 0,jaccard_distance_sum
8547,0.92613
22991,0.9264
230283,0.9284
14587,0.929
19301,0.93156
240619,0.93254
19324,0.933
180905,0.93562
161869,0.93796
19422,0.93818


In [186]:
result_w_filter_10

Unnamed: 0,jaccard_distance_sum
8547,0.92613
22991,0.92640
230283,0.92840
14587,0.92900
19301,0.93156
...,...
41393,1.00000
22538,1.00000
70096,1.00000
14551,1.00000


In [192]:

result_w_filter_10 = result_w_filter_10.reset_index()


In [193]:
result_w_filter_10.set_index('index', inplace=True)

In [194]:
result_w_filter_10

Unnamed: 0_level_0,jaccard_distance_sum
index,Unnamed: 1_level_1
8547,0.92613
22991,0.92640
230283,0.92840
14587,0.92900
19301,0.93156
...,...
41393,1.00000
22538,1.00000
70096,1.00000
14551,1.00000


In [195]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(result_w_filter_10.index.get_level_values('index').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_jaccard_top_10 = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_jaccard_top_10.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_jaccard_top_10.columns = nutrition_jaccard_top_10.columns.droplevel(0)

# entferne alle NA
nutrition_jaccard_top_10 = nutrition_jaccard_top_10.dropna()

In [196]:
nutrition_jaccard_top_10.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0
mean,146.95446,436.38406,205.41119,29.056,96.96383,2.59212,22.82347,60.27003,3.14749,51.31239,12.43405,531.27784,28.0334,8.70033,850.59829,6.72939,0.30341,1167.06599,0.50977,16.26599
std,159.43519,182.62195,125.79641,22.74143,60.47518,2.50767,13.97738,59.63294,2.44253,29.0608,6.89504,267.0716,12.01635,6.78157,644.31205,8.83209,0.30128,2326.37175,0.31239,24.36387
min,2.247,51.34326,1.323,0.0,0.0,0.0,0.147,0.0,0.19784,0.99745,0.03511,21.35641,0.7505,0.01992,5.20206,0.0,0.00055,0.0,0.00497,0.0
25%,41.76382,302.479,116.2168,10.22224,60.27,0.74399,12.91297,15.85232,1.81151,33.17641,7.39561,345.256,19.94853,3.61286,417.7735,1.76207,0.10497,235.95,0.28385,2.145
50%,82.90875,405.8992,182.5449,24.74984,84.80499,1.94179,20.28276,38.6275,2.76611,45.87583,11.14604,480.107,26.83928,7.11517,733.8654,3.89837,0.19721,578.6025,0.44459,6.65546
75%,192.758,539.7248,270.5533,43.23486,126.6336,3.6642,30.06147,87.71037,3.91886,63.1891,16.54115,652.9858,34.28136,12.0583,1091.809,7.83809,0.39568,1108.596,0.68273,19.5215
max,1264.326,1828.192,989.103,236.7205,708.85,27.83175,109.9003,519.5359,55.89075,590.3922,47.57578,1931.488,99.72639,59.31002,7322.289,104.8181,2.35457,38664.7,3.83736,270.4131


In [27]:
result_w_filter_10[0:40].index.to_numpy()

array([  8547,  22991, 230283,  14587,  19301, 240619,  19324, 180905,
       161869,  19422,  11832,  14604, 228680, 231808, 132511,  46813,
       214046,   8694,  52934,  18841,  12073,  14735,  17996, 216231,
       188706,  14614,  18349, 215680,  24162, 263813,  25877,  11937,
       232465,  18805,  23058, 195045,  31965, 150156,  19368, 140653])

In [28]:
id_list

array([169974, 114351, 216026,  42967, 215616, 164208,  22931, 158587,
        26299,  16304, 169322,  38028, 177777, 222509, 234797,  86813,
        14668,  16682,  35469,   8757])

In [29]:
pd.Series(np.intersect1d(id_list,result_w_filter_10[0:10].index.to_numpy()))


Series([], dtype: int64)

In [30]:
result_w_filter[14127]

8757     1.00000
14668    1.00000
16304    1.00000
16682    1.00000
22931    1.00000
26299    0.90000
35469    1.00000
38028    1.00000
42967    0.90909
86813    1.00000
114351   1.00000
158587   1.00000
164208   0.88889
169322   1.00000
169974   1.00000
177777   1.00000
215616   0.83333
216026   1.00000
222509   1.00000
234797   1.00000
Name: 14127, dtype: float64

In [31]:
test = result_w_filter_10.reset_index()

In [32]:
test.set_index('index', inplace=True)

In [33]:
df['C'] = np.arange(len(df))

In [34]:
test.loc[14127]

jaccard_distance_sum   0.97657
Name: 14127, dtype: float64

In [35]:
#create one dimensional ingredients vector from user recipes profile
user_one_vector = new_user_recipes_fat.copy()
user_one_vector.loc['sum',:]= user_one_vector.sum(axis=0)
user_one_vector = user_one_vector.drop(axis=0, labels=subset_fat_id)
user_one_vector[user_one_vector > 0] = 1

In [36]:
#show number of ingredients
user_one_vector.apply(pd.value_counts).count(axis=1)

0.00000    616
1.00000    104
dtype: int64

In [37]:
## Jaccard - user vector to recipe similarity with removal of basis ingredients and duplicate userrecipes

result_array = cdist(user_one_vector, new_recipe_db_wo_userrecipes_fat,'jaccard')
result_w_filter_vector = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_fat.index.values, index=user_one_vector.index.values)


result_w_filter_vector_10 = pd.DataFrame(result_w_filter_vector.sum(), columns=['jaccard_distance_sum'])
result_w_filter_vector_10 = result_w_filter_vector_10.sort_values(by='jaccard_distance_sum')

result_w_filter_vector_10[0:20]


Unnamed: 0,jaccard_distance_sum
231396,0.88679
236805,0.88889
180905,0.8972
13905,0.90654
125658,0.91429
102831,0.91509
218982,0.91509
219166,0.91509
11921,0.91589
151153,0.91589


In [38]:
pd.Series(np.intersect1d(id_list, result_w_filter_vector_10[0:10].index.to_numpy()))


Series([], dtype: int64)

In [39]:
result_w_filter_vector_10[0:20].index.to_numpy()

array([231396, 236805, 180905,  13905, 125658, 102831, 218982, 219166,
        11921, 151153,   9005, 112206,  83083,  76373, 232465,  25877,
        18349, 228680,   8694, 233571])

In [40]:
id_list

array([169974, 114351, 216026,  42967, 215616, 164208,  22931, 158587,
        26299,  16304, 169322,  38028, 177777, 222509, 234797,  86813,
        14668,  16682,  35469,   8757])

In [41]:
recommenden_nut = pd.DataFrame(data=None, columns=nutrition_db2.columns, index=None)

In [42]:
# 19–30 empfohlene nährwerte pro tag #https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t4/?report=objectonly
recommenden_nut = pd.DataFrame(data=None, columns=nutrition_db2.columns, index=None)
recommenden_nut.loc['index'] = [
    # "Calcium":[1],
    333.3,
    # "Calories":[1],
    800,
    # "Calories from Fat":[1],
    0,
    # "Carbohydrates":[1],
    43.3,
    # "Cholesterol":[1],
    0,
    # "Dietary Fiber	":[1],
    8.3,
    # "Fat":[1],
    30,
    # "Folate":[1],
    133.3,
    # "Iron":[1],
    2.6,
    # "Magnesium":[1],
    133.3,
    # "Niacin Equivalents":[1],
    5.3,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    18.6,
    # "Saturated Fat":[1],
    0,
    # "Sodium":[1],
    0.5,
    # "Sugars":[1],
    0,
    # "Thiamin":[1],
    0.4,
    # "Vitamin A - IU":[1],
    300,
    # "Vitamin B6":[1],
    0.43,
    # "Vitamin C":[1]}
    30]
recommenden_nut

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
index,333.3,800.0,0.0,43.3,0.0,8.3,30.0,133.3,2.6,133.3,5.3,1.0,18.6,0.0,0.5,0.0,0.4,300.0,0.43,30.0


In [43]:
result_w_filter_10[0:10].index.to_numpy()

array([  8547,  22991, 230283,  14587,  19301, 240619,  19324, 180905,
       161869,  19422])

In [44]:
id_list

array([169974, 114351, 216026,  42967, 215616, 164208,  22931, 158587,
        26299,  16304, 169322,  38028, 177777, 222509, 234797,  86813,
        14668,  16682,  35469,   8757])

In [45]:
new_ingredients

Unnamed: 0,id,ingredients_id,ingredients_name,ingredients_grams,ingredients_type
0,59661,16157,10 g butter,11.36000,Normal
1,59661,4405,40 g sliced green onions,41.80000,Normal
2,59661,4342,"1-1/2 cloves garlic, minced",4.80000,Normal
3,59661,4520,3/4 (10 ounce) package frozen chopped spinach ...,227.20000,Normal
4,59661,16243,180 g ricotta cheese,182.40001,Normal
...,...,...,...,...,...
36685,229659,16423,"1-3/4 sprigs fresh thyme, divided",0.66667,Normal
36686,229659,20270,"3/8 white onion, chopped - divided",36.66667,Normal
36687,229659,16157,"40 g butter, sliced into pats",37.83334,Normal
36688,229659,4292,80 g chopped fresh celery leaves,80.00000,Normal


In [46]:
new_ingredients.hist(column='ingredients_id', bins=700)

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x121f8ee50>]],
      dtype=object)

In [47]:
asd = new_ingredients['ingredients_id'].value_counts()

In [48]:
asd

16421    2125
4342     1506
4397     1412
16406    1185
16157    1016
         ... 
23047       1
20486       1
18921       1
21368       1
20792       1
Name: ingredients_id, Length: 730, dtype: int64

In [49]:
result_w_filter_100 =result_w_filter_10[0:100].copy()

In [50]:
result_w_filter_100['pos'] = np.arange(len(result_w_filter_100))

In [51]:
result_w_filter_100 = result_w_filter_100.reset_index()

In [52]:
result_w_filter_100 = result_w_filter_100.set_index('index')

In [53]:
result_w_filter_100

Unnamed: 0_level_0,jaccard_distance_sum,pos
index,Unnamed: 1_level_1,Unnamed: 2_level_1
8547,0.92613,0
22991,0.92640,1
230283,0.92840,2
14587,0.92900,3
19301,0.93156,4
...,...,...
233571,0.95197,95
26260,0.95198,96
14636,0.95201,97
89721,0.95215,98


In [54]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(result_w_filter_100.index.get_level_values('index').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_db3 = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db3.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_db3.columns = nutrition_db3.columns.droplevel(0)

# entferne alle NA
nutrition_db3 = nutrition_db3.dropna()

In [138]:
# 19–30 empfohlene nährwerte pro tag #https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t4/?report=objectonly
recommenden_nut_low_fat = pd.DataFrame(data=None, columns=nutrition_db2.columns, index=None)
recommenden_nut_low_fat.loc['index'] = [
    # "Calcium":[1],
    333.3,
    # "Calories":[1],
    800,
    # "Calories from Fat":[1],
    0,
    # "Carbohydrates":[1],
    43.3,
    # "Cholesterol":[1],
    0,
    # "Dietary Fiber	":[1],
    8.3,
    # "Fat":[1],
    1,
    # "Folate":[1],
    133.3,
    # "Iron":[1],
    2.6,
    # "Magnesium":[1],
    133.3,
    # "Niacin Equivalents":[1],
    5.3,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    18.6,
    # "Saturated Fat":[1],
    0,
    # "Sodium":[1],
    0.5,
    # "Sugars":[1],
    0,
    # "Thiamin":[1],
    0.4,
    # "Vitamin A - IU":[1],
    300,
    # "Vitamin B6":[1],
    0.43,
    # "Vitamin C":[1]}
    30]
recommenden_nut

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
index,333.3,800.0,0.0,43.3,0.0,8.3,30.0,133.3,2.6,133.3,5.3,1.0,18.6,0.0,0.5,0.0,0.4,300.0,0.43,30.0


In [139]:
# 19–30 empfohlene nährwerte pro tag #https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t4/?report=objectonly
recommenden_nut_low_carbs = pd.DataFrame(data=None, columns=nutrition_db2.columns, index=None)
recommenden_nut_low_carbs.loc['index'] = [
    # "Calcium":[1],
    333.3,
    # "Calories":[1],
    800,
    # "Calories from Fat":[1],
    0,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    0,
    # "Dietary Fiber	":[1],
    8.3,
    # "Fat":[1],
    20,
    # "Folate":[1],
    133.3,
    # "Iron":[1],
    2.6,
    # "Magnesium":[1],
    133.3,
    # "Niacin Equivalents":[1],
    5.3,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    18.6,
    # "Saturated Fat":[1],
    0,
    # "Sodium":[1],
    0.5,
    # "Sugars":[1],
    0,
    # "Thiamin":[1],
    0.4,
    # "Vitamin A - IU":[1],
    300,
    # "Vitamin B6":[1],
    0.43,
    # "Vitamin C":[1]}
    30]
recommenden_nut

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
index,333.3,800.0,0.0,43.3,0.0,8.3,30.0,133.3,2.6,133.3,5.3,1.0,18.6,0.0,0.5,0.0,0.4,300.0,0.43,30.0


In [140]:
# euklidische distanz für nährwerte
no_weight = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    1,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]

weighted_fat = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    10000000,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]

weighted_carbs = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1000000,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    1,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]


result_array = cdist(recommenden_nut_low_carbs, nutrition_db3, 'minkowski', p=2, w=weighted_carbs)
euclid_distance = pd.DataFrame(
    result_array, columns=nutrition_db3.index.values, index=recommenden_nut.index.values)

euclid_distance_sum_carbs = pd.DataFrame(
    euclid_distance.sum(), columns=['euclid_distance_sum'])
euclid_distance_sum_carbs = euclid_distance_sum_carbs.sort_values(by='euclid_distance_sum')
euclid_distance_sum_carbs[0:10]

Unnamed: 0,euclid_distance_sum
20100,3593.0898
229277,4526.79447
25877,8375.80445
8547,8421.21207
20606,9089.34335
11832,11261.16514
14614,11984.20615
52464,12082.38184
16602,13061.68059
105016,13540.12259


In [141]:
jaccard_euclid_joined_carbs = pd.merge(euclid_distance_sum_carbs, result_w_filter_100, left_index=True, right_index=True)

In [142]:
jaccard_euclid_joined_carbs[0:20]

Unnamed: 0,euclid_distance_sum,jaccard_distance_sum,pos
20100,3593.0898,0.95019,68
229277,4526.79447,0.95173,93
25877,8375.80445,0.94456,30
8547,8421.21207,0.92613,0
20606,9089.34335,0.94754,50
11832,11261.16514,0.93826,10
14614,11984.20615,0.94352,25
52464,12082.38184,0.94773,51
16602,13061.68059,0.95108,76
105016,13540.12259,0.9511,78


In [143]:
jaccard_euclid_joined_carbs[0:20].index.to_numpy()

array([ 20100, 229277,  25877,   8547,  20606,  11832,  14614,  52464,
        16602, 105016, 216914, 216902, 228450, 132511,  68380,  16978,
       231396,  21021,  84044,  26614])

In [144]:
id_list

array([169974, 114351, 216026,  42967, 215616, 164208,  22931, 158587,
        26299,  16304, 169322,  38028, 177777, 222509, 234797,  86813,
        14668,  16682,  35469,   8757])

In [145]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(jaccard_euclid_joined_carbs[0:20].index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_top20_carbs = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_top20_carbs.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_top20_carbs.columns = nutrition_top20_carbs.columns.droplevel(0)

# entferne alle NA
nutrition_top20_carbs = nutrition_top20_carbs.dropna()

In [146]:
nutrition_top20_carbs.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,232.80897,441.21565,263.47568,14.89874,139.05943,1.52673,29.27507,38.51919,2.53864,40.96111,12.67663,457.2088,28.79743,14.75222,727.31931,3.609,0.20827,1441.3599,0.44272,20.75536
std,124.51105,126.92228,91.15115,5.41389,74.02676,1.20651,10.1279,19.9054,1.18942,11.69708,7.07195,152.81855,11.16186,6.25299,347.00075,2.97849,0.15671,912.48502,0.2542,36.78659
min,68.09885,216.952,103.7408,4.44503,16.256,0.2032,11.52676,13.06472,1.08367,21.07063,3.23102,166.8428,8.33184,3.67949,167.1442,0.35572,0.04025,331.4846,0.07274,0.33399
25%,157.43647,333.20508,187.88507,11.66502,80.9772,0.58755,20.87612,20.49061,1.78654,32.18264,8.08036,362.17585,22.20007,9.97146,542.963,1.58669,0.09246,807.48828,0.28054,2.90062
50%,196.0742,446.41015,248.8011,14.87917,140.73475,1.28332,27.64457,37.92956,2.10808,39.11562,10.27193,444.41395,27.50297,13.17364,665.5493,2.85883,0.14487,1283.48,0.35866,9.72693
75%,356.95022,524.49665,339.68033,19.42903,179.79188,1.79294,37.74226,50.27422,3.34453,49.9216,16.80057,589.2219,34.29183,20.28089,876.01245,4.87895,0.31885,1861.591,0.60429,20.85158
max,479.4408,641.8463,411.5434,22.26139,359.6714,4.7036,45.72704,75.14988,5.34475,64.3588,29.67555,680.962,50.00866,26.02578,1722.324,12.2315,0.49511,4513.288,1.12702,160.6108


In [147]:
nutrition_db.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0
mean,146.27411,436.40975,205.37237,29.33344,96.51821,2.63126,22.81915,60.7766,3.17214,51.61693,12.29695,537.39301,27.76348,8.69244,846.57719,6.72518,0.30159,1221.36958,0.50872,16.72761
std,159.44644,208.88694,142.06851,23.54763,64.77935,2.63839,15.78539,60.62238,2.60958,32.33175,7.29271,325.05535,13.36397,7.23535,736.5021,9.17572,0.30494,2557.63597,0.3445,26.92523
min,0.93181,13.63194,0.9275,0.0,0.0,0.0,0.10306,0.0,0.07193,0.85861,0.03511,4.66014,0.44604,0.01992,2.67762,0.0,0.00012,0.0,0.00252,0.0
25%,41.70671,293.5317,110.9611,10.17949,58.52,0.724,12.32901,15.83472,1.76227,31.96074,7.13499,330.8866,19.10435,3.46839,403.2487,1.702,0.10256,226.2493,0.26951,2.03857
50%,82.98421,402.2169,179.0905,24.7623,83.20487,1.90878,19.89894,39.23425,2.72961,45.60215,10.96189,474.423,26.44455,6.94486,719.3031,3.83038,0.19551,569.1794,0.437,6.49813
75%,191.935,543.941,270.1248,43.31408,126.135,3.67149,30.01386,88.15191,3.95205,63.72025,16.48777,660.3365,34.30157,11.9691,1084.226,7.8129,0.39492,1118.828,0.68543,19.57264
max,1264.326,4709.199,3455.29,236.7205,979.7761,27.83175,383.9211,519.5359,55.89075,590.3922,109.0173,6063.162,273.2163,103.4358,22099.37,136.5051,3.25879,40386.87,5.98334,507.2771


In [148]:
result_array = cdist(recommenden_nut_low_fat, nutrition_db3, 'minkowski', p=2, w=weighted_fat)
euclid_distance = pd.DataFrame(
    result_array, columns=nutrition_db3.index.values, index=recommenden_nut_low_fat.index.values)

euclid_distance_sum_fat = pd.DataFrame(
    euclid_distance.sum(), columns=['euclid_distance_sum'])
euclid_distance_sum_fat = euclid_distance_sum_fat.sort_values(by='euclid_distance_sum')
euclid_distance_sum_fat[0:10]

Unnamed: 0,euclid_distance_sum
143078,20155.62579
21021,33346.83979
232465,42982.76626
18442,42993.88484
23717,44123.69806
23181,45190.38766
16998,49893.71803
195045,50635.35647
26614,51624.49039
52934,51795.66121


In [149]:
jaccard_euclid_joined_fat = pd.merge(euclid_distance_sum_fat, result_w_filter_100, left_index=True, right_index=True)

In [150]:
jaccard_euclid_joined_fat[0:20]

Unnamed: 0,euclid_distance_sum,jaccard_distance_sum,pos
143078,20155.62579,0.95131,83
21021,33346.83979,0.94893,59
232465,42982.76626,0.94491,32
18442,42993.88484,0.95116,79
23717,44123.69806,0.94617,42
23181,45190.38766,0.9467,43
16998,49893.71803,0.95138,85
195045,50635.35647,0.94525,35
26614,51624.49039,0.95154,89
52934,51795.66121,0.94168,18


In [151]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(jaccard_euclid_joined_fat[0:20].index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_top20_fat = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_top20_fat.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_top20_fat.columns = nutrition_top20_fat.columns.droplevel(0)

# entferne alle NA
nutrition_top20_fat = nutrition_top20_fat.dropna()

In [152]:
nutrition_top20_fat.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,225.63097,361.33828,149.482,33.49949,62.71316,3.25757,16.60911,87.13427,2.65907,48.03571,9.10122,486.26236,19.30083,7.68102,685.19848,6.80457,0.25233,1077.00305,0.3419,20.04686
std,124.08694,58.5463,26.97969,12.77928,24.93021,1.79744,2.99774,42.82209,0.92428,13.61981,2.75874,179.60648,5.70006,2.09446,255.1705,4.9154,0.21036,1096.15959,0.14174,36.87455
min,48.81821,216.952,66.31706,4.44503,11.68765,0.2032,7.36856,20.6948,1.26368,28.65933,4.33836,169.094,8.33184,2.56057,243.0307,0.38131,0.02461,126.4188,0.07887,0.10675
25%,118.1683,329.6886,136.81412,26.0169,51.34537,1.78807,15.20157,59.01698,1.90403,35.91155,7.56586,365.44502,16.05611,6.93771,530.09057,3.20384,0.10069,511.94425,0.25563,2.25269
50%,209.0004,368.177,156.5218,33.70431,64.68534,3.21772,17.39131,77.48428,2.48806,46.07156,9.15311,489.9811,20.9192,7.43219,650.95165,5.55564,0.18206,740.0479,0.31664,5.96437
75%,306.03388,394.4455,163.20397,44.3802,73.56875,4.52479,18.13377,112.6564,3.35078,62.30237,9.89506,622.57812,22.94034,8.59722,842.19102,8.83287,0.36099,1095.61675,0.45228,21.71645
max,508.5031,481.0773,181.2275,53.13089,124.241,6.77872,20.13639,211.4918,4.76909,73.49545,15.38974,829.5756,28.37642,11.73836,1183.618,17.42694,0.82351,4513.288,0.5821,160.6108


In [153]:
nutrition_db.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0
mean,146.27411,436.40975,205.37237,29.33344,96.51821,2.63126,22.81915,60.7766,3.17214,51.61693,12.29695,537.39301,27.76348,8.69244,846.57719,6.72518,0.30159,1221.36958,0.50872,16.72761
std,159.44644,208.88694,142.06851,23.54763,64.77935,2.63839,15.78539,60.62238,2.60958,32.33175,7.29271,325.05535,13.36397,7.23535,736.5021,9.17572,0.30494,2557.63597,0.3445,26.92523
min,0.93181,13.63194,0.9275,0.0,0.0,0.0,0.10306,0.0,0.07193,0.85861,0.03511,4.66014,0.44604,0.01992,2.67762,0.0,0.00012,0.0,0.00252,0.0
25%,41.70671,293.5317,110.9611,10.17949,58.52,0.724,12.32901,15.83472,1.76227,31.96074,7.13499,330.8866,19.10435,3.46839,403.2487,1.702,0.10256,226.2493,0.26951,2.03857
50%,82.98421,402.2169,179.0905,24.7623,83.20487,1.90878,19.89894,39.23425,2.72961,45.60215,10.96189,474.423,26.44455,6.94486,719.3031,3.83038,0.19551,569.1794,0.437,6.49813
75%,191.935,543.941,270.1248,43.31408,126.135,3.67149,30.01386,88.15191,3.95205,63.72025,16.48777,660.3365,34.30157,11.9691,1084.226,7.8129,0.39492,1118.828,0.68543,19.57264
max,1264.326,4709.199,3455.29,236.7205,979.7761,27.83175,383.9211,519.5359,55.89075,590.3922,109.0173,6063.162,273.2163,103.4358,22099.37,136.5051,3.25879,40386.87,5.98334,507.2771


In [165]:
result_array = cdist(recommenden_nut, nutrition_db3, 'minkowski', p=2, w=no_weight)
euclid_distance = pd.DataFrame(
    result_array, columns=nutrition_db3.index.values, index=recommenden_nut.index.values)

euclid_distance_sum = pd.DataFrame(
    euclid_distance.sum(), columns=['euclid_distance_sum'])
euclid_distance_sum = euclid_distance_sum.sort_values(by='euclid_distance_sum')
euclid_distance_sum[0:10]

Unnamed: 0,euclid_distance_sum
150156,668.61959
16998,669.27187
19422,759.75811
19324,790.15793
143078,814.74933
18805,850.16739
26614,856.64922
14614,900.78376
216914,905.94805
52934,951.9759


In [166]:
jaccard_euclid_joined = pd.merge(euclid_distance_sum, result_w_filter_100, left_index=True, right_index=True)

In [167]:
jaccard_euclid_joined[0:20]

Unnamed: 0,euclid_distance_sum,jaccard_distance_sum,pos
150156,668.61959,0.94543,37
16998,669.27187,0.95138,85
19422,759.75811,0.93818,9
19324,790.15793,0.933,6
143078,814.74933,0.95131,83
18805,850.16739,0.94507,33
26614,856.64922,0.95154,89
14614,900.78376,0.94352,25
216914,905.94805,0.9481,55
52934,951.9759,0.94168,18


In [168]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(jaccard_euclid_joined[0:20].index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_top20 = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_top20.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_top20.columns = nutrition_top20.columns.droplevel(0)

# entferne alle NA
nutrition_top20 = nutrition_top20.dropna()

In [169]:
nutrition_top20.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,225.31266,404.44916,199.03767,31.1936,84.78608,1.91138,22.1153,90.08384,2.38563,42.22751,8.7,339.67471,20.19299,11.68777,516.23087,3.63598,0.30388,690.31136,0.24664,9.314
std,126.73302,76.19736,51.02366,13.32301,30.07009,1.34378,5.66929,46.21913,0.86197,12.8713,4.59963,133.87192,7.89784,3.73455,140.15621,2.17361,0.2106,206.84604,0.13487,16.18968
min,48.81821,293.6935,66.31706,4.44503,11.68765,0.2032,7.36856,15.40374,1.08367,25.38203,4.33836,166.8428,10.80376,2.56057,243.0307,0.38131,0.02461,209.2773,0.07887,0.10675
25%,118.1683,359.01662,166.28878,24.58361,68.66725,1.20913,18.47653,60.12996,1.61366,33.5404,6.32075,232.26857,16.83354,9.79424,441.01673,2.18473,0.16977,574.58067,0.12313,0.49434
50%,189.02045,398.61,202.5066,30.64144,82.88584,1.874,22.50073,89.00666,2.40153,40.51551,7.8839,324.6009,18.9371,12.13372,533.3879,3.35454,0.27855,732.3079,0.24021,2.33149
75%,337.68735,449.28827,230.84748,43.66485,99.82651,2.22018,25.64972,109.18187,3.05235,43.92696,9.69416,387.8232,22.14248,14.4559,603.8248,4.40843,0.37104,794.24555,0.31446,9.42497
max,485.1693,613.2035,269.2342,53.13089,149.345,6.77872,29.91492,211.4918,3.95681,79.57173,25.88058,587.751,49.29259,18.42387,754.2238,7.97937,0.82351,1090.203,0.5195,67.36604


In [170]:
nutrition_db.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0,4061.0
mean,146.27411,436.40975,205.37237,29.33344,96.51821,2.63126,22.81915,60.7766,3.17214,51.61693,12.29695,537.39301,27.76348,8.69244,846.57719,6.72518,0.30159,1221.36958,0.50872,16.72761
std,159.44644,208.88694,142.06851,23.54763,64.77935,2.63839,15.78539,60.62238,2.60958,32.33175,7.29271,325.05535,13.36397,7.23535,736.5021,9.17572,0.30494,2557.63597,0.3445,26.92523
min,0.93181,13.63194,0.9275,0.0,0.0,0.0,0.10306,0.0,0.07193,0.85861,0.03511,4.66014,0.44604,0.01992,2.67762,0.0,0.00012,0.0,0.00252,0.0
25%,41.70671,293.5317,110.9611,10.17949,58.52,0.724,12.32901,15.83472,1.76227,31.96074,7.13499,330.8866,19.10435,3.46839,403.2487,1.702,0.10256,226.2493,0.26951,2.03857
50%,82.98421,402.2169,179.0905,24.7623,83.20487,1.90878,19.89894,39.23425,2.72961,45.60215,10.96189,474.423,26.44455,6.94486,719.3031,3.83038,0.19551,569.1794,0.437,6.49813
75%,191.935,543.941,270.1248,43.31408,126.135,3.67149,30.01386,88.15191,3.95205,63.72025,16.48777,660.3365,34.30157,11.9691,1084.226,7.8129,0.39492,1118.828,0.68543,19.57264
max,1264.326,4709.199,3455.29,236.7205,979.7761,27.83175,383.9211,519.5359,55.89075,590.3922,109.0173,6063.162,273.2163,103.4358,22099.37,136.5051,3.25879,40386.87,5.98334,507.2771
