In [169]:
import json

import numpy as np
import pandas as pd
import pymongo
import scipy
from bson import ObjectId, json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient as Connection
from scipy import sparse, spatial
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# scientific notation disabled form smaller numbers
pd.options.display.float_format = '{:.5f}'.format

# alles resultate anzeigen und nicht nur das letzte
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [170]:
# display multiple outputs in one row
import pandas as pd
import numpy as np
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [171]:
connection = Connection()
db = connection.recipe_db
input_data = db.recipes_test_copy

data = json.loads(json_util.dumps(input_data.find()))

In [172]:
ing = pd.DataFrame(json_normalize(data, record_path='ingredients',
                             meta='id', record_prefix='ingredients_', errors='ignore'))


nutritions = pd.DataFrame(json_normalize(data, record_path='nutritions',
                            meta=['id', 'prep_time', 'rating', 'rating_count', 'ready_in_time', 'review_count']))


In [173]:
#ingredients
#ingredients.loc[ingredients['id'] == 9380]

In [174]:
#ingredients.loc[ingredients['ingredients_id'] == 2972]

In [175]:
#  ------  erstellung und data cleansing - Ingredients

# schmeiss alle zutaten raus, die weniger als 5 mal verwendet werden
# setze multiinde auf 'id' und 'ingredients_id'
ingredients = ing.set_index(['id', 'ingredients_id'])

# filtere alle Zutaten samt ihrer rezepte id, die weniger gleich 5 mal vorkommen
ingredients_eqles_5_ing = ingredients.groupby(
    'ingredients_id').filter(lambda x: len(x) <= 5)

# droppe alle rezepte, die eine Zutate besitzen, die weniger gleich 5 Mal vorkommen
ingredients_filt = ingredients.drop(ingredients_eqles_5_ing.index.get_level_values('id').values, level=0)

# drop alls rows with ingredients_id == 0
ingredients_eqal_zero = ingredients_filt[ingredients_filt.index.get_level_values('ingredients_id') == 0]
ingredients_filt = ingredients_filt.drop(ingredients_eqal_zero.index.get_level_values('id').values, level=0)


In [176]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(ingredients_filt.index.get_level_values('id').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_db = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_db.columns = nutrition_db.columns.droplevel(0)

# entferne alle NA
nutrition_db = nutrition_db.dropna()

# gleiche nochmals die ids der beiden dataframe nutrition und ingredients ab, da der nutrition dataframe noch NA Werte hatt
id_overlap_mask = ingredients_filt.index.get_level_values('id').isin(nutrition_db.index)
ingredients_db = ingredients_filt[id_overlap_mask]

# abgleich ob anzahl der indizes von nutrition und zutaten dataframe gleich sind


In [177]:
nutrition_db.index.nunique()
ingredients_db.index.get_level_values('id').nunique()


4062

4062

In [178]:
nutritions['id']

0          59661
1          59661
2          59661
3          59661
4          59661
           ...  
126095    244188
126096    244188
126097    244188
126098    244188
126099    244188
Name: id, Length: 126100, dtype: object

In [179]:


ingredients_db.reset_index(inplace=True)

recipe_db = pd.get_dummies(ingredients_db['ingredients_id']).groupby(
    ingredients_db['id']).apply(max)



In [180]:
new_ingredients = ingredients_db.copy()
#new_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index()
gramms_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index().copy()

Q1 = gramms_ingredients.quantile(0.25)
Q3 = gramms_ingredients.quantile(0.75)
IQR = Q3 - Q1

#Filter out all recipes which are outlier by their weight (gramms)
df = gramms_ingredients[~((gramms_ingredients < (Q1 - 1.5 * IQR)) |(gramms_ingredients > (Q3 + 1.5 * IQR))).any(axis=1)].copy()

#filter out recipes by weight which are not in the range 500 - 2373.59 gramms
df_start_at_fivehundret = df[df['ingredients_grams'].between(500, 2373.58225, inclusive=False)].copy()

df_start_at_fivehundret.set_index('id', inplace=True)
id_overlap_mask = nutritions['id'].isin(df_start_at_fivehundret.index.get_level_values('id').values)


# erstelle datenframe auf basis der overlapliste
nutritions_filt_gramm = nutritions.loc[id_overlap_mask]

nutrition_db2 = nutritions_filt_gramm.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

#create new nutrition db based on the above filtering
nutrition_db2.set_index('id', inplace=True)
nutrition_db2.columns = nutrition_db2.columns.droplevel(0)



In [181]:
#select 20 random recipes which are between the Fat-range of 35-40 gramms
subset_fat = nutrition_db2[(nutrition_db2['Fat'] >= 35) & (nutrition_db2['Fat'] <= 40)].sample(n=20).copy()

subset_fat_id = subset_fat.index.to_numpy()


In [182]:
subset_fat.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,238.85851,625.23455,335.8076,34.41037,134.42824,2.63719,37.31196,58.05198,3.22893,57.73384,15.2423,551.43052,36.60062,14.468,1191.74457,9.91234,0.33895,816.09034,0.60973,9.6509
std,229.09583,139.53321,14.33978,30.38971,61.99192,2.1295,1.59331,57.107,1.47966,22.12646,7.76502,259.19207,17.22302,6.00894,916.43903,23.03661,0.29711,625.71962,0.38016,11.7867
min,22.25,390.7198,316.4884,1.23348,41.7375,0.01267,35.16538,6.7225,0.58999,21.99075,6.50392,226.6823,14.9787,5.39091,309.4594,0.0975,0.00254,61.2,0.1825,0.88163
25%,61.87009,533.80088,324.18075,9.90685,93.04813,0.63773,36.02009,14.65907,2.46838,38.51588,9.18962,449.9124,25.0084,10.09683,664.2517,1.3235,0.12001,305.90217,0.37202,2.61114
50%,161.4976,615.1043,330.5914,28.28986,122.174,2.3638,36.73237,26.86872,3.32006,56.32185,13.9702,488.49385,34.5801,13.59444,1026.8765,2.78702,0.26079,662.47605,0.5808,4.95239
75%,369.3673,733.23633,352.4974,51.19213,172.8166,4.19259,39.16638,93.46076,4.31244,74.20366,18.06179,552.63028,43.03869,19.68914,1466.90925,7.78525,0.49669,1231.33675,0.74062,10.38401
max,739.7261,905.355,358.7781,107.0733,308.72,6.80708,39.86424,196.676,6.63845,104.4998,35.70167,1288.867,93.23131,23.07542,4604.161,104.8181,1.17043,2343.181,1.90431,42.96725


In [183]:
subset_normal = nutrition_db2.sample(n=20).copy()
subset_normal_id = subset_normal.index.to_numpy()

In [184]:
subset_normal.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,211.4957,518.76626,257.05309,35.74187,103.2764,3.45189,28.56145,69.49802,3.11789,52.47289,11.18658,552.52021,29.42957,11.50726,1133.67803,5.96288,0.3534,716.61142,0.52584,18.07291
std,220.02455,213.42576,167.55857,21.26642,54.01567,3.31721,18.61762,56.58976,1.83945,17.5375,5.67367,216.41205,11.04587,8.77867,943.729,8.75286,0.30062,666.43498,0.23922,31.44758
min,14.10812,220.5243,77.72115,4.54504,12.65,0.02917,8.63568,7.24333,0.44957,20.3225,3.82888,279.605,9.18529,2.59906,69.17555,1.27184,0.07251,56.75,0.11364,0.0
25%,48.99,359.18928,131.50088,19.47196,63.39833,1.06579,14.61121,28.112,1.67255,40.74167,6.59979,405.93295,20.86687,4.26439,409.81808,2.36435,0.15009,248.20717,0.32689,2.25207
50%,115.43066,528.56145,213.477,36.50668,97.7345,2.3362,23.71967,51.35306,3.00686,49.57259,11.32422,509.0093,30.23345,10.82787,885.47545,3.28783,0.26763,524.4127,0.4992,10.02415
75%,301.5083,609.95087,343.60573,49.52168,161.39535,4.50656,38.17842,103.08928,4.19786,63.76328,13.74992,678.03205,36.50952,13.92908,1589.35325,4.66415,0.41635,865.06067,0.73401,19.5467
max,752.4813,1095.748,770.4617,82.3971,178.7112,12.04811,85.60685,196.8294,8.64466,83.73882,24.97207,1152.394,49.44593,36.23195,3755.903,41.019,1.11606,2616.355,0.86662,142.9425


In [567]:
nutrition_db.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0
mean,146.2483,436.37865,205.34702,29.33343,96.53694,2.63156,22.81634,60.76312,3.1722,51.61716,12.29576,537.34508,27.76217,8.69277,846.59716,6.72692,0.30153,1221.15249,0.50863,16.72403
std,159.43529,208.87063,142.06021,23.54473,64.78237,2.63814,15.78447,60.62101,2.60926,32.32778,7.29221,325.02968,13.36258,7.23449,736.41251,9.17527,0.30493,2557.35848,0.34451,26.92288
min,0.93181,13.63194,0.9275,0.0,0.0,0.0,0.10306,0.0,0.07193,0.85861,0.03511,4.66014,0.44604,0.01992,2.67762,0.0,0.00012,0.0,0.00252,0.0
25%,41.70011,293.54442,110.92578,10.18247,58.52,0.72435,12.32509,15.83209,1.76229,31.96466,7.13552,330.8917,19.1071,3.47082,403.2909,1.70223,0.10243,226.36468,0.2693,2.03935
50%,82.97927,402.20005,179.08945,24.76714,83.21569,1.90935,19.89883,39.22277,2.7301,45.61357,10.96068,474.2425,26.43599,6.94796,719.31585,3.83273,0.19551,569.14505,0.43692,6.48954
75%,191.84663,543.9298,270.0743,43.31172,126.23355,3.67162,30.00825,88.13897,3.9511,63.71924,16.48592,660.26665,34.29652,11.96499,1084.2095,7.81531,0.3949,1118.612,0.68542,19.56792
max,1264.326,4709.199,3455.29,236.7205,979.7761,27.83175,383.9211,519.5359,55.89075,590.3922,109.0173,6063.162,273.2163,103.4358,22099.37,136.5051,3.25879,40386.87,5.98334,507.2771


In [566]:
nutrition_db2.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0
mean,147.62368,437.40977,206.09237,29.13842,97.12707,2.59778,22.89915,60.51086,3.15299,51.41984,12.44257,532.01939,28.05297,8.74708,851.17479,6.72738,0.30421,1168.79318,0.50982,16.28622
std,159.9925,183.07685,125.83504,22.8248,60.47892,2.50979,13.98167,59.87557,2.4433,29.10865,6.89345,267.47451,12.01118,6.79965,643.53732,8.81384,0.30164,2321.84247,0.3121,24.35803
min,2.247,51.34326,1.323,0.0,0.0,0.0,0.147,0.0,0.19784,0.99745,0.03511,21.35641,0.7505,0.01992,5.20206,0.0,0.00055,0.0,0.00497,0.0
25%,41.8956,302.9801,116.53312,10.22884,60.3915,0.74548,12.94812,15.88195,1.81476,33.2288,7.403,345.4553,19.97516,3.62944,418.67972,1.76526,0.1051,236.94223,0.28412,2.15411
50%,83.1664,407.19045,182.7579,24.7857,85.15604,1.94681,20.30644,38.70213,2.76656,45.931,11.15695,480.7348,26.86586,7.16939,734.111,3.90747,0.19759,581.0319,0.44613,6.66003
75%,193.59578,541.44173,271.86345,43.25593,126.76047,3.66756,30.20705,88.07055,3.93232,63.26032,16.5604,654.2057,34.31604,12.13675,1092.85675,7.83992,0.39693,1113.542,0.68317,19.56792
max,1264.326,1828.192,989.103,236.7205,708.85,27.83175,109.9003,519.5359,55.89075,590.3922,47.57578,1931.488,99.72639,59.31002,7322.289,104.8181,2.35457,38664.7,3.83736,270.4131


In [185]:
# remove recipes which are missing in nutrition df from the ingredients tecipes dataframe
mask = recipe_db.index.isin(nutrition_db2.index.values)
recipe_db = recipe_db.loc[mask]


In [186]:
user_recipes_fat = recipe_db[recipe_db.index.isin(subset_fat_id)]
user_recipes_normal = recipe_db[recipe_db.index.isin(subset_normal_id)]

In [187]:
#top10 most common ingredients
#16421    2125 salt
#4342     1506 garlic
#4397     1412 onion
#16406    1185 ground black pepper
#16157    1016 butter
#6307      944 olive oil
#6494      786 skinless bone less chicken breast
#2496      749 water
#16238     574 grated Parmesan cheese
#16317     538 eggs


drop_id_list = [16421, 4342, 4397, 16406, 16157, 6307, 6494, 2496, 16238, 16317]





In [188]:
#filter out ingredients which are too common and the recipes in the user profile from the recipes database
new_recipe_db = recipe_db.drop(axis=1, labels=drop_id_list)
new_recipe_db_wo_userrecipes_fat = new_recipe_db.drop(axis=0, labels=subset_fat_id)
new_recipe_db_wo_userrecipes_normal = new_recipe_db.drop(axis=0, labels=subset_normal_id)

In [189]:
#remove basic ingredients like garlic, olive oil, skinless bone less chicken breast, butter, milk, salt and pepper, onion
new_user_recipes_fat = user_recipes_fat.drop(axis=1, labels=drop_id_list)
new_user_recipes_normal = user_recipes_normal.drop(axis=1, labels=drop_id_list)

In [190]:
## Jaccard - recipe to recipe similarity without any removal of basis ingredients and duplicate userrecipes

result_array = cdist(user_recipes_fat, recipe_db,'jaccard')
result_wo_filter = pd.DataFrame(result_array, columns=recipe_db.index.values, index=user_recipes_fat.index.values)
# result_2

result_wo_filter_10 = pd.DataFrame(result_wo_filter.sum(), columns=['jaccard_distance_sum'])
result_wo_filter_10 = result_wo_filter_10.sort_values(by='jaccard_distance_sum')
result_wo_filter_10['jaccard_distance_sum'] = result_wo_filter_10['jaccard_distance_sum'].div(20)
result_wo_filter_10[0:10]


Unnamed: 0,jaccard_distance_sum
83302,0.87908
51653,0.87945
158587,0.88098
218344,0.88406
15196,0.88437
25927,0.88587
189590,0.88605
8598,0.88905
20100,0.8892
71291,0.88981


In [191]:
pd.Series(np.intersect1d(subset_fat_id, result_wo_filter_10[0:10].index.to_numpy()))


0     51653
1    158587
dtype: int64

In [192]:
## Jaccard - recipe to recipe similarity without any removal of basis ingredients 

result_array = cdist(new_user_recipes_fat, new_recipe_db,'jaccard')
result_wo_basis_filter = pd.DataFrame(result_array, columns=new_recipe_db.index.values, index=new_user_recipes_fat.index.values)
# result_2

result_wo_basis_filter_10 = pd.DataFrame(result_wo_basis_filter.sum(), columns=['jaccard_distance_sum'])
result_wo_basis_filter_10 = result_wo_basis_filter_10.sort_values(by='jaccard_distance_sum')
result_wo_basis_filter_10['jaccard_distance_sum'] = result_wo_basis_filter_10['jaccard_distance_sum'].div(20)
result_wo_basis_filter_10[0:10]


Unnamed: 0,jaccard_distance_sum
51653,0.90715
15127,0.91631
158587,0.91867
229885,0.92472
22286,0.92722
230283,0.92733
161869,0.92779
218344,0.92846
23157,0.92962
235171,0.93086


In [193]:
result_wo_basis_filter_10[0:10].index.to_numpy()

array([ 51653,  15127, 158587, 229885,  22286, 230283, 161869, 218344,
        23157, 235171])

In [253]:
subset_fat_id

array([  8630,  62706, 229885, 228431,   8639,  42247, 233661, 158587,
        22286, 231233,  76763, 236103,  51653,   8549,  69538,  23157,
        11892, 180150, 235171,  15127])

In [254]:
pd.Series(np.intersect1d(subset_fat_id, result_wo_basis_filter_10[0:10].index.to_numpy()))


0     15127
1     22286
2     23157
3     51653
4    158587
5    229885
6    235171
dtype: int64

In [255]:
## Jaccard - recipe to recipe similarity with removal of basis ingredients and duplicate userrecipes

result_array = cdist(new_user_recipes_fat, new_recipe_db_wo_userrecipes_fat,'jaccard')
result_w_filter = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_fat.index.values, index=new_user_recipes_fat.index.values)
# result_2

result_w_filter_10 = pd.DataFrame(result_w_filter.sum(), columns=['jaccard_distance_sum'])
result_w_filter_10 = result_w_filter_10.sort_values(by='jaccard_distance_sum')
result_w_filter_10['jaccard_distance_sum'] = result_w_filter_10['jaccard_distance_sum'].div(20)
result_w_filter_10[0:10]


Unnamed: 0,jaccard_distance_sum
230283,0.92733
161869,0.92779
218344,0.92846
19422,0.93188
18349,0.93435
83302,0.93692
20100,0.93694
11937,0.93709
46813,0.93737
16998,0.93739


In [256]:
result_w_filter_10

Unnamed: 0,jaccard_distance_sum
230283,0.92733
161869,0.92779
218344,0.92846
19422,0.93188
18349,0.93435
...,...
25860,1.00000
230901,1.00000
87934,1.00000
87053,1.00000


In [257]:

result_w_filter_10 = result_w_filter_10.reset_index()


In [258]:
result_w_filter_10.set_index('index', inplace=True)

In [259]:
result_w_filter_10

Unnamed: 0_level_0,jaccard_distance_sum
index,Unnamed: 1_level_1
230283,0.92733
161869,0.92779
218344,0.92846
19422,0.93188
18349,0.93435
...,...
25860,1.00000
230901,1.00000
87934,1.00000
87053,1.00000


In [260]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(result_w_filter_10.index.get_level_values('index').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_jaccard_top_10 = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_jaccard_top_10.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_jaccard_top_10.columns = nutrition_jaccard_top_10.columns.droplevel(0)

# entferne alle NA
nutrition_jaccard_top_10 = nutrition_jaccard_top_10.dropna()

In [261]:
nutrition_jaccard_top_10.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,3750.0,3750.0,3750.0,3750.0,3750.0,3750.0,3750.0,3750.0,3750.0,3750.0,3750.0,3750.0,3750.0,3750.0,3750.0,3750.0,3750.0,3750.0,3750.0,3750.0
mean,147.13709,436.40804,205.40056,29.1103,96.92813,2.59757,22.82228,60.52398,3.15258,51.38616,12.42764,531.91587,28.00739,8.71657,849.35842,6.71039,0.30402,1170.67426,0.50929,16.32161
std,159.44754,182.77873,125.80797,22.77984,60.41749,2.51191,13.97866,59.89698,2.44753,29.13999,6.88662,267.54772,11.96424,6.79142,641.46015,8.68068,0.30169,2327.45793,0.31167,24.40366
min,2.247,51.34326,1.323,0.0,0.0,0.0,0.147,0.0,0.19784,0.99745,0.03511,21.35641,0.7505,0.01992,5.20206,0.0,0.00055,0.0,0.00497,0.0
25%,41.85553,302.48205,116.14998,10.23955,60.2775,0.74599,12.90555,15.8991,1.81333,33.18473,7.39559,345.42128,19.9466,3.61431,418.04737,1.76712,0.105,236.94223,0.28369,2.14741
50%,83.04989,405.9059,182.437,24.76714,84.87916,1.94505,20.27078,38.71169,2.76524,45.89463,11.14604,480.5968,26.81967,7.14019,732.97435,3.91375,0.19736,581.0319,0.4454,6.68777
75%,193.0611,539.28337,270.51977,43.24507,126.63,3.66432,30.05775,88.07055,3.92674,63.21386,16.52695,654.43387,34.25542,12.10183,1090.45625,7.83992,0.3968,1112.92,0.68266,19.57767
max,1264.326,1828.192,989.103,236.7205,708.85,27.83175,109.9003,519.5359,55.89075,590.3922,47.57578,1931.488,99.72639,59.31002,7322.289,102.0448,2.35457,38664.7,3.83736,270.4131


In [262]:
result_w_filter_10[0:40].index.to_numpy()

array([230283, 161869, 218344,  19422,  18349,  83302,  20100,  11937,
        46813,  16998, 238691,  11735,   8663, 221227,  18805, 214502,
       219166, 180905,  12016, 102393, 196428,  23058,  23105,   9043,
        42919,  15196,  15530,  70012,  18795,  11962, 178809, 263813,
        74142,  86069, 228367, 245764, 236703,  15375,  20456,   9000])

In [312]:
subset_fat_id

array([  8630,  62706, 229885, 228431,   8639,  42247, 233661, 158587,
        22286, 231233,  76763, 236103,  51653,   8549,  69538,  23157,
        11892, 180150, 235171,  15127])

In [361]:
pd.Series(np.intersect1d(subset_fat_id,result_w_filter_10[0:10].index.to_numpy()))


Series([], dtype: int64)

In [569]:
result_w_filter[166992]

8549     1.00000
8630     0.90909
8639     1.00000
11892    1.00000
15127    0.88889
22286    0.92857
23157    0.83333
42247    1.00000
51653    0.85714
62706    1.00000
69538    1.00000
76763    1.00000
158587   0.84615
180150   1.00000
228431   1.00000
229885   0.88889
231233   0.91667
233661   1.00000
235171   1.00000
236103   1.00000
Name: 166992, dtype: float64

In [363]:
test = result_w_filter_10.reset_index()

In [364]:
test.set_index('index', inplace=True)

In [365]:
df['C'] = np.arange(len(df))

In [366]:
test.loc[14127]

jaccard_distance_sum   0.98583
Name: 14127, dtype: float64

In [367]:
#create one dimensional ingredients vector from user recipes profile
user_one_vector = new_user_recipes_fat.copy()
user_one_vector.loc['sum',:]= user_one_vector.sum(axis=0)
user_one_vector = user_one_vector.drop(axis=0, labels=subset_fat_id)
user_one_vector[user_one_vector > 0] = 1

In [368]:
#show number of ingredients
user_one_vector.apply(pd.value_counts).count(axis=1)

0.00000    613
1.00000    107
dtype: int64

In [369]:
## Jaccard - user vector to recipe similarity with removal of basis ingredients and duplicate userrecipes

result_array = cdist(user_one_vector, new_recipe_db_wo_userrecipes_fat,'jaccard')
result_w_filter_vector = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_fat.index.values, index=user_one_vector.index.values)


result_w_filter_vector_10 = pd.DataFrame(result_w_filter_vector.sum(), columns=['jaccard_distance_sum'])
result_w_filter_vector_10 = result_w_filter_vector_10.sort_values(by='jaccard_distance_sum')

result_w_filter_vector_10[0:20]


Unnamed: 0,jaccard_distance_sum
180905,0.9
218344,0.90654
9005,0.90909
111905,0.91071
230283,0.91589
125658,0.91667
23105,0.91667
186691,0.91743
234797,0.91964
112206,0.91964


In [570]:
result_w_filter_vector

Unnamed: 0,7198,8493,8494,8495,8496,8497,8498,8500,8503,8506,...,254874,254940,255038,255263,255545,255936,257312,260193,261124,263813
sum,0.9823,0.97222,0.9646,0.98148,0.99074,0.97222,0.97222,0.9537,0.99107,0.95413,...,0.98165,0.98165,0.93694,0.96296,0.94595,0.9823,0.98182,0.99083,0.9537,0.94495


In [409]:
pd.Series(np.intersect1d(subset_fat_id, result_w_filter_vector_10[0:10].index.to_numpy()))


Series([], dtype: int64)

In [410]:
result_w_filter_vector_10[0:20].index.to_numpy()

array([180905, 218344,   9005, 111905, 230283, 125658,  23105, 186691,
       234797, 112206, 231396, 236805,  76373,  15375,  22991,  68380,
       239230, 137576,  26656,  49374])

In [448]:
subset_fat_id

array([  8630,  62706, 229885, 228431,   8639,  42247, 233661, 158587,
        22286, 231233,  76763, 236103,  51653,   8549,  69538,  23157,
        11892, 180150, 235171,  15127])

In [449]:
recommenden_nut = pd.DataFrame(data=None, columns=nutrition_db2.columns, index=None)

In [450]:
# 19–30 empfohlene nährwerte pro tag #https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t4/?report=objectonly
recommenden_nut = pd.DataFrame(data=None, columns=nutrition_db2.columns, index=None)
recommenden_nut.loc['index'] = [
    # "Calcium":[1],
    333.3,
    # "Calories":[1],
    800,
    # "Calories from Fat":[1],
    0,
    # "Carbohydrates":[1],
    43.3,
    # "Cholesterol":[1],
    0,
    # "Dietary Fiber	":[1],
    8.3,
    # "Fat":[1],
    30,
    # "Folate":[1],
    133.3,
    # "Iron":[1],
    2.6,
    # "Magnesium":[1],
    133.3,
    # "Niacin Equivalents":[1],
    5.3,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    18.6,
    # "Saturated Fat":[1],
    0,
    # "Sodium":[1],
    0.5,
    # "Sugars":[1],
    0,
    # "Thiamin":[1],
    0.4,
    # "Vitamin A - IU":[1],
    300,
    # "Vitamin B6":[1],
    0.43,
    # "Vitamin C":[1]}
    30]
recommenden_nut

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
index,333.3,800.0,0.0,43.3,0.0,8.3,30.0,133.3,2.6,133.3,5.3,1.0,18.6,0.0,0.5,0.0,0.4,300.0,0.43,30.0


In [451]:
result_w_filter_10[0:10].index.to_numpy()

array([230283, 161869, 218344,  19422,  18349,  83302,  20100,  11937,
        46813,  16998])

In [485]:
subset_fat_id

array([  8630,  62706, 229885, 228431,   8639,  42247, 233661, 158587,
        22286, 231233,  76763, 236103,  51653,   8549,  69538,  23157,
        11892, 180150, 235171,  15127])

In [486]:
new_ingredients

Unnamed: 0,id,ingredients_id,ingredients_name,ingredients_grams,ingredients_type
0,59661,16157,10 g butter,11.36000,Normal
1,59661,4405,40 g sliced green onions,41.80000,Normal
2,59661,4342,"1-1/2 cloves garlic, minced",4.80000,Normal
3,59661,4520,3/4 (10 ounce) package frozen chopped spinach ...,227.20000,Normal
4,59661,16243,180 g ricotta cheese,182.40001,Normal
...,...,...,...,...,...
36691,229659,16423,"1-3/4 sprigs fresh thyme, divided",0.66667,Normal
36692,229659,20270,"3/8 white onion, chopped - divided",36.66667,Normal
36693,229659,16157,"40 g butter, sliced into pats",37.83334,Normal
36694,229659,4292,80 g chopped fresh celery leaves,80.00000,Normal


In [518]:
new_ingredients.hist(column='ingredients_id', bins=700)

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f9f0f1f7a50>]],
      dtype=object)

In [519]:
asd = new_ingredients['ingredients_id'].value_counts()

In [520]:
asd

16421    2126
4342     1506
4397     1412
16406    1185
16157    1016
         ... 
23047       1
20486       1
18921       1
21368       1
20792       1
Name: ingredients_id, Length: 730, dtype: int64

In [521]:
result_w_filter_100 = result_w_filter_10[0:100].copy()

In [522]:
result_w_filter_100['pos'] = np.arange(len(result_w_filter_100))

In [523]:
result_w_filter_100 = result_w_filter_100.reset_index()

In [524]:
result_w_filter_100 = result_w_filter_100.set_index('index')

In [525]:
result_w_filter_100

Unnamed: 0_level_0,jaccard_distance_sum,pos
index,Unnamed: 1_level_1,Unnamed: 2_level_1
230283,0.92733,0
161869,0.92779,1
218344,0.92846,2
19422,0.93188,3
18349,0.93435,4
...,...,...
15509,0.95621,95
86047,0.95668,96
16563,0.95673,97
151153,0.95674,98


In [526]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(result_w_filter_100.index.get_level_values('index').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_db3 = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db3.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_db3.columns = nutrition_db3.columns.droplevel(0)

# entferne alle NA
nutrition_db3 = nutrition_db3.dropna()

In [527]:
# 19–30 empfohlene nährwerte pro tag #https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t4/?report=objectonly
recommenden_nut_low_fat = pd.DataFrame(data=None, columns=nutrition_db2.columns, index=None)
recommenden_nut_low_fat.loc['index'] = [
    # "Calcium":[1],
    333.3,
    # "Calories":[1],
    800,
    # "Calories from Fat":[1],
    0,
    # "Carbohydrates":[1],
    43.3,
    # "Cholesterol":[1],
    0,
    # "Dietary Fiber	":[1],
    8.3,
    # "Fat":[1],
    1,
    # "Folate":[1],
    133.3,
    # "Iron":[1],
    2.6,
    # "Magnesium":[1],
    133.3,
    # "Niacin Equivalents":[1],
    5.3,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    18.6,
    # "Saturated Fat":[1],
    0,
    # "Sodium":[1],
    0.5,
    # "Sugars":[1],
    0,
    # "Thiamin":[1],
    0.4,
    # "Vitamin A - IU":[1],
    300,
    # "Vitamin B6":[1],
    0.43,
    # "Vitamin C":[1]}
    30]
recommenden_nut

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
index,333.3,800.0,0.0,43.3,0.0,8.3,30.0,133.3,2.6,133.3,5.3,1.0,18.6,0.0,0.5,0.0,0.4,300.0,0.43,30.0


In [528]:
# 19–30 empfohlene nährwerte pro tag #https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t4/?report=objectonly
recommenden_nut_low_carbs = pd.DataFrame(data=None, columns=nutrition_db2.columns, index=None)
recommenden_nut_low_carbs.loc['index'] = [
    # "Calcium":[1],
    333.3,
    # "Calories":[1],
    800,
    # "Calories from Fat":[1],
    0,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    0,
    # "Dietary Fiber	":[1],
    8.3,
    # "Fat":[1],
    20,
    # "Folate":[1],
    133.3,
    # "Iron":[1],
    2.6,
    # "Magnesium":[1],
    133.3,
    # "Niacin Equivalents":[1],
    5.3,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    18.6,
    # "Saturated Fat":[1],
    0,
    # "Sodium":[1],
    0.5,
    # "Sugars":[1],
    0,
    # "Thiamin":[1],
    0.4,
    # "Vitamin A - IU":[1],
    300,
    # "Vitamin B6":[1],
    0.43,
    # "Vitamin C":[1]}
    30]
recommenden_nut

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
index,333.3,800.0,0.0,43.3,0.0,8.3,30.0,133.3,2.6,133.3,5.3,1.0,18.6,0.0,0.5,0.0,0.4,300.0,0.43,30.0


In [529]:
# euklidische distanz für nährwerte
no_weight = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    1,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]

weighted_fat = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    10000000,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]

weighted_carbs = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1000000,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    1,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]


result_array = cdist(recommenden_nut_low_carbs, nutrition_db3, 'minkowski', p=2, w=weighted_carbs)
euclid_distance = pd.DataFrame(
    result_array, columns=nutrition_db3.index.values, index=recommenden_nut.index.values)

euclid_distance_sum_carbs = pd.DataFrame(
    euclid_distance.sum(), columns=['euclid_distance_sum'])
euclid_distance_sum_carbs = euclid_distance_sum_carbs.sort_values(by='euclid_distance_sum')
euclid_distance_sum_carbs[0:10]

Unnamed: 0,euclid_distance_sum
219911,1864.80522
20100,3593.0898
229277,4526.79447
166992,5692.98398
239230,6924.10905
173906,7316.65939
228367,7980.20119
15196,8996.04468
38276,9336.40822
15530,9848.96357


In [530]:
jaccard_euclid_joined_carbs = pd.merge(euclid_distance_sum_carbs, result_w_filter_100, left_index=True, right_index=True)

In [531]:
jaccard_euclid_joined_carbs[0:20]

Unnamed: 0,euclid_distance_sum,jaccard_distance_sum,pos
219911,1864.80522,0.95598,90
20100,3593.0898,0.93694,6
229277,4526.79447,0.9555,79
166992,5692.98398,0.95344,63
239230,6924.10905,0.95219,56
173906,7316.65939,0.95612,93
228367,7980.20119,0.94768,34
15196,8996.04468,0.94512,25
38276,9336.40822,0.95331,62
15530,9848.96357,0.94587,26


In [532]:
jaccard_euclid_joined_carbs[0:20].index.to_numpy()

array([219911,  20100, 229277, 166992, 239230, 173906, 228367,  15196,
        38276,  15530,  52310, 228134,  18795, 146125,  42919,  19496,
        86168,   8635,  20747,  83083])

In [549]:
subset_fat_id

array([  8630,  62706, 229885, 228431,   8639,  42247, 233661, 158587,
        22286, 231233,  76763, 236103,  51653,   8549,  69538,  23157,
        11892, 180150, 235171,  15127])

In [550]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(jaccard_euclid_joined_carbs[0:20].index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_top20_carbs = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_top20_carbs.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_top20_carbs.columns = nutrition_top20_carbs.columns.droplevel(0)

# entferne alle NA
nutrition_top20_carbs = nutrition_top20_carbs.dropna()

In [551]:
nutrition_top20_carbs.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,204.86523,410.79177,253.77652,10.9787,125.20405,1.19942,28.19739,40.16242,2.07384,42.24196,11.69192,451.45138,27.51125,12.21518,613.00164,2.38731,0.23007,1149.25191,0.48631,12.69634
std,159.99624,115.34002,98.47066,4.43908,61.90203,0.94328,10.94118,20.19376,0.71051,13.88346,6.04099,230.50891,8.90707,6.9173,272.93169,2.26681,0.29293,934.31671,0.37088,17.05545
min,19.11785,92.88802,16.25565,2.26866,23.408,0.13215,1.80618,13.06472,1.19646,17.92577,4.13235,202.7398,10.4232,0.40453,93.74258,0.09556,0.04025,302.2743,0.13703,0.27028
25%,38.17979,336.89098,208.29092,7.75123,87.67284,0.44891,23.14344,27.90341,1.55477,32.80139,8.03626,297.78172,18.66935,8.65249,436.80122,0.71165,0.08627,664.1723,0.26458,2.56954
50%,184.69855,411.35915,246.81225,11.07889,121.18675,0.99085,27.42358,37.40552,1.86312,41.7905,9.93753,418.9382,28.34315,11.97288,607.93035,1.5752,0.17282,859.3389,0.42397,3.8666
75%,329.36895,491.87915,307.14603,15.28558,159.73343,1.65781,34.12734,50.25457,2.46847,49.27809,15.28825,545.27132,32.91793,15.29058,732.62252,2.95702,0.20314,1273.3545,0.49107,16.94934
max,569.2859,650.6866,483.2489,16.40021,258.351,3.17641,53.69432,103.1622,4.15584,68.94528,26.14678,1140.011,45.10956,31.6176,1242.2,6.7299,1.39136,3660.162,1.80021,52.786


In [568]:
nutrition_db2.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0,3770.0
mean,147.62368,437.40977,206.09237,29.13842,97.12707,2.59778,22.89915,60.51086,3.15299,51.41984,12.44257,532.01939,28.05297,8.74708,851.17479,6.72738,0.30421,1168.79318,0.50982,16.28622
std,159.9925,183.07685,125.83504,22.8248,60.47892,2.50979,13.98167,59.87557,2.4433,29.10865,6.89345,267.47451,12.01118,6.79965,643.53732,8.81384,0.30164,2321.84247,0.3121,24.35803
min,2.247,51.34326,1.323,0.0,0.0,0.0,0.147,0.0,0.19784,0.99745,0.03511,21.35641,0.7505,0.01992,5.20206,0.0,0.00055,0.0,0.00497,0.0
25%,41.8956,302.9801,116.53312,10.22884,60.3915,0.74548,12.94812,15.88195,1.81476,33.2288,7.403,345.4553,19.97516,3.62944,418.67972,1.76526,0.1051,236.94223,0.28412,2.15411
50%,83.1664,407.19045,182.7579,24.7857,85.15604,1.94681,20.30644,38.70213,2.76656,45.931,11.15695,480.7348,26.86586,7.16939,734.111,3.90747,0.19759,581.0319,0.44613,6.66003
75%,193.59578,541.44173,271.86345,43.25593,126.76047,3.66756,30.20705,88.07055,3.93232,63.26032,16.5604,654.2057,34.31604,12.13675,1092.85675,7.83992,0.39693,1113.542,0.68317,19.56792
max,1264.326,1828.192,989.103,236.7205,708.85,27.83175,109.9003,519.5359,55.89075,590.3922,47.57578,1931.488,99.72639,59.31002,7322.289,104.8181,2.35457,38664.7,3.83736,270.4131


In [553]:
result_array = cdist(recommenden_nut_low_fat, nutrition_db3, 'minkowski', p=2, w=weighted_fat)
euclid_distance = pd.DataFrame(
    result_array, columns=nutrition_db3.index.values, index=recommenden_nut_low_fat.index.values)

euclid_distance_sum_fat = pd.DataFrame(
    euclid_distance.sum(), columns=['euclid_distance_sum'])
euclid_distance_sum_fat = euclid_distance_sum_fat.sort_values(by='euclid_distance_sum')
euclid_distance_sum_fat[0:10]

Unnamed: 0,euclid_distance_sum
228367,3005.56365
12877,21071.50815
9043,27696.36338
34159,30357.05836
72068,32822.63747
86047,32886.58213
74142,40314.84759
52310,49390.10581
16998,49893.71803
11914,50598.45271


In [554]:
jaccard_euclid_joined_fat = pd.merge(euclid_distance_sum_fat, result_w_filter_100, left_index=True, right_index=True)

In [555]:
jaccard_euclid_joined_fat[0:20]

Unnamed: 0,euclid_distance_sum,jaccard_distance_sum,pos
228367,3005.56365,0.94768,34
12877,21071.50815,0.9562,94
9043,27696.36338,0.94413,23
34159,30357.05836,0.9548,73
72068,32822.63747,0.94961,44
86047,32886.58213,0.95668,96
74142,40314.84759,0.94728,32
52310,49390.10581,0.95231,58
16998,49893.71803,0.93739,9
11914,50598.45271,0.95275,60


In [556]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(jaccard_euclid_joined_fat[0:20].index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_top20_fat = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_top20_fat.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_top20_fat.columns = nutrition_top20_fat.columns.droplevel(0)

# entferne alle NA
nutrition_top20_fat = nutrition_top20_fat.dropna()

In [557]:
nutrition_top20_fat.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,160.67187,350.67709,138.26094,27.31692,84.9243,1.67415,15.36233,60.42228,2.54444,43.39233,12.60868,395.78786,24.83405,6.90554,822.79532,4.07454,0.27336,675.41923,0.44793,6.4262
std,130.78937,87.31382,45.44801,14.12786,41.89009,1.0957,5.04978,36.27908,1.2344,15.52454,7.09588,186.55297,9.77361,4.07377,858.15968,5.01952,0.19422,390.43007,0.26977,8.9038
min,19.87494,92.88802,16.25565,4.44503,23.408,0.10547,1.80618,8.9855,1.19646,17.92577,4.86628,179.0291,10.4232,0.40453,196.5423,0.1068,0.04025,79.045,0.08415,0.0
25%,47.88339,310.77785,102.50305,15.98734,50.03422,0.77748,11.38923,31.77384,1.55554,32.26539,6.01045,257.96525,15.47294,3.09384,383.35455,1.30467,0.09781,472.89077,0.24867,0.86175
50%,127.86225,374.2395,153.4023,27.44924,79.97533,1.91762,17.04469,57.78565,2.19814,40.28243,10.88178,356.8165,26.34727,7.56975,634.15585,3.29568,0.22612,663.03695,0.40181,2.68942
75%,282.11065,391.6118,174.84408,37.44271,111.61198,2.24282,19.42712,85.46954,3.4853,51.28204,19.06126,484.49532,33.27296,10.2282,912.33945,4.71994,0.36632,773.00425,0.62644,6.06492
max,435.6064,514.8928,183.1582,53.01272,172.52,3.75688,20.35091,134.628,4.97842,78.45733,24.7101,818.2569,39.08948,12.52908,4174.661,23.27062,0.70433,1621.32,0.93401,28.865


In [558]:
nutrition_db.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0
mean,146.2483,436.37865,205.34702,29.33343,96.53694,2.63156,22.81634,60.76312,3.1722,51.61716,12.29576,537.34508,27.76217,8.69277,846.59716,6.72692,0.30153,1221.15249,0.50863,16.72403
std,159.43529,208.87063,142.06021,23.54473,64.78237,2.63814,15.78447,60.62101,2.60926,32.32778,7.29221,325.02968,13.36258,7.23449,736.41251,9.17527,0.30493,2557.35848,0.34451,26.92288
min,0.93181,13.63194,0.9275,0.0,0.0,0.0,0.10306,0.0,0.07193,0.85861,0.03511,4.66014,0.44604,0.01992,2.67762,0.0,0.00012,0.0,0.00252,0.0
25%,41.70011,293.54442,110.92578,10.18247,58.52,0.72435,12.32509,15.83209,1.76229,31.96466,7.13552,330.8917,19.1071,3.47082,403.2909,1.70223,0.10243,226.36468,0.2693,2.03935
50%,82.97927,402.20005,179.08945,24.76714,83.21569,1.90935,19.89883,39.22277,2.7301,45.61357,10.96068,474.2425,26.43599,6.94796,719.31585,3.83273,0.19551,569.14505,0.43692,6.48954
75%,191.84663,543.9298,270.0743,43.31172,126.23355,3.67162,30.00825,88.13897,3.9511,63.71924,16.48592,660.26665,34.29652,11.96499,1084.2095,7.81531,0.3949,1118.612,0.68542,19.56792
max,1264.326,4709.199,3455.29,236.7205,979.7761,27.83175,383.9211,519.5359,55.89075,590.3922,109.0173,6063.162,273.2163,103.4358,22099.37,136.5051,3.25879,40386.87,5.98334,507.2771


In [559]:
result_array = cdist(recommenden_nut, nutrition_db3, 'minkowski', p=2, w=no_weight)
euclid_distance = pd.DataFrame(
    result_array, columns=nutrition_db3.index.values, index=recommenden_nut.index.values)

euclid_distance_sum = pd.DataFrame(
    euclid_distance.sum(), columns=['euclid_distance_sum'])
euclid_distance_sum = euclid_distance_sum.sort_values(by='euclid_distance_sum')
euclid_distance_sum[0:10]

Unnamed: 0,euclid_distance_sum
8805,635.52549
8635,636.5652
9000,664.43886
8836,667.51755
16998,669.27187
214502,674.0085
9043,735.22167
196428,735.44302
72068,755.64468
19422,759.75811


In [560]:
jaccard_euclid_joined = pd.merge(euclid_distance_sum, result_w_filter_100, left_index=True, right_index=True)

In [561]:
jaccard_euclid_joined[0:20]

Unnamed: 0,euclid_distance_sum,jaccard_distance_sum,pos
8805,635.52549,0.94905,42
8635,636.5652,0.94891,41
9000,664.43886,0.94852,39
8836,667.51755,0.9505,49
16998,669.27187,0.93739,9
214502,674.0085,0.93953,15
9043,735.22167,0.94413,23
196428,735.44302,0.94338,20
72068,755.64468,0.94961,44
19422,759.75811,0.93188,3


In [562]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(jaccard_euclid_joined[0:20].index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_top20 = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_top20.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_top20.columns = nutrition_top20.columns.droplevel(0)

# entferne alle NA
nutrition_top20 = nutrition_top20.dropna()

In [563]:
nutrition_top20.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,139.67291,432.38593,205.54611,28.70548,87.95749,1.41157,22.83846,62.80706,2.36776,37.32161,12.98041,326.70598,26.66772,8.07104,401.90679,4.97491,0.2845,448.14096,0.38774,4.18025
std,134.72489,86.82371,64.96514,11.38519,34.55245,0.81901,7.21835,30.24442,0.80329,8.6518,6.67966,104.58874,9.84059,3.33231,203.63807,5.95048,0.12315,218.04806,0.22779,7.7407
min,19.11785,296.9438,87.79747,7.85645,49.8075,0.10547,9.75527,8.9855,1.29727,20.7208,5.0023,179.0291,13.13587,2.36103,93.74258,0.06328,0.08877,79.045,0.08415,0.0
25%,26.38823,364.4488,160.05197,22.14934,59.10922,0.78408,17.78355,45.5849,1.6346,32.47948,6.57829,238.49597,17.26798,5.66036,240.86055,1.48685,0.22193,294.68032,0.16809,0.34106
50%,83.84575,440.2627,208.55365,27.91405,79.61916,1.10479,23.17263,61.01816,2.32731,37.0757,11.64028,318.05215,26.16965,7.42372,407.4809,3.65357,0.28384,498.0525,0.38343,1.04343
75%,217.02835,485.96745,242.78433,36.81882,110.5628,1.85071,26.97604,84.91656,2.93891,41.55434,19.05475,386.7299,35.12858,10.2282,583.70127,5.08346,0.34072,592.62822,0.54009,3.15906
max,424.003,623.2719,371.526,53.01272,181.1298,3.22975,41.28066,134.628,4.1271,62.255,24.7101,546.856,40.7336,14.59043,687.007,23.27062,0.63331,785.6996,0.80909,28.865


In [564]:
nutrition_db.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0,4062.0
mean,146.2483,436.37865,205.34702,29.33343,96.53694,2.63156,22.81634,60.76312,3.1722,51.61716,12.29576,537.34508,27.76217,8.69277,846.59716,6.72692,0.30153,1221.15249,0.50863,16.72403
std,159.43529,208.87063,142.06021,23.54473,64.78237,2.63814,15.78447,60.62101,2.60926,32.32778,7.29221,325.02968,13.36258,7.23449,736.41251,9.17527,0.30493,2557.35848,0.34451,26.92288
min,0.93181,13.63194,0.9275,0.0,0.0,0.0,0.10306,0.0,0.07193,0.85861,0.03511,4.66014,0.44604,0.01992,2.67762,0.0,0.00012,0.0,0.00252,0.0
25%,41.70011,293.54442,110.92578,10.18247,58.52,0.72435,12.32509,15.83209,1.76229,31.96466,7.13552,330.8917,19.1071,3.47082,403.2909,1.70223,0.10243,226.36468,0.2693,2.03935
50%,82.97927,402.20005,179.08945,24.76714,83.21569,1.90935,19.89883,39.22277,2.7301,45.61357,10.96068,474.2425,26.43599,6.94796,719.31585,3.83273,0.19551,569.14505,0.43692,6.48954
75%,191.84663,543.9298,270.0743,43.31172,126.23355,3.67162,30.00825,88.13897,3.9511,63.71924,16.48592,660.26665,34.29652,11.96499,1084.2095,7.81531,0.3949,1118.612,0.68542,19.56792
max,1264.326,4709.199,3455.29,236.7205,979.7761,27.83175,383.9211,519.5359,55.89075,590.3922,109.0173,6063.162,273.2163,103.4358,22099.37,136.5051,3.25879,40386.87,5.98334,507.2771
