In [1]:
import json

import numpy as np
import pandas as pd
import pymongo
import scipy
from bson import ObjectId, json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient as Connection
from scipy import sparse, spatial
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# scientific notation disabled form smaller numbers
pd.options.display.float_format = '{:.5f}'.format

# alles resultate anzeigen und nicht nur das letzte
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [2]:
# display multiple outputs in one row
import pandas as pd
import numpy as np
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [3]:
connection = Connection()
db = connection.recipe_db
input_data = db.recipes_test_copy

data = json.loads(json_util.dumps(input_data.find()))

In [4]:
ing = pd.DataFrame(json_normalize(data, record_path='ingredients',
                             meta='id', record_prefix='ingredients_', errors='ignore'))


nutritions = pd.DataFrame(json_normalize(data, record_path='nutritions',
                            meta=['id', 'prep_time', 'rating', 'rating_count', 'ready_in_time', 'review_count']))


In [5]:
#  ------  erstellung und data cleansing - Ingredients

# schmeiss alle zutaten raus, die weniger als 5 mal verwendet werden
# setze multiinde auf 'id' und 'ingredients_id'
ingredients = ing.set_index(['id', 'ingredients_id'])

# filtere alle Zutaten samt ihrer rezepte id, die weniger gleich 5 mal vorkommen
ingredients_eqles_5_ing = ingredients.groupby(
    'ingredients_id').filter(lambda x: len(x) <= 5)

# droppe alle rezepte, die eine Zutate besitzen, die weniger gleich 5 Mal vorkommen
ingredients_filt = ingredients.drop(ingredients_eqles_5_ing.index.get_level_values('id').values, level=0)

# drop alls rows with ingredients_id == 0
ingredients_eqal_zero = ingredients_filt[ingredients_filt.index.get_level_values('ingredients_id') == 0]
ingredients_filt = ingredients_filt.drop(ingredients_eqal_zero.index.get_level_values('id').values, level=0)


In [6]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(ingredients_filt.index.get_level_values('id').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_db = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_db.columns = nutrition_db.columns.droplevel(0)

# entferne alle NA
nutrition_db = nutrition_db.dropna()

# gleiche nochmals die ids der beiden dataframe nutrition und ingredients ab, da der nutrition dataframe noch NA Werte hatt
id_overlap_mask = ingredients_filt.index.get_level_values('id').isin(nutrition_db.index)
ingredients_db = ingredients_filt[id_overlap_mask]

# abgleich ob anzahl der indizes von nutrition und zutaten dataframe gleich sind


In [7]:


ingredients_db.reset_index(inplace=True)

recipe_db = pd.get_dummies(ingredients_db['ingredients_id']).groupby(
    ingredients_db['id']).apply(max)



In [8]:
new_ingredients = ingredients_db.copy()
#new_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index()
gramms_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index().copy()

Q1 = gramms_ingredients.quantile(0.25)
Q3 = gramms_ingredients.quantile(0.75)
IQR = Q3 - Q1

#Filter out all recipes which are outlier by their weight (gramms)
df = gramms_ingredients[~((gramms_ingredients < (Q1 - 1.5 * IQR)) |(gramms_ingredients > (Q3 + 1.5 * IQR))).any(axis=1)].copy()

#filter out recipes by weight which are not in the range 500 - 2373.59 gramms
df_start_at_fivehundret = df[df['ingredients_grams'].between(500, 2373.58225, inclusive=False)].copy()

df_start_at_fivehundret.set_index('id', inplace=True)
id_overlap_mask = nutritions['id'].isin(df_start_at_fivehundret.index.get_level_values('id').values)


# erstelle datenframe auf basis der overlapliste
nutritions_filt_gramm = nutritions.loc[id_overlap_mask]

nutrition_db2 = nutritions_filt_gramm.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

#create new nutrition db based on the above filtering
nutrition_db2.set_index('id', inplace=True)
nutrition_db2.columns = nutrition_db2.columns.droplevel(0)



In [9]:
# random
subset_random_top10 = nutrition_db2.sample(n=10).copy()

In [10]:
#select 20 random recipes which are between the Fat-range of 35-40 gramms
subset_fat = nutrition_db2[(nutrition_db2['Fat'] >= 35) & (nutrition_db2['Fat'] <= 40)].sample(n=40).copy()
subset_fat_id = subset_fat.index.to_numpy()


In [11]:
subset_normal = nutrition_db2.sample(n=40).copy()
subset_normal_id = subset_normal.index.to_numpy()

In [12]:
subset_normal.index.to_numpy()

array([ 14679,  12073,  84886,  24368, 229090,  12012,   9143,  45361,
        16603,  19943,  50579, 171368,  57071,  81222,  86649,  17862,
         8763,  17120,  34384,  70680,  31988, 151379, 240287,  14647,
       163894, 235432,  21003,  72311,  20684,  31065,  15407, 166292,
        60333,   9039,  39456, 109782,  17717,  12823, 216470,  13959])

In [13]:
subset_fat.index.to_numpy()

array([102677,  21352, 222509,   8747,  18093,  52501,  19856, 239896,
        36766,  86813,  11737, 230478,  11892, 111823,  31072,  23881,
       158587, 218085, 231233, 202881,  48921,  20312,   8556, 220515,
        23923,  13420,   8757,  14610,  42247,  18795, 208314,  23434,
        40286, 229277,  25137,  37677, 237320,  23157, 236103, 127500])

In [14]:
# remove recipes which are missing in nutrition df from the ingredients tecipes dataframe
mask = recipe_db.index.isin(nutrition_db2.index.values)
recipe_db = recipe_db.loc[mask]


In [15]:
subset_fat.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,182.19044,608.21842,336.48215,31.9787,138.04925,2.42907,37.3869,76.43211,3.86567,61.73156,14.74757,592.22181,34.18899,15.31001,930.25197,4.61853,0.45389,1868.22768,0.59322,11.28142
std,189.34252,116.42767,12.96024,25.55162,57.93036,2.86041,1.44003,96.99082,2.41814,49.35353,7.20601,310.61398,13.87148,4.53214,550.2225,6.07728,0.37107,5808.43244,0.33829,16.3392
min,19.121,402.8433,316.359,1.8722,36.552,0.0,35.151,6.7308,0.52164,16.00843,5.34861,255.3414,15.71987,5.7292,263.7425,0.0793,0.08727,0.5625,0.17025,0.0225
25%,42.61495,522.00257,326.16895,9.16919,100.84455,0.53693,36.24099,13.73089,2.69275,39.09463,9.89062,391.04413,24.77848,12.23351,500.27255,0.98664,0.17687,183.83725,0.35192,1.57534
50%,112.14975,587.5428,337.36705,28.14095,131.46735,1.36922,37.48523,51.79167,3.38713,48.1734,13.3391,521.0986,31.80968,13.942,768.10395,3.07298,0.36719,762.03385,0.52697,3.83369
75%,294.92285,694.6829,346.95983,46.92905,156.34817,3.00479,38.55109,94.94129,4.44902,69.32928,18.30405,733.45978,40.36773,19.10003,1172.9605,5.90758,0.49669,1143.9995,0.77407,11.03837
max,701.8085,881.3031,359.8603,87.90229,308.72,12.12185,39.98447,519.5359,14.15739,326.4414,35.70167,1931.488,93.23131,23.07542,2757.834,35.81975,1.35536,36741.7,1.90431,74.77966


In [16]:
subset_normal.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,196.0991,485.47077,246.31665,30.89658,95.64285,2.25384,27.36852,50.3806,3.00218,48.78765,11.54192,498.52503,28.39946,10.24769,934.36089,7.41708,0.25297,895.06425,0.46755,8.72278
std,205.62021,227.27974,179.85616,25.84626,63.89711,2.69225,19.98401,57.44862,2.04032,26.13401,7.14851,277.5266,13.57031,8.46734,496.98896,9.13422,0.25525,1234.54219,0.30913,12.34968
min,6.12773,185.7788,47.26966,1.95229,0.0,0.0,5.25218,3.34426,0.72508,5.61653,0.21848,69.78,5.40403,1.97351,93.92116,0.11998,0.02235,2.05748,0.02044,0.29898
25%,40.5473,327.72412,123.55208,11.09381,54.8812,0.58224,13.728,17.04097,1.67852,29.51014,6.41973,349.03415,19.11413,4.06079,561.53305,2.47309,0.10145,106.41814,0.27285,1.89484
50%,86.61471,436.2152,188.9238,22.7919,88.07059,1.19657,20.99153,27.311,2.66452,44.32084,9.53437,443.116,24.55208,8.23403,891.321,4.21077,0.16699,409.3665,0.39399,3.53158
75%,278.5007,623.59995,301.85032,42.86154,126.52425,3.48576,33.53893,46.43787,3.41637,60.53861,16.87748,622.09455,38.36617,13.69192,1159.95925,9.47168,0.28311,1046.75475,0.69683,9.97948
max,790.4046,1285.399,989.103,93.62617,275.94,12.79123,109.9003,267.8251,11.15067,125.0226,25.03877,1584.164,64.74465,41.72193,2308.549,41.97056,1.22795,5345.74,1.52668,64.99386


In [17]:
user_recipes_fat = recipe_db[recipe_db.index.isin(subset_fat_id)]
user_recipes_normal = recipe_db[recipe_db.index.isin(subset_normal_id)]

In [18]:
####temp#######
temp_fat_20 = [16966,  14753,  62459,  18397,   8757,  52501,   8556, 240522,
        86628, 221304, 223596,  51653,  48921, 142220,  76763, 222509,
        17496,  14710,  86860,  86813]
####temp#######
user_recipes_fat = recipe_db[recipe_db.index.isin(temp_fat_20)]


In [19]:
user_recipes_fat

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8757,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14710,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14753,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16966,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48921,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51653,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
#top10 most common ingredients
#16421    2125 salt
#4342     1506 garlic
#4397     1412 onion
#16406    1185 ground black pepper
#16157    1016 butter
#6307      944 olive oil
#6494      786 skinless bone less chicken breast
#2496      749 water
#16238     574 grated Parmesan cheese
#16317     538 eggs


drop_id_list = [16421, 4342, 4397, 16406, 16157, 6307, 6494, 2496, 16238, 16317]





In [21]:
subset_fat_id

array([102677,  21352, 222509,   8747,  18093,  52501,  19856, 239896,
        36766,  86813,  11737, 230478,  11892, 111823,  31072,  23881,
       158587, 218085, 231233, 202881,  48921,  20312,   8556, 220515,
        23923,  13420,   8757,  14610,  42247,  18795, 208314,  23434,
        40286, 229277,  25137,  37677, 237320,  23157, 236103, 127500])

In [22]:
subset_normal_id

array([ 14679,  12073,  84886,  24368, 229090,  12012,   9143,  45361,
        16603,  19943,  50579, 171368,  57071,  81222,  86649,  17862,
         8763,  17120,  34384,  70680,  31988, 151379, 240287,  14647,
       163894, 235432,  21003,  72311,  20684,  31065,  15407, 166292,
        60333,   9039,  39456, 109782,  17717,  12823, 216470,  13959])

In [23]:
#filter out ingredients which are too common and the recipes in the user profile from the recipes database
new_recipe_db = recipe_db.drop(axis=1, labels=drop_id_list)

#original
#new_recipe_db_wo_userrecipes_fat = new_recipe_db.drop(axis=0, labels=subset_fat_id)
#original


####temp#######
new_recipe_db_wo_userrecipes_fat = new_recipe_db.drop(axis=0, labels=temp_fat_20)
####temp#######
new_recipe_db_wo_userrecipes_normal = new_recipe_db.drop(axis=0, labels=subset_normal_id)

In [24]:
#remove basic ingredients like garlic, olive oil, skinless bone less chicken breast, butter, milk, salt and pepper, onion
new_user_recipes_fat = user_recipes_fat.drop(axis=1, labels=drop_id_list)
new_user_recipes_normal = user_recipes_normal.drop(axis=1, labels=drop_id_list)

In [25]:
sample_fat = new_user_recipes_fat.sample(n=20).copy()


In [26]:
sample_normal = new_user_recipes_normal.sample(n=40).copy()

In [27]:
sample_normal
sample_fat

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14679,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50579,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17717,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81222,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34384,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57071,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60333,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45361,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
222509,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76763,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48921,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
62459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
221304,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
240522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51653,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14753,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
## Jaccard - fat

result_array = cdist(sample_fat, new_recipe_db_wo_userrecipes_fat,'jaccard')
result_w_filter_fat = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_fat.index.values, index=sample_fat.index.values)
# result_2

result_w_filter_10_fat = pd.DataFrame(result_w_filter_fat.sum(), columns=['jaccard_distance_sum'])
result_w_filter_10_fat = result_w_filter_10_fat.sort_values(by='jaccard_distance_sum')
result_w_filter_10_fat['jaccard_distance_sum'] = result_w_filter_10_fat['jaccard_distance_sum'].div(20)
result_w_filter_10_fat[0:10]


Unnamed: 0,jaccard_distance_sum
8547,0.92198
188706,0.93331
70513,0.94038
141125,0.94441
62256,0.94456
231808,0.94471
90160,0.94768
240773,0.94804
18442,0.94897
14614,0.94936


In [29]:
## Jaccard - normal

result_array = cdist(sample_normal, new_recipe_db_wo_userrecipes_normal,'jaccard')
result_w_filter_normal = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_normal.index.values, index=sample_normal.index.values)
# result_2

result_w_filter_10_normal = pd.DataFrame(result_w_filter_normal.sum(), columns=['jaccard_distance_sum'])
result_w_filter_10_normal = result_w_filter_10_normal.sort_values(by='jaccard_distance_sum')
result_w_filter_10_normal['jaccard_distance_sum'] = result_w_filter_10_normal['jaccard_distance_sum'].div(20)
result_w_filter_10_normal[0:10]


Unnamed: 0,jaccard_distance_sum
72007,1.88822
44742,1.89108
24682,1.89121
16416,1.89288
20456,1.89362
13941,1.89672
24202,1.89793
78370,1.89885
14604,1.89902
18044,1.89992


In [30]:
# userrecipes fat
sample_fat.index.to_numpy()

array([222509,  76763,  48921,  62459,  17496,  18397, 221304, 240522,
        51653,  14753,  86628,  86813,   8757, 142220, 223596,   8556,
        14710,  16966,  52501,  86860])

In [31]:
# userrecipes normal
sample_normal.index.to_numpy()


array([ 14679,  50579,  17717,  31988,  81222,  34384,  57071,  60333,
         9039,  45361,  17862, 166292,  39456,  17120,  31065,  70680,
        12823,   9143, 240287, 235432,  72311, 109782,  21003,  24368,
        16603,  86649, 163894, 216470,  15407,  19943, 151379, 171368,
        12073,  20684,  84886, 229090,  12012,  13959,  14647,   8763])

In [32]:
result_w_filter_10_fat[0:10].index.to_numpy()

array([  8547, 188706,  70513, 141125,  62256, 231808,  90160, 240773,
        18442,  14614])

In [33]:
result_w_filter_10_normal[0:10].index.to_numpy()

array([72007, 44742, 24682, 16416, 20456, 13941, 24202, 78370, 14604,
       18044])

In [34]:
result_w_filter_fat_short = result_w_filter_fat.copy()
result_w_filter_normal_short = result_w_filter_normal.copy()

In [35]:
# top 10 fat short
fat_short = pd.DataFrame(np.sort(result_w_filter_fat_short.values, axis=0), index=result_w_filter_fat_short.index, columns=result_w_filter_fat_short.columns)

new_columns = fat_short.columns[fat_short.loc[fat_short.first_valid_index()].argsort()]
result_fat_short = fat_short[new_columns]
result_fat_short = result_fat_short.reset_index()
result_fat_short = result_fat_short.loc[1].to_frame()

result_fat_short[0:11].index.to_numpy()


array(['index', 8547, 188706, 34361, 206120, 220716, 16372, 14525, 235948,
       11678, 25884], dtype=object)

In [36]:
fat_short[new_columns]

Unnamed: 0,8547,188706,34361,206120,220716,16372,14525,235948,11678,25884,...,24038,8933,17815,8938,13952,223218,232907,24160,87053,193219
222509,0.0,0.25,0.33333,0.33333,0.33333,0.46154,0.5,0.5,0.5,0.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
76763,0.9,0.81818,0.875,0.875,0.875,0.8,0.91667,0.90909,0.88889,0.9,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
48921,0.90909,0.875,1.0,1.0,1.0,0.88235,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
62459,0.90909,0.9,1.0,1.0,1.0,0.95238,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
17496,0.92308,0.9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
18397,0.92857,0.92308,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
221304,0.92857,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
240522,0.94118,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
51653,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
14753,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [37]:
sample_fat

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
222509,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76763,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48921,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
62459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
221304,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
240522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51653,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14753,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# top 10 normal short
normal_short = pd.DataFrame(np.sort(result_w_filter_normal_short.values, axis=0), index=result_w_filter_normal_short.index, columns=result_w_filter_normal_short.columns)

new_columns = normal_short.columns[normal_short.loc[normal_short.first_valid_index()].argsort()]
result_normal_short = normal_short[new_columns]
result_normal_short = result_normal_short.reset_index()
result_normal_short = result_normal_short.loc[0].to_frame()
result_normal_short[0:11].index.to_numpy()


array(['index', 228546, 8896, 36994, 143064, 216902, 220751, 81959,
       132703, 8732, 222182], dtype=object)

In [39]:
normal_short[new_columns]

Unnamed: 0,228546,8896,36994,143064,216902,220751,81959,132703,8732,222182,...,23892,11903,20582,239137,21333,11879,239096,40061,150353,73110
14679,0.16667,0.4,0.4,0.4,0.42857,0.42857,0.42857,0.42857,0.42857,0.44444,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50579,0.85714,0.8,0.83333,0.83333,0.77778,0.85714,0.81818,0.85714,0.81818,0.8125,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
17717,0.88889,0.84615,0.85714,0.85714,0.8,0.875,0.88889,0.88889,0.88889,0.89474,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
31988,0.9,0.875,0.88889,0.875,0.81818,0.88889,0.9,0.9,0.9,0.91667,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
81222,0.9,0.90909,1.0,0.9,0.88889,0.9,0.9,0.90909,0.9,0.92308,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
34384,0.90909,0.93333,1.0,0.90909,0.88889,0.90909,0.90909,0.90909,0.90909,0.92308,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
57071,0.90909,1.0,1.0,0.91667,0.9,0.90909,0.92308,0.90909,0.92308,0.92308,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
60333,0.90909,1.0,1.0,0.92308,0.90909,0.90909,0.92308,0.90909,1.0,0.92857,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9039,0.90909,1.0,1.0,1.0,0.92308,0.90909,1.0,0.91667,1.0,0.92857,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
45361,0.91667,1.0,1.0,1.0,1.0,0.91667,1.0,0.92308,1.0,0.92857,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [40]:
sample_fat.index.to_numpy()

array([222509,  76763,  48921,  62459,  17496,  18397, 221304, 240522,
        51653,  14753,  86628,  86813,   8757, 142220, 223596,   8556,
        14710,  16966,  52501,  86860])

In [41]:
sample_normal.index.to_numpy()

array([ 14679,  50579,  17717,  31988,  81222,  34384,  57071,  60333,
         9039,  45361,  17862, 166292,  39456,  17120,  31065,  70680,
        12823,   9143, 240287, 235432,  72311, 109782,  21003,  24368,
        16603,  86649, 163894, 216470,  15407,  19943, 151379, 171368,
        12073,  20684,  84886, 229090,  12012,  13959,  14647,   8763])

In [42]:
#create one dimensional ingredients vector from user recipes profile
user_one_vector_fat = sample_fat.copy()
user_one_vector_fat.loc['sum',:]= user_one_vector_fat.sum(axis=0)
user_one_vector_fat = user_one_vector_fat.drop(axis=0, labels=sample_fat.index)
user_one_vector_fat[user_one_vector_fat > 0] = 1

In [43]:
#create one dimensional ingredients vector from user recipes profile
user_one_vector_normal = sample_normal.copy()
user_one_vector_normal.loc['sum',:]= user_one_vector_normal.sum(axis=0)
user_one_vector_normal = user_one_vector_normal.drop(axis=0, labels=sample_normal.index)
user_one_vector_normal[user_one_vector_normal > 0] = 1

In [44]:
user_one_vector_normal
user_one_vector_normal.apply(pd.value_counts).count(axis=1)

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sum,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0.00000    563
1.00000    157
dtype: int64

In [45]:
user_one_vector_fat
user_one_vector_fat.apply(pd.value_counts).count(axis=1)

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sum,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0.00000    626
1.00000     94
dtype: int64

In [46]:
## Jaccard - Vector fat top 10

result_array = cdist(user_one_vector_fat, new_recipe_db_wo_userrecipes_fat,'jaccard')
result_w_filter_vector_fat = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_fat.index.values, index=user_one_vector_fat.index.values)


result_w_filter_vector_10_fat = pd.DataFrame(result_w_filter_vector_fat.sum(), columns=['jaccard_distance_sum'])
result_w_filter_vector_10_fat = result_w_filter_vector_10_fat.sort_values(by='jaccard_distance_sum')

result_w_filter_vector_10_fat[0:10]


Unnamed: 0,jaccard_distance_sum
51850,0.88889
231396,0.89796
135885,0.9
52299,0.9
145843,0.90426
16372,0.90526
83083,0.90909
180905,0.90909
24682,0.91489
70012,0.91579


In [47]:
## Jaccard - Vector normal top 10

result_array = cdist(user_one_vector_normal, new_recipe_db_wo_userrecipes_normal,'jaccard')
result_w_filter_vector_normal = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_normal.index.values, index=user_one_vector_normal.index.values)


result_w_filter_vector_10_normal = pd.DataFrame(result_w_filter_vector_normal.sum(), columns=['jaccard_distance_sum'])
result_w_filter_vector_10_normal = result_w_filter_vector_10_normal.sort_values(by='jaccard_distance_sum')

result_w_filter_vector_10_normal[0:10]


Unnamed: 0,jaccard_distance_sum
237807,0.92405
111905,0.925
9005,0.93082
231396,0.93125
190490,0.93168
13988,0.9321
236805,0.9321
51850,0.9321
218075,0.93631
69754,0.93671


In [48]:
result_w_filter_vector_10_normal[0:10].index.to_numpy()

array([237807, 111905,   9005, 231396, 190490,  13988, 236805,  51850,
       218075,  69754])

In [49]:
####### results

In [50]:
# userrecipes fat
sample_fat.index.to_numpy()

array([222509,  76763,  48921,  62459,  17496,  18397, 221304, 240522,
        51653,  14753,  86628,  86813,   8757, 142220, 223596,   8556,
        14710,  16966,  52501,  86860])

In [51]:
###### jaccard fat top 10 naiv
result_w_filter_10_fat[0:10].index.to_numpy()

array([  8547, 188706,  70513, 141125,  62256, 231808,  90160, 240773,
        18442,  14614])

In [52]:
###### jaccard kurz fat top 10
result_fat_short[1:11].index.to_numpy()


array([8547, 188706, 34361, 206120, 220716, 16372, 14525, 235948, 11678,
       25884], dtype=object)

In [53]:
###### jaccard vector fat top 10
result_w_filter_vector_10_fat[0:10].index.to_numpy()

array([ 51850, 231396, 135885,  52299, 145843,  16372,  83083, 180905,
        24682,  70012])

In [54]:
# userrecipes normal
sample_normal.index.to_numpy()

array([ 14679,  50579,  17717,  31988,  81222,  34384,  57071,  60333,
         9039,  45361,  17862, 166292,  39456,  17120,  31065,  70680,
        12823,   9143, 240287, 235432,  72311, 109782,  21003,  24368,
        16603,  86649, 163894, 216470,  15407,  19943, 151379, 171368,
        12073,  20684,  84886, 229090,  12012,  13959,  14647,   8763])

In [55]:
###### jaccard normal top 10 naiv
result_w_filter_10_normal[0:10].index.to_numpy()

array([72007, 44742, 24682, 16416, 20456, 13941, 24202, 78370, 14604,
       18044])

In [56]:
###### jaccard kurz normal top 10
result_normal_short[1:11].index.to_numpy()


array([228546, 8896, 36994, 143064, 216902, 220751, 81959, 132703, 8732,
       222182], dtype=object)

In [57]:
###### jaccard vector normal top 10
result_w_filter_vector_10_normal[0:10].index.to_numpy()

array([237807, 111905,   9005, 231396, 190490,  13988, 236805,  51850,
       218075,  69754])

In [58]:
##### random top 10
subset_random_top10.index.to_numpy()

array([ 81124, 234462,  86813,  40549,   9009, 242134,  81140, 228109,
        13896,  13890])

In [119]:
nutrition_db2.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0
mean,147.65186,437.44356,206.11989,29.13837,97.10705,2.59745,22.90221,60.52532,3.15292,51.41953,12.44388,532.06962,28.05446,8.74674,851.15449,6.7255,0.30428,1169.01319,0.50992,16.28996
std,160.00437,183.08939,125.84039,22.82783,60.47445,2.51004,13.98227,59.87693,2.44362,29.11251,6.8939,267.49221,12.01243,6.80052,643.6215,8.81425,0.30165,2322.11124,0.31208,24.36018
min,2.247,51.34326,1.323,0.0,0.0,0.0,0.147,0.0,0.19784,0.99745,0.03511,21.35641,0.7505,0.01992,5.20206,0.0,0.00055,0.0,0.00497,0.0
25%,41.91497,302.9718,116.5374,10.22622,60.372,0.74534,12.9486,15.88967,1.81434,33.22178,7.40249,345.4748,19.97439,3.62848,418.6246,1.76465,0.10514,236.8913,0.2845,2.15387
50%,83.1685,407.2476,182.7628,24.77198,85.12875,1.94552,20.30698,38.70226,2.76635,45.93,11.16012,480.7832,26.86635,7.16552,734.007,3.90585,0.19772,581.4108,0.4463,6.66049
75%,193.6033,541.5573,271.8885,43.2574,126.737,3.66472,30.20983,88.10015,3.93263,63.26534,16.56103,654.2805,34.32086,12.13782,1093.027,7.83809,0.39695,1113.579,0.68318,19.57264
max,1264.326,1828.192,989.103,236.7205,708.85,27.83175,109.9003,519.5359,55.89075,590.3922,47.57578,1931.488,99.72639,59.31002,7322.289,104.8181,2.35457,38664.7,3.83736,270.4131


In [120]:
new_df = nutrition_db2[['Calories', 'Calories from Fat', 'Carbohydrates', 'Fat', 'Saturated Fat', 'Protein', 'Calcium', 'Magnesium', 'Iron' ]]

In [121]:
new_df.describe()

name,Calories,Calories from Fat,Carbohydrates,Fat,Saturated Fat,Protein,Calcium,Magnesium,Iron
count,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0
mean,437.44356,206.11989,29.13837,22.90221,8.74674,28.05446,147.65186,51.41953,3.15292
std,183.08939,125.84039,22.82783,13.98227,6.80052,12.01243,160.00437,29.11251,2.44362
min,51.34326,1.323,0.0,0.147,0.01992,0.7505,2.247,0.99745,0.19784
25%,302.9718,116.5374,10.22622,12.9486,3.62848,19.97439,41.91497,33.22178,1.81434
50%,407.2476,182.7628,24.77198,20.30698,7.16552,26.86635,83.1685,45.93,2.76635
75%,541.5573,271.8885,43.2574,30.20983,12.13782,34.32086,193.6033,63.26534,3.93263
max,1828.192,989.103,236.7205,109.9003,59.31002,99.72639,1264.326,590.3922,55.89075


In [59]:
#https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t4/?report=objectonly
#https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t1/?report=objectonly
#https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t5/?report=objectonly
#männer alter 31 - 50 pro tag geteil durch 3 mahlzeiten

recommenden_nut_low_fat = pd.DataFrame(data=None, columns=nutrition_db2.columns, index=None)


recommenden_nut_low_fat.loc['index'] = [
    # "Calcium":[1],
    266.67,
    # "Calories":[1],
    500,
    # "Calories from Fat":[1],
    0,
    # "Carbohydrates":[1],
    43.33,
    # "Cholesterol":[1],
    0,
    # "Dietary Fiber	":[1],
    12.67,
    # "Fat":[1],16.67,
    10,
    # "Folate":[1],
    106.67, 
    # "Iron":[1],
    2,
    # "Magnesium":[1],
    116.67, 
    # "Niacin Equivalents":[1],
    4,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    18.6,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    0.5,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    0.3, 
    # "Vitamin A - IU":[1],
    208, 
    # "Vitamin B6":[1],
    0.36, 
    # "Vitamin C":[1]}
    25] 

In [60]:
no_weight = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    1,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]

weighted_fat = [
    # "Calcium":[1],
    1000,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    100000,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    100,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]

weighted_carbs = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1000000,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    1,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]

In [61]:
result_w_filter_fat_100_vector = result_w_filter_vector_10_fat[0:100].copy()
result_w_filter_fat_100_vector['pos'] = np.arange(len(result_w_filter_fat_100_vector))
result_w_filter_fat_100_vector = result_w_filter_fat_100_vector.reset_index()
result_w_filter_fat_100_vector = result_w_filter_fat_100_vector.set_index('index')

In [62]:
result_w_filter_fat_100 = result_w_filter_10_fat[0:100].copy()
result_w_filter_fat_100['pos'] = np.arange(len(result_w_filter_fat_100))
result_w_filter_fat_100 = result_w_filter_fat_100.reset_index()
result_w_filter_fat_100 = result_w_filter_fat_100.set_index('index')

In [63]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(result_w_filter_fat_100.index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_fat_naiv = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_fat_naiv.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_fat_naiv.columns = nutrition_fat_naiv.columns.droplevel(0)

# entferne alle NA
nutrition_fat_naiv = nutrition_fat_naiv.dropna()

In [64]:
# euklidische distanz für nährwerte

result_array = cdist(recommenden_nut_low_fat, nutrition_fat_naiv, 'minkowski', p=2, w=weighted_fat)
euclid_distance = pd.DataFrame(
    result_array, columns=nutrition_fat_naiv.index.values, index=recommenden_nut_low_fat.index.values)

euclid_distance_sum_fat_naiv = pd.DataFrame(euclid_distance.sum(), columns=['euclid_distance_sum'])
euclid_distance_sum_fat_naiv = euclid_distance_sum_fat_naiv.sort_values(by='euclid_distance_sum')
euclid_distance_sum_fat_naiv[0:10]

Unnamed: 0,euclid_distance_sum
222635,2656.70711
235323,2963.70113
18442,3126.81472
218720,3205.85598
73964,3654.74357
84745,3763.03571
77758,4299.11754
64893,4542.86674
34361,4911.82372
14687,4928.41506


In [65]:
jaccard_euclid_joined_fat_naiv = pd.merge(euclid_distance_sum_fat_naiv, result_w_filter_fat_100, left_index=True, right_index=True)

In [66]:
jaccard_euclid_joined_fat_naiv[0:10]

Unnamed: 0,euclid_distance_sum,jaccard_distance_sum,pos
222635,2656.70711,0.95611,32
235323,2963.70113,0.95855,56
18442,3126.81472,0.94897,8
218720,3205.85598,0.95931,61
73964,3654.74357,0.95425,21
84745,3763.03571,0.96102,87
77758,4299.11754,0.95698,43
64893,4542.86674,0.9539,19
34361,4911.82372,0.96042,74
14687,4928.41506,0.96019,68


In [67]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(jaccard_euclid_joined_fat_naiv[0:10].index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_top10_fat_naiv = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_top10_fat_naiv.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_top10_fat_naiv.columns = nutrition_top10_fat_naiv.columns.droplevel(0)

# entferne alle NA
nutrition_top10_fat_naiv = nutrition_top10_fat_naiv.dropna()

In [68]:
nutrition_top10_fat_naiv.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,212.15929,347.26522,140.72331,29.57564,75.88731,3.13171,15.63592,70.16376,3.28521,43.60824,9.33045,495.32276,21.98825,6.81862,726.05904,5.48924,0.26014,765.58346,0.36334,26.51701
std,82.36201,102.79235,25.5505,20.57749,22.26041,2.20438,2.83894,58.62786,1.81904,16.10101,3.46433,165.79725,5.96208,1.84304,269.68843,2.73489,0.18882,597.76996,0.15497,38.15255
min,115.4559,200.5096,96.83125,8.50724,53.31907,0.18479,10.75903,19.02514,1.61694,22.19329,6.48622,233.5959,11.25829,3.16377,135.44,1.53699,0.06435,173.6259,0.16549,0.24869
25%,169.32903,321.24892,128.54217,20.18204,56.19729,2.15752,14.28247,51.08488,2.39708,31.66494,7.25196,362.11325,19.86547,6.17503,583.50835,3.21599,0.12469,519.63588,0.2325,2.78399
50%,189.0762,343.44495,134.31735,24.76168,71.35308,2.62511,14.92415,56.93271,3.00156,43.27683,8.76698,520.37845,21.88125,6.95089,814.45765,5.27446,0.24354,603.88525,0.36174,7.66879
75%,250.67625,370.21308,154.89895,28.30338,94.22309,3.5301,17.211,64.72732,3.33173,47.02484,9.56176,624.49435,24.10793,8.07506,889.37787,7.74314,0.30506,745.44445,0.48236,28.78732
max,348.3654,587.8757,179.0071,75.33678,109.2033,7.9905,19.88968,232.0606,8.11861,74.57674,18.56014,707.6725,33.57063,9.21249,1034.152,9.49131,0.71992,2207.053,0.57304,106.7028


In [69]:
nutrition_top10_fat_naiv.index.to_numpy()

array([ 14687,  18442,  34361,  64893,  73964,  77758,  84745, 218720,
       222635, 235323])

In [70]:
new_df = nutrition_top10_fat_naiv[['Calories', 'Calories from Fat', 'Carbohydrates', 'Fat', 'Saturated Fat', 'Protein', 'Calcium', 'Magnesium', 'Iron' ]]

In [71]:
new_df.describe()

name,Calories,Calories from Fat,Carbohydrates,Fat,Saturated Fat,Protein,Calcium,Magnesium,Iron
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,347.26522,140.72331,29.57564,15.63592,6.81862,21.98825,212.15929,43.60824,3.28521
std,102.79235,25.5505,20.57749,2.83894,1.84304,5.96208,82.36201,16.10101,1.81904
min,200.5096,96.83125,8.50724,10.75903,3.16377,11.25829,115.4559,22.19329,1.61694
25%,321.24892,128.54217,20.18204,14.28247,6.17503,19.86547,169.32903,31.66494,2.39708
50%,343.44495,134.31735,24.76168,14.92415,6.95089,21.88125,189.0762,43.27683,3.00156
75%,370.21308,154.89895,28.30338,17.211,8.07506,24.10793,250.67625,47.02484,3.33173
max,587.8757,179.0071,75.33678,19.88968,9.21249,33.57063,348.3654,74.57674,8.11861


In [72]:
result_w_filter_fat_100_short = result_fat_short[1:101].index.to_numpy().copy()


In [73]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(result_fat_short[1:101].index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_fat_short = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_fat_short.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_fat_short.columns = nutrition_fat_short.columns.droplevel(0)

# entferne alle NA
nutrition_fat_short = nutrition_fat_short.dropna()

In [74]:
# euklidische distanz für nährwerte

result_array = cdist(recommenden_nut_low_fat, nutrition_fat_short, 'minkowski', p=2, w=weighted_fat)
euclid_distance = pd.DataFrame(
    result_array, columns=nutrition_fat_short.index.values, index=recommenden_nut_low_fat.index.values)

euclid_distance_sum_fat_short = pd.DataFrame(euclid_distance.sum(), columns=['euclid_distance_sum'])
euclid_distance_sum_fat_short = euclid_distance_sum_fat_short.sort_values(by='euclid_distance_sum')
euclid_distance_sum_fat_short[0:10]

Unnamed: 0,euclid_distance_sum
235323,2963.70113
18442,3126.81472
8965,3288.17839
17869,3301.38331
52148,3392.22542
73964,3654.74357
21297,4087.73266
11916,4453.50712
34361,4911.82372
14724,4969.35463


In [75]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(euclid_distance_sum_fat_short[0:10].index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_top10_fat_short = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_top10_fat_short.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_top10_fat_short.columns = nutrition_top10_fat_short.columns.droplevel(0)

# entferne alle NA
nutrition_top10_fat_short = nutrition_top10_fat_short.dropna()

In [76]:
nutrition_top10_fat_short.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,246.13638,376.5342,135.33025,34.87812,69.31935,4.13922,15.03669,99.12528,3.21327,56.35856,11.34105,617.69192,24.47273,6.29289,797.67408,6.75959,0.39576,1104.28851,0.47839,41.1223
std,93.92264,117.27508,27.33583,19.38038,37.53288,2.64418,3.03731,64.34844,1.02763,12.21409,5.31644,196.4877,9.16575,1.35276,273.34081,3.73947,0.35304,1035.22989,0.22483,42.60456
min,120.13,236.8492,75.64122,10.2685,22.48,0.6075,8.40458,17.745,1.85349,39.66,6.53378,417.7853,11.25829,4.18225,135.44,3.40015,0.06435,90.8075,0.23307,0.061
25%,176.58055,316.09415,127.64125,23.62964,55.34107,2.66736,14.18237,59.8402,2.44834,45.85537,7.18329,487.32785,19.06985,5.22905,740.28135,4.23192,0.1666,646.7083,0.31175,7.41081
50%,241.66255,326.81905,133.37395,31.10388,65.785,3.5866,14.81932,79.54997,3.17201,60.56481,9.33337,563.69635,22.24039,6.67609,864.6123,5.647,0.2284,835.2098,0.42745,24.16721
75%,337.26127,416.24435,152.42108,40.58051,75.97485,4.21726,16.93567,131.91579,3.56851,64.26294,13.28304,704.4456,31.09789,7.12775,981.51135,7.74314,0.60273,1068.84575,0.55425,76.6328
max,352.9491,649.576,170.7775,75.25994,153.695,9.46078,18.97528,232.0606,5.33774,74.38944,21.90084,1074.648,40.23982,8.03736,1030.996,15.84195,1.16597,3895.998,0.88844,106.7028


In [77]:
nutrition_top10_fat_short.index.to_numpy()

array([  8965,  11916,  14724,  17869,  18442,  21297,  34361,  52148,
        73964, 235323])

In [106]:
new_df = nutrition_top10_fat_short[['Calories', 'Calories from Fat', 'Carbohydrates', 'Fat', 'Saturated Fat', 'Protein', 'Calcium', 'Magnesium', 'Iron' ]]

In [107]:
new_df.describe()

name,Calories,Calories from Fat,Carbohydrates,Fat,Saturated Fat,Protein,Calcium,Magnesium,Iron
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,376.5342,135.33025,34.87812,15.03669,6.29289,24.47273,246.13638,56.35856,3.21327
std,117.27508,27.33583,19.38038,3.03731,1.35276,9.16575,93.92264,12.21409,1.02763
min,236.8492,75.64122,10.2685,8.40458,4.18225,11.25829,120.13,39.66,1.85349
25%,316.09415,127.64125,23.62964,14.18237,5.22905,19.06985,176.58055,45.85537,2.44834
50%,326.81905,133.37395,31.10388,14.81932,6.67609,22.24039,241.66255,60.56481,3.17201
75%,416.24435,152.42108,40.58051,16.93567,7.12775,31.09789,337.26127,64.26294,3.56851
max,649.576,170.7775,75.25994,18.97528,8.03736,40.23982,352.9491,74.38944,5.33774


In [78]:
result_w_filter_fat_100_vector = result_w_filter_vector_10_fat[0:100].copy()
result_w_filter_fat_100_vector['pos'] = np.arange(len(result_w_filter_fat_100_vector))
result_w_filter_fat_100_vector = result_w_filter_fat_100_vector.reset_index()
result_w_filter_fat_100_vector = result_w_filter_fat_100_vector.set_index('index')

In [79]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(result_w_filter_fat_100_vector.index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_fat_vector = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_fat_vector.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_fat_vector.columns = nutrition_fat_vector.columns.droplevel(0)

# entferne alle NA
nutrition_fat_vector = nutrition_fat_vector.dropna()

In [80]:
# euklidische distanz für nährwerte

result_array = cdist(recommenden_nut_low_fat, nutrition_fat_vector, 'minkowski', p=2, w=weighted_fat)
euclid_distance = pd.DataFrame(
    result_array, columns=nutrition_fat_vector.index.values, index=recommenden_nut_low_fat.index.values)

euclid_distance_sum_fat_vector = pd.DataFrame(euclid_distance.sum(), columns=['euclid_distance_sum'])
euclid_distance_sum_fat_vector = euclid_distance_sum_fat_vector.sort_values(by='euclid_distance_sum')
euclid_distance_sum_fat_vector[0:10]

Unnamed: 0,euclid_distance_sum
222635,2656.70711
22751,2884.62922
218720,3205.85598
73964,3654.74357
47764,3694.71307
125646,3990.04083
51850,4309.87153
213108,4646.40433
213742,4690.02647
86515,4755.09357


In [81]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(euclid_distance_sum_fat_vector[0:10].index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_top10_fat_vector = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_top10_fat_vector.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_top10_fat_vector.columns = nutrition_top10_fat_vector.columns.droplevel(0)

# entferne alle NA
nutrition_top10_fat_vector = nutrition_top10_fat_vector.dropna()

In [82]:
nutrition_top10_fat_vector.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,220.52175,331.05631,132.62326,28.41211,63.10973,3.76223,14.73592,77.04844,3.3878,53.94445,8.25789,551.8501,21.78478,6.02939,635.90571,5.13299,0.23259,1543.57442,0.37947,27.14741
std,74.32726,59.43686,39.81566,15.33014,30.57142,2.63906,4.42396,38.64705,1.24772,21.86293,1.51348,147.3885,5.28264,2.71222,234.36754,2.16354,0.1082,1142.29737,0.11479,21.30309
min,141.8622,225.3194,69.70808,10.2685,13.125,1.54934,7.74534,30.26573,2.27139,26.74829,6.38666,233.5959,13.60084,1.43604,251.829,1.75411,0.09686,390.7916,0.17184,3.5675
25%,157.40613,322.22235,106.55367,16.8523,54.67246,2.15752,11.83929,51.66255,2.76399,45.23232,7.25584,502.0589,19.92655,4.59974,523.47958,3.64864,0.16514,657.50013,0.32859,6.43829
50%,205.8262,342.15345,133.92645,27.30739,69.31846,2.71036,14.88071,68.84002,3.00156,48.58591,7.63795,532.84835,22.02222,6.31056,623.1429,5.27446,0.22213,1021.4543,0.38966,32.3401
75%,264.83247,374.03538,155.57768,30.44074,75.77294,4.60132,17.28641,102.38766,3.52882,57.63681,9.00264,667.16242,24.22196,8.07046,834.67255,6.97998,0.28556,2611.8025,0.47795,39.1125
max,348.3654,406.0063,193.9669,54.42183,109.2033,10.10235,21.55188,155.8812,6.70979,110.1314,11.35271,742.503,31.03836,9.54182,939.8853,7.85019,0.40852,3368.707,0.49912,68.26264


In [83]:
nutrition_top10_fat_vector.index.to_numpy()

array([ 22751,  47764,  51850,  73964,  86515, 125646, 213108, 213742,
       218720, 222635])

In [108]:
new_df = nutrition_top10_fat_vector[['Calories', 'Calories from Fat', 'Carbohydrates', 'Fat', 'Saturated Fat', 'Protein', 'Calcium', 'Magnesium', 'Iron' ]]

In [109]:
new_df.describe()

name,Calories,Calories from Fat,Carbohydrates,Fat,Saturated Fat,Protein,Calcium,Magnesium,Iron
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,331.05631,132.62326,28.41211,14.73592,6.02939,21.78478,220.52175,53.94445,3.3878
std,59.43686,39.81566,15.33014,4.42396,2.71222,5.28264,74.32726,21.86293,1.24772
min,225.3194,69.70808,10.2685,7.74534,1.43604,13.60084,141.8622,26.74829,2.27139
25%,322.22235,106.55367,16.8523,11.83929,4.59974,19.92655,157.40613,45.23232,2.76399
50%,342.15345,133.92645,27.30739,14.88071,6.31056,22.02222,205.8262,48.58591,3.00156
75%,374.03538,155.57768,30.44074,17.28641,8.07046,24.22196,264.83247,57.63681,3.52882
max,406.0063,193.9669,54.42183,21.55188,9.54182,31.03836,348.3654,110.1314,6.70979


In [84]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(result_w_filter_10_fat.index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

random_nut_fat_naiv = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

random_nut_fat_naiv.set_index('id', inplace=True)

# remove multiindex 'amount'
random_nut_fat_naiv.columns = random_nut_fat_naiv.columns.droplevel(0)

# entferne alle NA
random_nut_fat_naiv = random_nut_fat_naiv.dropna()


In [85]:
asd = random_nut_fat_naiv.sort_values(by='Fat')

In [86]:
asd[0:10].describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,38.70887,186.72419,8.5929,29.76577,38.54453,2.73469,0.95477,44.79653,2.01912,39.79316,6.3333,482.15195,14.43233,0.18719,699.62336,7.82159,0.19012,950.15627,0.41897,17.61181
std,13.91447,88.90485,3.00284,25.51754,54.26492,2.79798,0.33365,47.34177,1.29271,22.17036,3.59982,462.21356,10.36769,0.07805,425.08936,10.32595,0.22016,1004.53875,0.32838,20.78962
min,22.2465,67.54216,1.323,1.9255,0.0,0.26294,0.147,4.35167,0.35016,8.685,0.54284,152.8917,0.7505,0.01992,183.8796,0.58219,0.00985,7.38,0.06808,1.40368
25%,29.03908,144.54335,8.60387,12.03016,0.0,1.03581,0.95599,12.82424,1.0943,29.74379,3.95787,185.0815,7.18936,0.14782,384.3189,3.15031,0.04068,332.61125,0.18691,6.52157
50%,37.62199,165.2826,9.6192,24.0673,21.98067,1.64616,1.0688,25.03194,1.9947,36.51286,6.34558,313.9214,11.93949,0.19185,606.5583,3.95583,0.0793,665.54215,0.31446,9.33288
75%,46.88089,215.7368,9.96918,42.23227,57.18543,2.73951,1.10769,64.98426,2.25283,42.8675,8.36965,537.11617,19.40237,0.24751,882.4669,5.24654,0.23163,1124.8705,0.56084,20.23434
max,68.01366,397.88,11.42801,88.31117,172.575,8.96175,1.26978,134.8421,4.33315,91.76,12.94783,1587.047,31.80096,0.27537,1469.935,34.79835,0.59393,3392.832,1.1023,72.693


In [87]:
random_nut_fat = nutrition_db2.sort_values(by='Fat')

In [88]:
random_nut_fat[0:10].index.to_numpy()

array([ 14725,  53194, 223269, 216688,  23444,  12768,  50939,  99480,
        13963,  19478])

In [89]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(random_nut_fat[0:10].index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

random_nut_fat = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

random_nut_fat.set_index('id', inplace=True)

# remove multiindex 'amount'
random_nut_fat.columns = random_nut_fat.columns.droplevel(0)

# entferne alle NA
random_nut_fat = random_nut_fat.dropna()

In [90]:
random_nut_fat.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,38.70887,186.72419,8.5929,29.76577,38.54453,2.73469,0.95477,44.79653,2.01912,39.79316,6.3333,482.15195,14.43233,0.18719,699.62336,7.82159,0.19012,950.15627,0.41897,17.61181
std,13.91447,88.90485,3.00284,25.51754,54.26492,2.79798,0.33365,47.34177,1.29271,22.17036,3.59982,462.21356,10.36769,0.07805,425.08936,10.32595,0.22016,1004.53875,0.32838,20.78962
min,22.2465,67.54216,1.323,1.9255,0.0,0.26294,0.147,4.35167,0.35016,8.685,0.54284,152.8917,0.7505,0.01992,183.8796,0.58219,0.00985,7.38,0.06808,1.40368
25%,29.03908,144.54335,8.60387,12.03016,0.0,1.03581,0.95599,12.82424,1.0943,29.74379,3.95787,185.0815,7.18936,0.14782,384.3189,3.15031,0.04068,332.61125,0.18691,6.52157
50%,37.62199,165.2826,9.6192,24.0673,21.98067,1.64616,1.0688,25.03194,1.9947,36.51286,6.34558,313.9214,11.93949,0.19185,606.5583,3.95583,0.0793,665.54215,0.31446,9.33288
75%,46.88089,215.7368,9.96918,42.23227,57.18543,2.73951,1.10769,64.98426,2.25283,42.8675,8.36965,537.11617,19.40237,0.24751,882.4669,5.24654,0.23163,1124.8705,0.56084,20.23434
max,68.01366,397.88,11.42801,88.31117,172.575,8.96175,1.26978,134.8421,4.33315,91.76,12.94783,1587.047,31.80096,0.27537,1469.935,34.79835,0.59393,3392.832,1.1023,72.693


In [91]:
asd[0:50]

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
14725,24.03237,67.54216,1.323,18.46258,0.0,0.936,0.147,4.35167,0.35016,8.685,0.54284,152.8917,0.7505,0.01992,337.3949,15.7245,0.03643,285.3,0.06808,18.40447
53194,49.3275,397.88,5.74515,88.31117,0.0,8.96175,0.63835,116.2275,4.33315,91.76,8.31711,1587.047,10.68192,0.14438,604.135,2.96258,0.57003,7.38,1.1023,72.693
223269,37.0575,231.435,8.6022,37.917,172.575,1.08475,0.9558,4.3875,3.99262,30.9375,6.65573,183.105,18.63247,0.25577,1469.935,34.79835,0.02744,1199.125,0.12256,5.75887
216688,22.2465,118.9055,8.60886,13.64523,46.748,1.15844,0.95654,25.0495,1.0777,23.977,5.88574,314.956,13.19706,0.27537,244.447,5.25938,0.06999,474.545,0.39121,20.8443
23444,38.18649,209.8754,9.39131,44.33629,0.0,2.80772,1.04348,75.07356,2.05814,34.38844,3.31525,185.7478,5.09345,0.15813,877.0996,3.7135,0.21408,3392.832,0.23771,25.25806
12768,28.54063,142.4127,9.84708,1.9255,60.66458,0.26294,1.09412,11.49708,0.89822,40.17102,8.38717,606.386,29.66508,0.13887,183.8796,0.58219,0.00985,143.7934,0.56049,6.59186
50939,39.96267,158.6358,9.87864,11.49181,43.92533,1.0195,1.09763,34.71634,1.93125,43.76633,12.94783,329.3067,19.659,0.27302,1261.014,5.20803,0.08861,757.764,0.56095,6.49813
99480,30.53441,217.6906,9.99935,43.6707,0.036,2.53486,1.11104,134.8421,2.30272,38.63728,6.03543,184.8594,7.88552,0.21036,525.0909,2.05569,0.59393,902.107,0.1807,1.40368
13963,68.01366,150.9353,11.1054,29.67201,0.0,6.44711,1.23393,25.01438,2.10318,29.34588,2.40721,312.8868,6.95731,0.22275,608.9816,3.93303,0.23748,1765.396,0.20554,10.09619
19478,49.18696,171.9294,11.42801,8.22539,61.49643,2.13388,1.26978,16.80571,1.14409,56.26312,8.83872,964.3331,31.80096,0.17333,884.256,3.97864,0.05341,573.3203,0.76017,8.56958


In [92]:
asd[0:10].index.to_numpy()

array([ 14725,  53194, 223269, 216688,  23444,  12768,  50939,  99480,
        13963,  19478])

In [93]:
nutrition_fat_naiv.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,193.58762,542.33307,280.92923,35.24085,126.84912,3.11108,31.21436,82.54184,3.74909,51.49306,12.97124,588.9095,30.01276,13.91145,878.25259,5.90811,0.34132,1067.70516,0.46588,20.47902
std,152.15803,233.04451,142.35312,28.64292,53.88096,3.21868,15.81701,66.17241,1.6952,21.31506,5.61848,280.87722,11.85509,8.19141,536.23257,4.45822,0.29503,1583.40654,0.23021,29.69788
min,21.16217,178.0237,81.26787,3.96258,42.622,0.0525,9.02976,5.91633,0.63185,14.54606,3.28438,97.71526,7.24539,2.95201,90.84255,0.35572,0.02349,24.77542,0.15045,0.05513
25%,86.60862,347.4664,163.6954,18.63982,77.00405,1.38853,18.18838,38.24905,2.52238,35.8792,9.08077,384.01743,21.36922,6.9111,559.1054,2.95941,0.14412,327.33435,0.28597,2.29055
50%,135.24265,521.0789,257.11545,29.01693,121.8922,2.31647,28.56839,61.06267,3.41979,48.47376,12.01238,544.8653,27.71758,12.81584,770.0264,4.64207,0.254,708.12,0.43604,6.28739
75%,270.9939,671.0417,358.3751,44.43137,162.93713,3.81326,39.81945,109.30135,4.77342,64.83919,16.13937,735.3778,37.10145,17.99064,1057.4165,7.94706,0.41822,1275.62625,0.57877,23.21081
max,805.7242,1828.192,669.4866,236.7205,255.2775,27.83175,74.38741,356.0612,11.15067,122.8545,34.42963,1621.572,73.59243,37.37759,2885.613,21.95344,1.56088,12034.2,1.15913,110.5156


In [94]:
subset_fat_asd = nutrition_fat_naiv[(nutrition_fat_naiv['Fat'] >= 10) & (nutrition_fat_naiv['Fat'] <= 15)].sample(n=8).copy()

In [115]:
subset_fat_asd.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,162.80304,334.50117,123.01987,32.78413,67.18158,4.00545,13.66888,91.52332,3.08246,48.43924,8.95656,608.06613,20.27056,5.84605,617.81806,6.73912,0.2728,1990.47048,0.43365,32.21983
std,118.70006,86.45515,11.0902,19.87579,11.43892,2.94701,1.23224,70.15791,1.11092,20.77865,2.33444,464.87719,4.59847,1.50548,349.37536,4.284,0.21137,4074.32996,0.31372,35.18379
min,42.2215,200.5096,96.83125,8.50724,53.31907,0.18479,10.75903,19.02514,1.61694,22.19329,6.48622,202.2782,11.25829,3.22836,132.1991,2.23212,0.06435,124.5866,0.16549,0.24869
25%,88.1635,287.39178,122.28388,19.57224,55.15042,2.41122,13.5871,54.77253,2.16691,37.15158,7.09761,287.17927,18.83971,5.27881,422.6006,3.92589,0.1452,405.76583,0.24251,1.50123
50%,117.79295,338.0464,127.1827,27.61447,71.05518,2.62511,14.13142,63.03995,3.1824,47.48286,8.56762,491.94015,21.3981,5.76511,665.69385,6.00521,0.20866,592.38995,0.35662,19.70184
75%,212.52058,394.0836,128.54217,54.38408,72.83828,6.15036,14.28247,106.04683,3.77064,54.7819,10.08627,733.3768,22.93564,6.74809,836.35553,8.35532,0.3526,786.9942,0.50485,61.95011
max,348.3654,451.0129,131.3139,55.97997,83.48,8.341,14.59043,232.0606,4.67316,88.1089,13.20469,1621.572,25.49099,8.03736,1054.259,15.31507,0.71992,12034.2,1.12891,85.117


In [116]:
new_df = subset_fat_asd[['Calories', 'Calories from Fat', 'Carbohydrates', 'Fat', 'Saturated Fat', 'Protein', 'Calcium', 'Magnesium', 'Iron' ]]

In [118]:
new_df.describe()

name,Calories,Calories from Fat,Carbohydrates,Fat,Saturated Fat,Protein,Calcium,Magnesium,Iron
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,334.50117,123.01987,32.78413,13.66888,5.84605,20.27056,162.80304,48.43924,3.08246
std,86.45515,11.0902,19.87579,1.23224,1.50548,4.59847,118.70006,20.77865,1.11092
min,200.5096,96.83125,8.50724,10.75903,3.22836,11.25829,42.2215,22.19329,1.61694
25%,287.39178,122.28388,19.57224,13.5871,5.27881,18.83971,88.1635,37.15158,2.16691
50%,338.0464,127.1827,27.61447,14.13142,5.76511,21.3981,117.79295,47.48286,3.1824
75%,394.0836,128.54217,54.38408,14.28247,6.74809,22.93564,212.52058,54.7819,3.77064
max,451.0129,131.3139,55.97997,14.59043,8.03736,25.49099,348.3654,88.1089,4.67316


In [96]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(new_recipe_db_wo_userrecipes_fat.sample(n=10, random_state=0).index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_random_10 = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_random_10.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_random_10.columns = nutrition_random_10.columns.droplevel(0)

# entferne alle NA
nutrition_random_10 = nutrition_random_10.dropna()

In [97]:
nutrition_random_10.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,137.90247,451.04469,211.85386,31.49681,82.84401,2.89383,23.53932,74.5543,3.44682,51.44868,12.6177,499.74915,28.10035,9.71581,719.19517,5.93412,0.27628,797.43848,0.47752,22.14323
std,128.19425,128.11832,112.91222,19.86946,23.58573,2.13513,12.5458,63.62439,1.83572,25.37871,4.65311,228.03533,6.53025,10.16515,447.6618,4.63905,0.22714,634.90664,0.12033,24.8011
min,32.86518,279.2421,108.2086,2.57644,41.49303,0.31548,12.02318,11.60304,1.22664,27.07817,7.11536,262.1083,21.15413,2.13052,82.90843,0.1345,0.08658,79.34875,0.27633,0.0
25%,45.73175,371.96933,135.81108,18.95888,68.72561,1.3943,15.09012,33.66391,1.97259,33.18492,8.49876,340.3263,22.94277,4.65956,304.59725,2.14126,0.16069,320.54398,0.38944,5.34257
50%,69.95123,428.61385,178.1548,29.86084,78.14209,2.72588,19.79498,53.02626,3.44418,38.86832,12.86435,400.11105,26.50746,7.84864,814.1019,6.20918,0.17573,718.8144,0.5167,11.51254
75%,195.9075,497.28538,240.62243,43.23842,99.89125,3.86521,26.73582,90.83728,4.3505,66.89249,15.29302,699.12432,31.21765,9.31209,1118.098,7.57461,0.32586,969.5833,0.55993,37.01577
max,376.1217,654.7633,490.6887,66.52579,118.39,7.56122,54.52096,199.0158,7.47058,100.5429,20.54744,874.2894,41.70995,37.4432,1285.509,13.47391,0.83562,2185.772,0.62032,72.18467


In [98]:
new_df = nutrition_random_10[['Calories', 'Calories from Fat', 'Carbohydrates', 'Fat', 'Saturated Fat', 'Protein', 'Calcium', 'Magnesium', 'Iron' ]]

In [99]:
new_df.describe()

name,Calories,Calories from Fat,Carbohydrates,Fat,Saturated Fat,Protein,Calcium,Magnesium,Iron
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,451.04469,211.85386,31.49681,23.53932,9.71581,28.10035,137.90247,51.44868,3.44682
std,128.11832,112.91222,19.86946,12.5458,10.16515,6.53025,128.19425,25.37871,1.83572
min,279.2421,108.2086,2.57644,12.02318,2.13052,21.15413,32.86518,27.07817,1.22664
25%,371.96933,135.81108,18.95888,15.09012,4.65956,22.94277,45.73175,33.18492,1.97259
50%,428.61385,178.1548,29.86084,19.79498,7.84864,26.50746,69.95123,38.86832,3.44418
75%,497.28538,240.62243,43.23842,26.73582,9.31209,31.21765,195.9075,66.89249,4.3505
max,654.7633,490.6887,66.52579,54.52096,37.4432,41.70995,376.1217,100.5429,7.47058


In [100]:
nutrition_random_10.index.to_numpy()

array([  8600,  16849,  26615,  27819,  30794,  75672, 142951, 158429,
       177497, 216928])

In [101]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(user_recipes_fat.index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_user_recipes_fat = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_user_recipes_fat.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_user_recipes_fat.columns = nutrition_user_recipes_fat.columns.droplevel(0)

# entferne alle NA
nutrition_user_recipes_fat = nutrition_user_recipes_fat.dropna()

In [102]:
new_df = nutrition_user_recipes_fat[['Calories', 'Calories from Fat', 'Carbohydrates', 'Fat', 'Saturated Fat', 'Protein', 'Calcium', 'Magnesium', 'Iron' ]]

In [103]:
new_df.describe()

name,Calories,Calories from Fat,Carbohydrates,Fat,Saturated Fat,Protein,Calcium,Magnesium,Iron
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,563.83977,334.64292,24.81982,37.18255,15.10086,32.44829,144.61405,62.11656,3.07888
std,92.24917,14.74709,16.54758,1.63857,5.02136,10.22158,126.89385,32.62118,1.33454
min,390.7198,315.5125,1.27936,35.05695,5.39091,14.9787,22.91706,21.99075,0.58999
25%,500.82482,321.0012,13.55052,35.6668,12.2637,24.81829,57.0225,44.04216,2.2158
50%,549.52315,335.2659,24.81457,37.25176,14.09328,32.584,112.8676,54.51962,3.00261
75%,606.45907,347.85147,30.89026,38.65017,18.68514,38.42296,159.3108,70.92839,4.16262
max,754.5421,356.9908,68.65463,39.66564,22.90133,51.97705,448.9153,148.3127,5.45481


In [104]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(result_w_filter_10_fat[0:10].index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_jaccard_fat = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_jaccard_fat.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_jaccard_fat.columns = nutrition_jaccard_fat.columns.droplevel(0)

# entferne alle NA
nutrition_jaccard_fat = nutrition_jaccard_fat.dropna()