In [244]:
import json

import numpy as np
import pandas as pd
import pymongo
import scipy
from bson import ObjectId, json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient as Connection
from scipy import sparse, spatial
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# Disable scientific notation for small numbers: floats are rendered with
# five decimal places.
pd.options.display.float_format = '{:.5f}'.format

# Display the result of every expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [245]:
# display multiple outputs in one row
import pandas as pd
import numpy as np
from IPython.display import display, HTML

# Stylesheet rule for the notebook's ".output" container: lay its children
# out in a row so several displayed outputs appear side by side.
CSS = """
.output {
    flex-direction: row;
}
"""

# Last expression of the cell: the <style> element is rendered into the page.
HTML(f'<style>{CSS}</style>')

In [246]:
# Connect with the driver defaults (no host/port is passed) and select the
# collection that holds the recipes.
connection = Connection()
db = connection["recipe_db"]
input_data = db["recipes_test_copy"]

# Dump every document of the collection to JSON text with bson.json_util and
# parse that text back, so `data` holds plain JSON-compatible Python objects.
all_recipes_json = json_util.dumps(input_data.find())
data = json.loads(all_recipes_json)

In [247]:
# One row per entry of each recipe's 'ingredients' list.  Columns taken from
# the ingredient record get the prefix "ingredients_"; the recipe 'id' is
# carried along as metadata.
ing = pd.DataFrame(
    json_normalize(
        data,
        record_path='ingredients',
        meta='id',
        record_prefix='ingredients_',
        errors='ignore',
    )
)

# One row per entry of each recipe's 'nutritions' list, together with the
# recipe-level metadata fields listed below.
nutrition_meta_fields = ['id', 'prep_time', 'rating', 'rating_count', 'ready_in_time', 'review_count']
nutritions = pd.DataFrame(
    json_normalize(
        data,
        record_path='nutritions',
        meta=nutrition_meta_fields,
    )
)


In [248]:
#  ------  creation and data cleansing - ingredients

# Index the ingredient rows by recipe ('id') and ingredient ('ingredients_id').
ingredients = ing.set_index(['id', 'ingredients_id'])

# All rows (including their recipe ids) of ingredients that occur five times
# or fewer in the whole data set.
ingredients_eqles_5_ing = (
    ingredients
    .groupby('ingredients_id')
    .filter(lambda rows: not len(rows) > 5)
)

# Drop every recipe that uses at least one of those rarely used ingredients.
recipes_with_rare_ingredient = ingredients_eqles_5_ing.index.get_level_values('id').values
ingredients_filt = ingredients.drop(recipes_with_rare_ingredient, level=0)

# Drop every recipe that contains a row with ingredients_id == 0.
has_zero_ingredient_id = ingredients_filt.index.get_level_values('ingredients_id') == 0
ingredients_eqal_zero = ingredients_filt[has_zero_ingredient_id]
recipes_with_zero_ingredient = ingredients_eqal_zero.index.get_level_values('id').values
ingredients_filt = ingredients_filt.drop(recipes_with_zero_ingredient, level=0)


In [249]:
#  ------ creation and cleansing of the nutrition dataframe

# Keep only the nutrition rows of recipes that survived the ingredient
# filtering (ingredients_filt).
surviving_recipe_ids = ingredients_filt.index.get_level_values('id').values
id_overlap_mask = nutritions['id'].isin(surviving_recipe_ids)
nutritions_filt = nutritions.loc[id_overlap_mask]

# One row per recipe id, one column per nutrient name, filled from 'amount'.
nutrition_db = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
)

# values=['amount'] leaves an outer 'amount' column level; remove it.
nutrition_db.columns = nutrition_db.columns.droplevel(0)

# Remove every recipe that has a missing nutrient value.
nutrition_db = nutrition_db.dropna()

# Re-align the ingredient rows with the nutrition frame, because dropna()
# above may have removed further recipes.
id_overlap_mask = ingredients_filt.index.get_level_values('id').isin(nutrition_db.index)
ingredients_db = ingredients_filt[id_overlap_mask]

# Original note: compare whether the nutrition and the ingredient frame
# contain the same number of recipe ids -- no such check is implemented here.


In [250]:


# Turn the ('id', 'ingredients_id') index levels back into ordinary columns.
# The name is rebound instead of calling reset_index(inplace=True), because
# ingredients_db was produced by boolean-mask selection and should not be
# mutated in place.
ingredients_db = ingredients_db.reset_index()

# Recipe x ingredient indicator matrix: one indicator column per
# ingredients_id, reduced with the per-recipe maximum so a cell is set when
# the recipe contains that ingredient at least once.
# .max() is called explicitly: the previous .apply(max) handed the Python
# builtin to pandas, which only worked because pandas silently substituted
# DataFrameGroupBy.max for it (deprecated since pandas 2.1; called directly,
# builtin max() on a DataFrame would return the largest column label).
recipe_db = pd.get_dummies(ingredients_db['ingredients_id']).groupby(
    ingredients_db['id']).max()



In [251]:
new_ingredients = ingredients_db.copy()

# Total ingredient weight in grams per recipe: columns 'id' and
# 'ingredients_grams'.
gramms_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index().copy()

# Tukey fences (1.5 * IQR) computed on the weight column ONLY.  The earlier
# version took the quantiles of the whole frame, which also contains the
# recipe 'id' column, so an identifier could take part in the outlier test
# (or make quantile() fail when 'id' is not numeric).
grams_only = gramms_ingredients[['ingredients_grams']]
Q1 = grams_only.quantile(0.25)
Q3 = grams_only.quantile(0.75)
IQR = Q3 - Q1

# Filter out all recipes whose total weight (grams) is an outlier.
is_weight_outlier = (
    (grams_only < (Q1 - 1.5 * IQR)) | (grams_only > (Q3 + 1.5 * IQR))
).any(axis=1)
df = gramms_ingredients[~is_weight_outlier].copy()

# Keep only recipes whose weight lies strictly between 500 and 2373.58225
# grams.  Explicit comparisons are used instead of
# between(..., inclusive=False): the boolean form of `inclusive` is rejected
# by newer pandas, while the string form is unknown to older pandas.
recipe_grams = df['ingredients_grams']
df_start_at_fivehundret = df[(recipe_grams > 500) & (recipe_grams < 2373.58225)].copy()
df_start_at_fivehundret = df_start_at_fivehundret.set_index('id')

# Nutrition rows of the recipes that passed the weight filter.
id_overlap_mask = nutritions['id'].isin(df_start_at_fivehundret.index.get_level_values('id').values)
nutritions_filt_gramm = nutritions.loc[id_overlap_mask]

# New nutrition frame based on the filtering above: one row per recipe id,
# one column per nutrient name, filled from 'amount'.
nutrition_db2 = nutritions_filt_gramm.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()
nutrition_db2 = nutrition_db2.set_index('id')

# values=['amount'] leaves an outer 'amount' column level; remove it.
nutrition_db2.columns = nutrition_db2.columns.droplevel(0)



In [252]:
# Random baseline: 10 recipes drawn from the filtered nutrition frame.
# A fixed random_state makes the draw reproducible on a fresh kernel.
subset_random_top10 = nutrition_db2.sample(n=10, random_state=0).copy()

In [253]:
# Select 40 random recipes whose 'Fat' value lies in the range 35-40 (both
# bounds included).  The previous comment said 20, but the code draws 40.
# A fixed random_state makes the draw reproducible on a fresh kernel.
fat_in_range = (nutrition_db2['Fat'] >= 35) & (nutrition_db2['Fat'] <= 40)
subset_fat = nutrition_db2[fat_in_range].sample(n=40, random_state=1).copy()
subset_fat_id = subset_fat.index.to_numpy()


In [254]:
# 40 random recipes without any nutrient restriction.  The fixed random_state
# makes the draw reproducible; it differs from the seeds of the other
# sampling cells so the draws are not prefixes of one another.
subset_normal = nutrition_db2.sample(n=40, random_state=2).copy()
subset_normal_id = subset_normal.index.to_numpy()

In [255]:
subset_normal.index.to_numpy()

array([ 13890,  58348, 240593,  74188,  22671, 148818,  46859,  72613,
       244507,  69754, 223058,  72712,   9037,  75115, 212511,   8990,
       143105,  20618,  72804,  92525,  71251, 219005,  94113, 213497,
        16566,  18883,  23691,  86415,  37022, 231533, 214617,  21076,
        18130,  24851,  17807,   8758,   9143,  21317, 240502,  25146])

In [256]:
subset_fat.index.to_numpy()

array([236703,  87137,   8855,  15024,  82487, 223596,  14610,  23881,
       223005,  16794,  48873, 235171,   8998, 199688,  19484,  17022,
        21352, 148970,   8693, 219766,  69538, 132511,  52299,  18059,
       147305,  11757,   8639,  16756, 156232,  84774, 231537, 213748,
       219173,   8772,  14592,  17496, 169974, 151997,  17169,  16429])

In [257]:
# Keep only the recipes of the ingredient matrix that also have a row in the
# nutrition frame nutrition_db2.
nutrition_recipe_ids = nutrition_db2.index.values
mask = recipe_db.index.isin(nutrition_recipe_ids)
recipe_db = recipe_db[mask]


In [258]:
subset_fat.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,157.72578,610.56563,336.51414,35.10243,134.21295,2.99279,37.39046,65.66339,4.35112,56.3408,14.91191,590.54623,34.33186,14.064,1106.41721,10.3348,0.25431,1210.00575,0.55539,22.7023
std,128.9609,132.43712,13.06915,29.81708,61.83977,3.17401,1.45213,60.02029,4.52088,28.48468,7.85325,246.25798,11.76542,4.43835,798.46029,17.90307,0.23677,1068.02383,0.27447,31.22216
min,30.76337,425.0375,315.4795,1.23348,0.0,0.0,35.05328,3.39608,0.52164,3.57022,0.1561,198.2903,8.75428,5.3459,198.2495,0.09294,0.00777,92.58968,0.01814,1.4075
25%,56.88125,505.8767,323.10667,13.61048,82.35536,1.12808,35.90074,22.78383,2.79012,38.9638,9.04509,443.26707,25.29728,11.45838,670.73403,2.93199,0.1286,502.66787,0.3596,3.5761
50%,115.25615,587.591,338.01915,26.9701,133.074,1.8379,37.55768,49.44646,3.62218,52.35764,14.3532,514.5145,34.1375,13.53264,943.48605,4.57093,0.18117,824.65815,0.47314,8.28085
75%,220.64058,676.27487,346.75815,47.96036,174.77125,3.83399,38.52868,86.26763,5.11716,61.80344,19.86615,751.3328,43.44811,16.66782,1398.0685,10.15844,0.33688,1450.031,0.77755,24.86832
max,534.9875,905.355,358.7781,107.0733,267.995,14.87823,39.86424,297.4165,30.22156,138.7532,33.33672,1406.06,59.51241,23.50516,4604.161,104.8181,1.3139,4561.219,1.25088,147.357


In [259]:
subset_normal.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,178.86088,408.39294,179.16069,29.95357,90.4045,2.25344,19.90674,58.06536,2.82201,45.92398,10.84935,456.49066,26.43781,7.79308,881.00683,6.96501,0.30564,1015.66741,0.41716,11.80128
std,166.92805,110.22633,67.30888,19.07784,49.1823,1.85985,7.47876,50.31382,1.38303,19.76118,5.4925,175.35695,9.79886,4.4035,646.8222,7.49054,0.27236,1607.27076,0.2542,23.59605
min,5.95,227.0917,36.95806,0.0,9.48833,0.0,4.10645,0.82311,0.7529,14.17172,0.96885,168.7625,5.40403,2.11712,58.50833,0.0,0.01469,0.0,0.07379,0.0
25%,49.26274,319.14442,130.33095,15.16033,66.04429,0.86444,14.48121,14.57852,1.8649,29.52328,7.19732,328.298,20.55757,4.46101,362.2348,2.8715,0.09049,316.00322,0.24017,2.70574
50%,128.9041,411.3816,175.06245,28.18462,78.21771,1.71311,19.45139,42.36745,2.46431,44.61657,9.73675,430.96845,24.82977,6.80382,809.4858,4.20962,0.1986,578.6398,0.34042,4.82702
75%,250.50267,477.81553,227.21825,43.35413,101.90696,2.90374,25.24648,99.57001,3.82829,60.44118,13.36158,568.87323,34.5045,11.36834,1116.121,7.89653,0.46566,988.02458,0.54473,9.62192
max,676.9761,723.6077,337.9552,77.18266,258.78,8.065,37.55058,158.8797,7.87272,91.56936,26.91518,821.561,49.29967,19.2981,3276.487,33.53006,1.26143,8619.632,1.10675,140.9415


In [260]:
# Ingredient rows of the recipes that make up the two user profiles.
in_fat_profile = recipe_db.index.isin(subset_fat_id)
in_normal_profile = recipe_db.index.isin(subset_normal_id)
user_recipes_fat = recipe_db.loc[in_fat_profile]
user_recipes_normal = recipe_db.loc[in_normal_profile]

In [261]:
#### temporary override ####
# Two fixed lists of 20 recipe ids each replace the user profiles that were
# built from subset_fat_id / subset_normal_id.
temp_fat_20 = [
    16966, 14753, 62459, 18397, 8757, 52501, 8556, 240522,
    86628, 221304, 223596, 51653, 48921, 142220, 76763, 222509,
    17496, 14710, 86860, 86813,
]

temp_normal_20 = [
    36994, 16741, 14622, 24080, 11720, 8994, 24509, 8949,
    217899, 15225, 8549, 220214, 21140, 230735, 140135, 27946,
    199382, 109782, 30794, 22364,
]

user_recipes_fat = recipe_db.loc[recipe_db.index.isin(temp_fat_20)]
user_recipes_normal = recipe_db.loc[recipe_db.index.isin(temp_normal_20)]
#### temporary override ####



In [262]:
user_recipes_fat

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8757,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14710,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14753,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16966,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48921,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51653,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [263]:
# The ten most common ingredients as (ingredients_id, number of uses, name).
top10_common_ingredients = [
    (16421, 2125, 'salt'),
    (4342, 1506, 'garlic'),
    (4397, 1412, 'onion'),
    (16406, 1185, 'ground black pepper'),
    (16157, 1016, 'butter'),
    (6307, 944, 'olive oil'),
    (6494, 786, 'skinless bone less chicken breast'),
    (2496, 749, 'water'),
    (16238, 574, 'grated Parmesan cheese'),
    (16317, 538, 'eggs'),
]

# Ids of those ingredients, in the same order.
drop_id_list = [ingredient_id for ingredient_id, _uses, _name in top10_common_ingredients]





In [264]:
subset_fat_id

array([236703,  87137,   8855,  15024,  82487, 223596,  14610,  23881,
       223005,  16794,  48873, 235171,   8998, 199688,  19484,  17022,
        21352, 148970,   8693, 219766,  69538, 132511,  52299,  18059,
       147305,  11757,   8639,  16756, 156232,  84774, 231537, 213748,
       219173,   8772,  14592,  17496, 169974, 151997,  17169,  16429])

In [265]:
subset_normal_id

array([ 13890,  58348, 240593,  74188,  22671, 148818,  46859,  72613,
       244507,  69754, 223058,  72712,   9037,  75115, 212511,   8990,
       143105,  20618,  72804,  92525,  71251, 219005,  94113, 213497,
        16566,  18883,  23691,  86415,  37022, 231533, 214617,  21076,
        18130,  24851,  17807,   8758,   9143,  21317, 240502,  25146])

In [266]:
# Remove the columns of the too-common ingredients (drop_id_list) from the
# recipe matrix.
new_recipe_db = recipe_db.drop(columns=drop_id_list)

# Candidate pools: the recipe matrix without the recipes that belong to the
# respective user profile.
#### temporary: the hard-coded temp_* id lists are used here ####
# The original variant for the fat profile dropped subset_fat_id instead:
#new_recipe_db_wo_userrecipes_fat = new_recipe_db.drop(axis=0, labels=subset_fat_id)
new_recipe_db_wo_userrecipes_fat = new_recipe_db.drop(index=temp_fat_20)
new_recipe_db_wo_userrecipes_normal = new_recipe_db.drop(index=temp_normal_20)
#### temporary ####

In [267]:
# Remove the columns of the too-common basic ingredients listed in
# drop_id_list from both user profiles.
new_user_recipes_fat = user_recipes_fat.drop(columns=drop_id_list)
new_user_recipes_normal = user_recipes_normal.drop(columns=drop_id_list)

In [268]:
# Draw 20 rows of the fat user profile; the fixed random_state makes the
# resulting row order reproducible on a fresh kernel.
sample_fat = new_user_recipes_fat.sample(n=20, random_state=3).copy()


In [269]:
# Draw 20 rows of the normal user profile; the fixed random_state makes the
# resulting row order reproducible on a fresh kernel.
sample_normal = new_user_recipes_normal.sample(n=20, random_state=4).copy()

In [270]:
sample_normal
sample_fat

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24509,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27946,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
109782,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11720,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8949,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
140135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16741,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30794,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
220214,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15225,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8757,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86628,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223596,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48921,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
240522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
221304,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16966,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [271]:
## Jaccard - fat

# Pairwise Jaccard distances: one row per user recipe (sample_fat), one
# column per candidate recipe.
result_array = cdist(sample_fat, new_recipe_db_wo_userrecipes_fat,'jaccard')
result_w_filter_fat = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_fat.index.values, index=sample_fat.index.values)

# Per candidate: sum the distances to all user recipes and sort ascending.
result_w_filter_10_fat = pd.DataFrame(result_w_filter_fat.sum(), columns=['jaccard_distance_sum'])
result_w_filter_10_fat = result_w_filter_10_fat.sort_values(by='jaccard_distance_sum')

# Divide by the number of user recipes, so the column (still named
# 'jaccard_distance_sum') holds the mean distance.  The divisor is taken from
# sample_fat instead of the hard-coded 20, so it stays correct if the profile
# size changes.
result_w_filter_10_fat['jaccard_distance_sum'] = result_w_filter_10_fat['jaccard_distance_sum'].div(len(sample_fat))
result_w_filter_10_fat[0:10]


Unnamed: 0,jaccard_distance_sum
8547,0.92198
188706,0.93331
70513,0.94038
141125,0.94441
62256,0.94456
231808,0.94471
90160,0.94768
240773,0.94804
18442,0.94897
14614,0.94936


In [272]:
## Jaccard - normal

# Pairwise Jaccard distances: one row per user recipe (sample_normal), one
# column per candidate recipe.
result_array = cdist(sample_normal, new_recipe_db_wo_userrecipes_normal,'jaccard')
result_w_filter_normal = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_normal.index.values, index=sample_normal.index.values)

# Per candidate: sum the distances to all user recipes and sort ascending.
result_w_filter_10_normal = pd.DataFrame(result_w_filter_normal.sum(), columns=['jaccard_distance_sum'])
result_w_filter_10_normal = result_w_filter_10_normal.sort_values(by='jaccard_distance_sum')

# Divide by the number of user recipes, so the column (still named
# 'jaccard_distance_sum') holds the mean distance.  The divisor is taken from
# sample_normal instead of the hard-coded 20, so it stays correct if the
# profile size changes.
result_w_filter_10_normal['jaccard_distance_sum'] = result_w_filter_10_normal['jaccard_distance_sum'].div(len(sample_normal))
result_w_filter_10_normal[0:10]


Unnamed: 0,jaccard_distance_sum
15502,0.92905
65671,0.93429
178809,0.93591
21699,0.93597
232458,0.93654
82693,0.93869
20456,0.93879
21643,0.93901
86047,0.94013
47006,0.94082


In [273]:
# userrecipes fat
sample_fat.index.to_numpy()

array([ 18397,   8757,  86628, 223596,  48921,  17496, 240522, 221304,
        16966,  52501,  62459,  86813,  14710, 222509,  51653,   8556,
        86860,  14753,  76763, 142220])

In [274]:
# userrecipes normal
sample_normal.index.to_numpy()


array([ 24509,  27946, 109782,  11720,   8949, 140135,  16741,  30794,
       220214,  15225,   8549,  22364, 199382, 217899,  24080,  14622,
       230735,  21140,  36994,   8994])

In [275]:
result_w_filter_10_fat[0:10].index.to_numpy()

array([  8547, 188706,  70513, 141125,  62256, 231808,  90160, 240773,
        18442,  14614])

In [276]:
result_w_filter_10_normal[0:10].index.to_numpy()

array([ 15502,  65671, 178809,  21699, 232458,  82693,  20456,  21643,
        86047,  47006])

In [277]:
# Independent copies of both distance matrices for the "short" ranking below.
result_w_filter_fat_short, result_w_filter_normal_short = (
    result_w_filter_fat.copy(),
    result_w_filter_normal.copy(),
)

In [278]:
# top 10 fat short
# Sort the distances inside every candidate column in ascending order.  The
# original index labels are re-attached, but after the sort a row no longer
# belongs to the user recipe named by its label: row k simply holds each
# candidate's k-th smallest distance.
fat_short = pd.DataFrame(np.sort(result_w_filter_fat_short.values, axis=0), index=result_w_filter_fat_short.index, columns=result_w_filter_fat_short.columns)

# Order the candidate columns by the first row, i.e. by each candidate's
# smallest distance to any user recipe.
new_columns = fat_short.columns[fat_short.loc[fat_short.first_valid_index()].argsort()]
result_fat_short = fat_short[new_columns]
result_fat_short = result_fat_short.reset_index()
# Take row 0 -- the row the columns were ordered by, and the same row the
# "normal short" cell takes.  This cell previously took row 1.
result_fat_short = result_fat_short.loc[0].to_frame()

# First entry is the 'index' label added by reset_index(); the next ten are
# the ids of the ten closest candidates.
result_fat_short[0:11].index.to_numpy()


array(['index', 8547, 188706, 34361, 206120, 220716, 16372, 14525, 235948,
       11678, 25884], dtype=object)

In [279]:
fat_short[new_columns]

Unnamed: 0,8547,188706,34361,206120,220716,16372,14525,235948,11678,25884,...,24038,8933,17815,8938,13952,223218,232907,24160,87053,193219
18397,0.0,0.25,0.33333,0.33333,0.33333,0.46154,0.5,0.5,0.5,0.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8757,0.9,0.81818,0.875,0.875,0.875,0.8,0.91667,0.90909,0.88889,0.9,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
86628,0.90909,0.875,1.0,1.0,1.0,0.88235,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
223596,0.90909,0.9,1.0,1.0,1.0,0.95238,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
48921,0.92308,0.9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
17496,0.92857,0.92308,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
240522,0.92857,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
221304,0.94118,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
16966,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
52501,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [280]:
sample_fat

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8757,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86628,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223596,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48921,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
240522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
221304,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16966,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [281]:
# top 10 normal short
# Sort the distances inside every candidate column in ascending order and
# re-attach the original row and column labels.
sorted_normal_distances = np.sort(result_w_filter_normal_short.values, axis=0)
normal_short = pd.DataFrame(
    sorted_normal_distances,
    index=result_w_filter_normal_short.index,
    columns=result_w_filter_normal_short.columns,
)

# Order the candidate columns by the values of the first row.
first_row = normal_short.loc[normal_short.first_valid_index()]
new_columns = normal_short.columns[first_row.argsort()]

# Row 0 of the re-ordered frame as a one-column frame; its first entry is
# the 'index' label added by reset_index().
result_normal_short = normal_short[new_columns].reset_index().loc[0].to_frame()
result_normal_short[0:11].index.to_numpy()


array(['index', 15502, 14773, 45361, 178809, 86047, 23998, 139948, 19400,
       82693, 23711], dtype=object)

In [282]:
normal_short[new_columns]

Unnamed: 0,15502,14773,45361,178809,86047,23998,139948,19400,82693,23711,...,17576,75141,17625,74949,17745,230247,17862,206120,19919,143146
24509,0.0,0.4,0.4,0.5,0.5,0.5,0.5,0.5,0.5,0.55556,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
27946,0.8,0.875,0.75,0.8,0.84615,0.875,0.875,0.92857,0.66667,0.9,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
109782,0.91667,0.9,0.83333,0.84615,0.875,0.92308,0.88889,0.9375,0.75,0.91667,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
11720,0.92308,1.0,0.88889,0.84615,0.90909,1.0,1.0,1.0,0.85714,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8949,0.94118,1.0,1.0,0.91667,0.90909,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
140135,1.0,1.0,1.0,0.92857,0.90909,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
16741,1.0,1.0,1.0,0.93333,0.91667,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
30794,1.0,1.0,1.0,0.94737,0.9375,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
220214,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
15225,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [283]:
sample_fat.index.to_numpy()

array([ 18397,   8757,  86628, 223596,  48921,  17496, 240522, 221304,
        16966,  52501,  62459,  86813,  14710, 222509,  51653,   8556,
        86860,  14753,  76763, 142220])

In [284]:
sample_normal.index.to_numpy()

array([ 24509,  27946, 109782,  11720,   8949, 140135,  16741,  30794,
       220214,  15225,   8549,  22364, 199382, 217899,  24080,  14622,
       230735,  21140,  36994,   8994])

In [285]:
# Create a one-dimensional binary ingredient vector from the fat user profile.
user_one_vector_fat = sample_fat.copy()
# Append a row labelled 'sum' that holds the column-wise totals.
user_one_vector_fat.loc['sum',:]= user_one_vector_fat.sum(axis=0)
# Drop the individual recipe rows so that only the 'sum' row remains.
user_one_vector_fat = user_one_vector_fat.drop(axis=0, labels=sample_fat.index)
# Binarise: every positive total becomes 1.
user_one_vector_fat[user_one_vector_fat > 0] = 1

In [286]:
# Create a one-dimensional binary ingredient vector from the normal user profile.
user_one_vector_normal = sample_normal.copy()
# Append a row labelled 'sum' that holds the column-wise totals.
user_one_vector_normal.loc['sum',:]= user_one_vector_normal.sum(axis=0)
# Drop the individual recipe rows so that only the 'sum' row remains.
user_one_vector_normal = user_one_vector_normal.drop(axis=0, labels=sample_normal.index)
# Binarise: every positive total becomes 1.
user_one_vector_normal[user_one_vector_normal > 0] = 1

In [287]:
user_one_vector_normal
# Number of ingredient columns per distinct value in the profile vector.
# pd.Series.value_counts replaces the deprecated top-level pd.value_counts.
user_one_vector_normal.apply(pd.Series.value_counts).count(axis=1)

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sum,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0.00000    621
1.00000     99
dtype: int64

In [288]:
user_one_vector_fat
# Number of ingredient columns per distinct value in the profile vector.
# pd.Series.value_counts replaces the deprecated top-level pd.value_counts.
user_one_vector_fat.apply(pd.Series.value_counts).count(axis=1)

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sum,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0.00000    626
1.00000     94
dtype: int64

In [289]:
## Jaccard - Vector fat top 10

# Jaccard distance between the combined user vector and every candidate
# recipe: rows are the rows of user_one_vector_fat, columns the candidates.
result_array = cdist(user_one_vector_fat, new_recipe_db_wo_userrecipes_fat, 'jaccard')
result_w_filter_vector_fat = pd.DataFrame(
    result_array,
    index=user_one_vector_fat.index.values,
    columns=new_recipe_db_wo_userrecipes_fat.index.values,
)

# Column sums per candidate, sorted ascending (closest candidates first).
result_w_filter_vector_10_fat = pd.DataFrame(
    result_w_filter_vector_fat.sum(),
    columns=['jaccard_distance_sum'],
).sort_values(by='jaccard_distance_sum')

result_w_filter_vector_10_fat[0:10]


Unnamed: 0,jaccard_distance_sum
51850,0.88889
231396,0.89796
135885,0.9
52299,0.9
145843,0.90426
16372,0.90526
83083,0.90909
180905,0.90909
24682,0.91489
70012,0.91579


In [290]:
## Jaccard - Vector normal top 10

# Jaccard distance between the combined user vector and every candidate
# recipe: rows are the rows of user_one_vector_normal, columns the candidates.
result_array = cdist(user_one_vector_normal, new_recipe_db_wo_userrecipes_normal, 'jaccard')
result_w_filter_vector_normal = pd.DataFrame(
    result_array,
    index=user_one_vector_normal.index.values,
    columns=new_recipe_db_wo_userrecipes_normal.index.values,
)

# Column sums per candidate, sorted ascending (closest candidates first).
result_w_filter_vector_10_normal = pd.DataFrame(
    result_w_filter_vector_normal.sum(),
    columns=['jaccard_distance_sum'],
).sort_values(by='jaccard_distance_sum')

result_w_filter_vector_10_normal[0:10]


Unnamed: 0,jaccard_distance_sum
9005,0.89109
53075,0.90196
231396,0.90291
111905,0.90385
22991,0.90909
178809,0.90909
8758,0.91089
228266,0.91089
236114,0.91176
83083,0.91346


In [291]:
result_w_filter_vector_10_normal[0:10].index.to_numpy()

array([  9005,  53075, 231396, 111905,  22991, 178809,   8758, 228266,
       236114,  83083])

In [292]:
####### results

In [293]:
# userrecipes fat
sample_fat.index.to_numpy()

array([ 18397,   8757,  86628, 223596,  48921,  17496, 240522, 221304,
        16966,  52501,  62459,  86813,  14710, 222509,  51653,   8556,
        86860,  14753,  76763, 142220])

In [294]:
###### jaccard fat top 10 naiv
result_w_filter_10_fat[0:10].index.to_numpy()

array([  8547, 188706,  70513, 141125,  62256, 231808,  90160, 240773,
        18442,  14614])

In [295]:
###### jaccard kurz fat top 10
result_fat_short[1:11].index.to_numpy()


array([8547, 188706, 34361, 206120, 220716, 16372, 14525, 235948, 11678,
       25884], dtype=object)

In [296]:
###### jaccard vector fat top 10
result_w_filter_vector_10_fat[0:10].index.to_numpy()

array([ 51850, 231396, 135885,  52299, 145843,  16372,  83083, 180905,
        24682,  70012])

In [297]:
# userrecipes normal
sample_normal.index.to_numpy()

array([ 24509,  27946, 109782,  11720,   8949, 140135,  16741,  30794,
       220214,  15225,   8549,  22364, 199382, 217899,  24080,  14622,
       230735,  21140,  36994,   8994])

In [298]:
###### jaccard normal top 10 naiv
result_w_filter_10_normal[0:10].index.to_numpy()

array([ 15502,  65671, 178809,  21699, 232458,  82693,  20456,  21643,
        86047,  47006])

In [299]:
###### jaccard kurz normal top 10
result_normal_short[1:11].index.to_numpy()


array([15502, 14773, 45361, 178809, 86047, 23998, 139948, 19400, 82693,
       23711], dtype=object)

In [300]:
###### jaccard vector normal top 10
result_w_filter_vector_10_normal[0:10].index.to_numpy()

array([  9005,  53075, 231396, 111905,  22991, 178809,   8758, 228266,
       236114,  83083])

In [301]:
##### random top 10
subset_random_top10.index.to_numpy()

array([ 26607, 165079,  62423, 220149,  74037,  60022,  54215,  11733,
        14618, 152156])

In [302]:
nutrition_db2.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0
mean,147.65186,437.44356,206.11989,29.13837,97.10705,2.59745,22.90221,60.52532,3.15292,51.41953,12.44388,532.06962,28.05446,8.74674,851.15449,6.7255,0.30428,1169.01319,0.50992,16.28996
std,160.00437,183.08939,125.84039,22.82783,60.47445,2.51004,13.98227,59.87693,2.44362,29.11251,6.8939,267.49221,12.01243,6.80052,643.6215,8.81425,0.30165,2322.11124,0.31208,24.36018
min,2.247,51.34326,1.323,0.0,0.0,0.0,0.147,0.0,0.19784,0.99745,0.03511,21.35641,0.7505,0.01992,5.20206,0.0,0.00055,0.0,0.00497,0.0
25%,41.91497,302.9718,116.5374,10.22622,60.372,0.74534,12.9486,15.88967,1.81434,33.22178,7.40249,345.4748,19.97439,3.62848,418.6246,1.76465,0.10514,236.8913,0.2845,2.15387
50%,83.1685,407.2476,182.7628,24.77198,85.12875,1.94552,20.30698,38.70226,2.76635,45.93,11.16012,480.7832,26.86635,7.16552,734.007,3.90585,0.19772,581.4108,0.4463,6.66049
75%,193.6033,541.5573,271.8885,43.2574,126.737,3.66472,30.20983,88.10015,3.93263,63.26534,16.56103,654.2805,34.32086,12.13782,1093.027,7.83809,0.39695,1113.579,0.68318,19.57264
max,1264.326,1828.192,989.103,236.7205,708.85,27.83175,109.9003,519.5359,55.89075,590.3922,47.57578,1931.488,99.72639,59.31002,7322.289,104.8181,2.35457,38664.7,3.83736,270.4131


In [303]:
# Subset of nutrition_db2 restricted to nine selected nutrient columns.
selected_nutrients = [
    'Calories', 'Calories from Fat', 'Carbohydrates',
    'Fat', 'Saturated Fat', 'Protein',
    'Calcium', 'Magnesium', 'Iron',
]
new_df = nutrition_db2[selected_nutrients]

In [304]:
new_df.describe()

name,Calories,Calories from Fat,Carbohydrates,Fat,Saturated Fat,Protein,Calcium,Magnesium,Iron
count,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0
mean,437.44356,206.11989,29.13837,22.90221,8.74674,28.05446,147.65186,51.41953,3.15292
std,183.08939,125.84039,22.82783,13.98227,6.80052,12.01243,160.00437,29.11251,2.44362
min,51.34326,1.323,0.0,0.147,0.01992,0.7505,2.247,0.99745,0.19784
25%,302.9718,116.5374,10.22622,12.9486,3.62848,19.97439,41.91497,33.22178,1.81434
50%,407.2476,182.7628,24.77198,20.30698,7.16552,26.86635,83.1685,45.93,2.76635
75%,541.5573,271.8885,43.2574,30.20983,12.13782,34.32086,193.6033,63.26534,3.93263
max,1828.192,989.103,236.7205,109.9003,59.31002,99.72639,1264.326,590.3922,55.89075


In [305]:
# Reference tables used for the target values:
#https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t4/?report=objectonly
#https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t1/?report=objectonly
#https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t5/?report=objectonly
# Basis: values for men aged 31 - 50, per day, divided by 3 meals,
# adapted to a keto (low-carb) diet plan https://www.healthline.com/nutrition/keto-diet-meal-plan-and-menu#bottom-line


# Empty frame that has the same columns as nutrition_db2.
recommenden_nut_low_carb = pd.DataFrame(data=None, columns=nutrition_db2.columns, index=None)


# Single row labelled 'index' holding the target amount per nutrient.
# NOTE(review): the values are assigned by POSITION, so the order of this list
# must match the column order of nutrition_db2 (the order shown by
# nutrition_db2.describe()); the per-line labels below are comments only.
# NOTE(review): some targets (e.g. Potassium 1, Sodium 0.5) are far below the
# column means shown by nutrition_db2.describe() (about 532 and 851) -- the
# units may differ from the data set; confirm.
recommenden_nut_low_carb.loc['index'] = [
    # Calcium
    266.67,
    # Calories
    600,
    # Calories from Fat
    0,
    # Carbohydrates
    5,
    # Cholesterol
    0,
    # Dietary Fiber
    5,
    # Fat (the original comment also carried the value 16.67 --
    # presumably the figure before the keto adaptation; TODO confirm)
    50,
    # Folate
    106.67, 
    # Iron
    2,
    # Magnesium
    116.67, 
    # Niacin Equivalents
    4,
    # Potassium
    1,
    # Protein
    30,
    # Saturated Fat
    1,
    # Sodium
    0.5,
    # Sugars
    1,
    # Thiamin
    0.3, 
    # Vitamin A - IU
    208, 
    # Vitamin B6
    0.36, 
    # Vitamin C
    25] 

In [306]:
# Per-nutrient weight vectors for the weighted distance computation.
# Both lists have one entry per nutrient (20 in total), in this order:
# Calcium, Calories, Calories from Fat, Carbohydrates, Cholesterol,
# Dietary Fiber, Fat, Folate, Iron, Magnesium, Niacin Equivalents, Potassium,
# Protein, Saturated Fat, Sodium, Sugars, Thiamin, Vitamin A - IU, Vitamin B6,
# Vitamin C.

# Neutral weighting: every nutrient counts the same.
no_weight = [1] * 20

# Low-carb weighting: Carbohydrates and Fat carry by far the largest weights,
# followed by Dietary Fiber / Protein and Calcium / Iron / Magnesium.
# The names are labels for the reader only; the list is purely positional.
weighted_carbs = [
    weight
    for _nutrient, weight in (
        ("Calcium", 10),
        ("Calories", 1),
        ("Calories from Fat", 1),
        ("Carbohydrates", 100000),
        ("Cholesterol", 1),
        ("Dietary Fiber", 100),
        ("Fat", 10000),
        ("Folate", 1),
        ("Iron", 10),
        ("Magnesium", 10),
        ("Niacin Equivalents", 1),
        ("Potassium", 1),
        ("Protein", 100),
        ("Saturated Fat", 1),
        ("Sodium", 1),
        ("Sugars", 1),
        ("Thiamin", 1),
        ("Vitamin A - IU", 1),
        ("Vitamin B6", 1),
        ("Vitamin C", 1),
    )
]



In [307]:
# First 100 rows of the vector-based result, with each row's position (0..99)
# stored in a 'pos' column.
result_w_filter_normal_100_vector = (
    result_w_filter_vector_10_normal[0:100]
    .assign(pos=lambda top: np.arange(len(top)))
    # Move the index into a column called 'index' and back again, so that the
    # resulting index carries the name 'index'.
    .reset_index()
    .set_index('index')
)

In [308]:
# First 100 rows of the naive result, with each row's position (0..99) stored
# in a 'pos' column.
result_w_filter_normal_100 = (
    result_w_filter_10_normal[0:100]
    .assign(pos=lambda top: np.arange(len(top)))
    # Move the index into a column called 'index' and back again, so that the
    # resulting index carries the name 'index'.
    .reset_index()
    .set_index('index')
)

In [309]:
#  ------ Build and clean the nutrition frame for the naive top 100

# Mask over the long-format nutrition rows: keep rows whose recipe id appears
# in the index of result_w_filter_normal_100.
id_overlap_mask = nutritions['id'].isin(result_w_filter_normal_100.index.to_numpy())

# Long-format nutrition rows of the selected recipes only.
nutritions_filt = nutritions.loc[id_overlap_mask]

# One row per recipe id, one column per nutrient name.
# NOTE(review): the reset_index/set_index round trip is kept from the original;
# it may re-infer the dtype of the id index, so check before removing it.
nutrition_normal_naiv = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
    .droplevel(0, axis=1)  # drop the outer 'amount' column level
    .dropna()              # discard recipes with any missing nutrient
)

In [397]:
# Weighted Euclidean distance (Minkowski, p=2, weights weighted_carbs) between
# the low-carb target row and every recipe in nutrition_normal_naiv.
# Columns are matched by position, not by name.
result_array = cdist(
    recommenden_nut_low_carb,
    nutrition_normal_naiv,
    metric='minkowski',
    p=2,
    w=weighted_carbs,
)
euclid_distance = pd.DataFrame(
    result_array,
    index=recommenden_nut_low_carb.index.values,
    columns=nutrition_normal_naiv.index.values,
)

# One distance per recipe id, ascending (closest to the target first).
euclid_distance_sum_normal_naiv = (
    euclid_distance
    .sum()
    .to_frame('euclid_distance_sum')
    .sort_values(by='euclid_distance_sum')
)
euclid_distance_sum_normal_naiv.head(10)

Unnamed: 0,euclid_distance_sum
34957,1233.45803
229277,1798.47782
15005,1909.37895
228363,2280.15207
8529,2721.79205
234803,2815.07624
54839,3115.41086
16628,3125.59868
15196,3263.59136
232326,3280.71921


In [398]:
# Join the nutrition distances with the naive top-100 frame on their indexes
# (default inner join: only ids present in both frames are kept).
jaccard_euclid_joined_normal_naiv = euclid_distance_sum_normal_naiv.merge(
    result_w_filter_normal_100,
    left_index=True,
    right_index=True,
)

In [399]:
jaccard_euclid_joined_normal_naiv[0:10]

Unnamed: 0,euclid_distance_sum,jaccard_distance_sum,pos
34957,1233.45803,0.95408,71
229277,1798.47782,0.95382,68
15005,1909.37895,0.95566,95
228363,2280.15207,0.956,99
8529,2721.79205,0.95555,92
234803,2815.07624,0.9521,51
54839,3115.41086,0.95405,70
16628,3125.59868,0.95426,74
15196,3263.59136,0.94907,31
232326,3280.71921,0.95581,96


In [400]:
#  ------ Build and clean the nutrition frame for the naive top 10

# Mask over the long-format nutrition rows: keep rows whose recipe id is among
# the first 10 index values of jaccard_euclid_joined_normal_naiv.
id_overlap_mask = nutritions['id'].isin(jaccard_euclid_joined_normal_naiv[0:10].index.to_numpy())

# Long-format nutrition rows of the selected recipes only.
nutritions_filt = nutritions.loc[id_overlap_mask]

# One row per recipe id, one column per nutrient name.
# NOTE(review): the reset_index/set_index round trip is kept from the original;
# it may re-infer the dtype of the id index, so check before removing it.
nutrition_top10_normal_naiv = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
    .droplevel(0, axis=1)  # drop the outer 'amount' column level
    .dropna()              # discard recipes with any missing nutrient
)

In [445]:
nutrition_top10_normal_naiv.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,106.87714,482.33193,296.96785,6.24093,144.33067,0.73464,32.99643,18.75976,2.5489,43.9174,18.1984,478.9616,38.199,12.23374,577.98494,1.7782,0.16193,686.92352,0.70366,6.22223
std,96.82942,163.5059,111.3513,3.09023,56.93486,0.27223,12.37237,8.78879,1.09965,14.35581,7.73815,88.47135,10.94609,5.39099,388.63266,2.0924,0.17771,449.94769,0.20658,6.86794
min,16.1916,295.9387,170.0177,1.62752,75.978,0.33416,18.89085,5.81638,1.28493,25.5137,8.07638,298.312,22.41871,7.30422,61.91653,0.34573,0.00889,58.64625,0.40378,1.30604
25%,32.58996,375.88225,221.3296,4.42001,100.91009,0.59805,24.59217,13.75998,1.59378,33.82286,12.66763,433.022,30.76304,8.35387,231.14497,0.64539,0.08949,391.12445,0.55453,1.53589
50%,50.84447,453.6113,269.8317,5.50791,133.2094,0.72191,29.9813,15.75412,2.56422,42.00024,16.18116,490.2364,37.6632,10.06997,628.30975,1.21789,0.11951,700.7361,0.67594,4.04875
75%,187.00457,561.88357,364.77368,8.45709,176.63625,0.80505,40.53041,21.66105,3.02987,51.23097,23.09417,531.8824,42.32862,14.07311,859.29552,1.94705,0.17052,946.4348,0.82346,6.0093
max,275.6205,835.7366,520.6937,11.35139,252.1156,1.15985,57.85485,35.04619,4.75413,71.06262,31.27942,613.7807,58.66879,21.68964,1115.723,7.40242,0.63653,1349.647,1.04302,22.88027


In [402]:
#nutrition_top10_normal_naiv.describe()

In [403]:
nutrition_top10_normal_naiv.index.to_numpy()

array([  8529,  15005,  15196,  16628,  34957,  54839, 228363, 229277,
       232326, 234803])

In [404]:
new_df = nutrition_top10_normal_naiv[['Calories', 'Calories from Fat', 'Carbohydrates', 'Fat', 'Saturated Fat', 'Protein', 'Calcium', 'Magnesium', 'Iron' ]]

In [446]:
new_df = nutrition_top10_normal_naiv[['Calories', 'Carbohydrates', 'Fat', 'Protein', 'Calcium', 'Magnesium', 'Iron' ]]

In [447]:
new_df.describe()

name,Calories,Carbohydrates,Fat,Protein,Calcium,Magnesium,Iron
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,482.33193,6.24093,32.99643,38.199,106.87714,43.9174,2.5489
std,163.5059,3.09023,12.37237,10.94609,96.82942,14.35581,1.09965
min,295.9387,1.62752,18.89085,22.41871,16.1916,25.5137,1.28493
25%,375.88225,4.42001,24.59217,30.76304,32.58996,33.82286,1.59378
50%,453.6113,5.50791,29.9813,37.6632,50.84447,42.00024,2.56422
75%,561.88357,8.45709,40.53041,42.32862,187.00457,51.23097,3.02987
max,835.7366,11.35139,57.85485,58.66879,275.6205,71.06262,4.75413


In [406]:
result_w_filter_normal_100_short = result_normal_short[1:101].index.to_numpy().copy()


In [407]:
#  ------ Build and clean the nutrition frame for the "short" result

# Mask over the long-format nutrition rows: keep rows whose recipe id appears
# in the index of rows 1..100 of result_normal_short (row 0 is skipped).
id_overlap_mask = nutritions['id'].isin(result_normal_short[1:101].index.to_numpy())

# Long-format nutrition rows of the selected recipes only.
nutritions_filt = nutritions.loc[id_overlap_mask]

# One row per recipe id, one column per nutrient name.
# NOTE(review): the reset_index/set_index round trip is kept from the original;
# it may re-infer the dtype of the id index, so check before removing it.
nutrition_normal_short = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
    .droplevel(0, axis=1)  # drop the outer 'amount' column level
    .dropna()              # discard recipes with any missing nutrient
)

In [408]:
# Weighted Euclidean distance (Minkowski, p=2, weights weighted_carbs) between
# the low-carb target row and every recipe in nutrition_normal_short.
# Columns are matched by position, not by name.
result_array = cdist(
    recommenden_nut_low_carb,
    nutrition_normal_short,
    metric='minkowski',
    p=2,
    w=weighted_carbs,
)
euclid_distance = pd.DataFrame(
    result_array,
    index=recommenden_nut_low_carb.index.values,
    columns=nutrition_normal_short.index.values,
)

# One distance per recipe id, ascending (closest to the target first).
euclid_distance_sum_normal_short = (
    euclid_distance
    .sum()
    .to_frame('euclid_distance_sum')
    .sort_values(by='euclid_distance_sum')
)
euclid_distance_sum_normal_short.head(10)

Unnamed: 0,euclid_distance_sum
26595,2009.54974
60014,2677.20928
103067,2697.75648
8529,2721.79205
8647,2813.57577
22538,3177.01471
222850,3190.28378
15196,3263.59136
16285,3329.39161
93168,3347.55653


In [409]:
#  ------ Build and clean the nutrition frame for the "short" top 10

# Mask over the long-format nutrition rows: keep rows whose recipe id is among
# the first 10 index values of euclid_distance_sum_normal_short.
id_overlap_mask = nutritions['id'].isin(euclid_distance_sum_normal_short[0:10].index.to_numpy())

# Long-format nutrition rows of the selected recipes only.
nutritions_filt = nutritions.loc[id_overlap_mask]

# One row per recipe id, one column per nutrient name.
# NOTE(review): the reset_index/set_index round trip is kept from the original;
# it may re-infer the dtype of the id index, so check before removing it.
nutrition_top10_normal_short = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
    .droplevel(0, axis=1)  # drop the outer 'amount' column level
    .dropna()              # discard recipes with any missing nutrient
)

In [410]:
nutrition_top10_normal_short.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,68.80492,447.48073,272.40118,4.59262,139.86272,0.54333,30.2668,18.43878,2.7067,39.27815,15.33525,454.22718,36.67739,11.78398,541.7687,0.81067,0.12733,464.28478,0.60538,3.20152
std,86.8675,177.52774,116.64413,4.32238,54.57713,0.43692,12.96046,9.33691,1.24197,15.62335,8.34864,160.23708,13.6839,4.94432,433.19612,1.01743,0.08008,377.72088,0.34964,3.09538
min,12.927,327.2985,169.6445,0.00852,74.542,0.0,18.84939,7.90259,0.85171,16.818,4.50316,218.968,18.92661,7.38248,49.399,0.00852,0.00245,0.793,0.20433,0.0
25%,21.91103,337.5829,204.19575,0.99506,109.20473,0.1985,22.68842,12.243,1.8097,28.98001,9.99713,352.547,30.34238,8.38699,214.8854,0.17677,0.0726,119.6934,0.36688,0.76822
50%,34.58058,382.14785,231.032,3.00389,125.3364,0.45514,25.67022,14.8928,2.50389,35.83729,12.18893,443.75495,33.06036,9.85059,450.49465,0.54773,0.11704,415.0752,0.5392,2.56657
75%,45.81831,414.17195,255.51118,8.32624,140.67927,0.74923,28.39013,24.28613,3.60548,49.37852,21.50244,525.28398,37.59506,13.68141,821.03265,0.90868,0.18861,830.99608,0.72379,5.29234
max,275.6205,835.7366,520.6937,11.35139,252.1156,1.34991,57.85485,35.04619,4.74324,71.06262,31.27942,767.248,61.8999,21.41816,1310.922,3.44844,0.26951,953.9421,1.29291,9.42897


In [411]:
nutrition_top10_normal_short.index.to_numpy()

array([  8529,   8647,  15196,  16285,  22538,  26595,  60014,  93168,
       103067, 222850])

In [412]:
new_df = nutrition_top10_normal_short[['Calories', 'Calories from Fat', 'Carbohydrates', 'Fat', 'Saturated Fat', 'Protein', 'Calcium', 'Magnesium', 'Iron' ]]

In [450]:
new_df = nutrition_top10_normal_short[['Calories', 'Carbohydrates', 'Fat', 'Protein', 'Calcium', 'Magnesium', 'Iron' ]]

In [451]:
new_df.describe()

name,Calories,Carbohydrates,Fat,Protein,Calcium,Magnesium,Iron
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,447.48073,4.59262,30.2668,36.67739,68.80492,39.27815,2.7067
std,177.52774,4.32238,12.96046,13.6839,86.8675,15.62335,1.24197
min,327.2985,0.00852,18.84939,18.92661,12.927,16.818,0.85171
25%,337.5829,0.99506,22.68842,30.34238,21.91103,28.98001,1.8097
50%,382.14785,3.00389,25.67022,33.06036,34.58058,35.83729,2.50389
75%,414.17195,8.32624,28.39013,37.59506,45.81831,49.37852,3.60548
max,835.7366,11.35139,57.85485,61.8999,275.6205,71.06262,4.74324


In [414]:
# First 100 rows of the vector-based result, with each row's position (0..99)
# stored in a 'pos' column.
result_w_filter_normal_100_vector = (
    result_w_filter_vector_10_normal[0:100]
    .assign(pos=lambda top: np.arange(len(top)))
    # Move the index into a column called 'index' and back again, so that the
    # resulting index carries the name 'index'.
    .reset_index()
    .set_index('index')
)

In [415]:
#  ------ Build and clean the nutrition frame for the vector-based top 100

# Mask over the long-format nutrition rows: keep rows whose recipe id appears
# in the index of result_w_filter_normal_100_vector.
id_overlap_mask = nutritions['id'].isin(result_w_filter_normal_100_vector.index.to_numpy())

# Long-format nutrition rows of the selected recipes only.
nutritions_filt = nutritions.loc[id_overlap_mask]

# One row per recipe id, one column per nutrient name.
# NOTE(review): the reset_index/set_index round trip is kept from the original;
# it may re-infer the dtype of the id index, so check before removing it.
nutrition_normal_vector = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
    .droplevel(0, axis=1)  # drop the outer 'amount' column level
    .dropna()              # discard recipes with any missing nutrient
)

In [416]:
# Weighted Euclidean distance (Minkowski, p=2, weights weighted_carbs) between
# the low-carb target row and every recipe in nutrition_normal_vector.
# Columns are matched by position, not by name.
result_array = cdist(
    recommenden_nut_low_carb,
    nutrition_normal_vector,
    metric='minkowski',
    p=2,
    w=weighted_carbs,
)
euclid_distance = pd.DataFrame(
    result_array,
    index=recommenden_nut_low_carb.index.values,
    columns=nutrition_normal_vector.index.values,
)

# One distance per recipe id, ascending (closest to the target first).
euclid_distance_sum_normal_vector = (
    euclid_distance
    .sum()
    .to_frame('euclid_distance_sum')
    .sort_values(by='euclid_distance_sum')
)
euclid_distance_sum_normal_vector.head(10)

Unnamed: 0,euclid_distance_sum
166292,1476.6534
228363,2280.15207
8529,2721.79205
87497,2850.20461
54839,3115.41086
83421,3305.17612
68860,3350.97467
20606,3420.04183
17496,3581.64293
223047,3631.94865


In [417]:
#  ------ Build and clean the nutrition frame for the vector-based top 10

# Mask over the long-format nutrition rows: keep rows whose recipe id is among
# the first 10 index values of euclid_distance_sum_normal_vector.
id_overlap_mask = nutritions['id'].isin(euclid_distance_sum_normal_vector[0:10].index.to_numpy())

# Long-format nutrition rows of the selected recipes only.
nutritions_filt = nutritions.loc[id_overlap_mask]

# One row per recipe id, one column per nutrient name.
# NOTE(review): the reset_index/set_index round trip is kept from the original;
# it may re-infer the dtype of the id index, so check before removing it.
nutrition_top10_normal_vector = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
    .droplevel(0, axis=1)  # drop the outer 'amount' column level
    .dropna()              # discard recipes with any missing nutrient
)

In [418]:
nutrition_top10_normal_vector.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,58.25634,460.14029,287.45059,7.16126,124.91453,0.89356,31.93895,23.30154,2.6578,43.33581,14.69293,516.83587,33.87165,9.97761,535.62256,3.15896,0.32511,743.66754,0.64256,12.10611
std,52.39662,168.32816,128.14339,4.55408,63.65503,0.39959,14.23815,17.12503,1.47214,18.813,7.15917,240.62088,12.43708,5.33965,330.02773,3.06189,0.39029,388.97786,0.34912,13.78661
min,23.03594,260.3773,140.9687,1.62752,38.74166,0.34361,15.66319,3.34426,0.93527,18.25972,6.15597,163.2268,16.05327,3.62848,107.2473,0.34573,0.04188,164.9133,0.11614,0.32614
25%,24.79795,343.69515,193.1802,3.22905,90.9725,0.75806,21.46447,12.88401,1.75519,27.87755,9.75177,386.66352,23.55742,6.19428,393.16147,1.05003,0.11545,482.58132,0.35945,1.89688
50%,30.76448,435.63765,260.1507,7.31095,119.8735,0.84504,28.90564,14.45852,2.22705,42.00024,13.23853,489.73535,35.84529,8.55771,451.62405,1.54642,0.14386,728.8506,0.67594,6.69192
75%,82.08885,540.1074,336.57962,10.4685,128.6417,1.08796,37.39774,34.97927,3.43663,58.27775,17.17041,657.3797,40.21294,12.92636,612.02082,6.07483,0.33072,922.42015,0.8348,17.51722
max,177.3457,835.7366,520.6937,14.76202,252.1156,1.74117,57.85485,57.63073,5.25145,71.06262,31.27942,915.8073,58.66879,21.41816,1115.723,7.65177,1.3176,1349.647,1.16962,37.63184


In [419]:
nutrition_top10_normal_vector.index.to_numpy()

array([  8529,  17496,  20606,  54839,  68860,  83421,  87497, 166292,
       223047, 228363])

In [420]:
new_df = nutrition_top10_normal_vector[['Calories', 'Calories from Fat', 'Carbohydrates', 'Fat', 'Saturated Fat', 'Protein', 'Calcium', 'Magnesium', 'Iron' ]]

In [452]:
new_df = nutrition_top10_normal_vector[['Calories', 'Carbohydrates', 'Fat', 'Protein', 'Calcium', 'Magnesium', 'Iron' ]]

In [453]:
new_df.describe()

name,Calories,Carbohydrates,Fat,Protein,Calcium,Magnesium,Iron
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,460.14029,7.16126,31.93895,33.87165,58.25634,43.33581,2.6578
std,168.32816,4.55408,14.23815,12.43708,52.39662,18.813,1.47214
min,260.3773,1.62752,15.66319,16.05327,23.03594,18.25972,0.93527
25%,343.69515,3.22905,21.46447,23.55742,24.79795,27.87755,1.75519
50%,435.63765,7.31095,28.90564,35.84529,30.76448,42.00024,2.22705
75%,540.1074,10.4685,37.39774,40.21294,82.08885,58.27775,3.43663
max,835.7366,14.76202,57.85485,58.66879,177.3457,71.06262,5.25145


In [422]:
#  ------ Build and clean the nutrition frame for the full naive result

# Mask over the long-format nutrition rows: keep rows whose recipe id appears
# in the index of result_w_filter_10_normal.
id_overlap_mask = nutritions['id'].isin(result_w_filter_10_normal.index.to_numpy())

# Long-format nutrition rows of the selected recipes only.
nutritions_filt = nutritions.loc[id_overlap_mask]

# One row per recipe id, one column per nutrient name.
# NOTE(review): the reset_index/set_index round trip is kept from the original;
# it may re-infer the dtype of the id index, so check before removing it.
random_nut_normal_naiv = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
    .droplevel(0, axis=1)  # drop the outer 'amount' column level
    .dropna()              # discard recipes with any missing nutrient
)


In [423]:
asd = random_nut_normal_naiv.sort_values(by='Carbohydrates')

In [424]:
asd[0:10].describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,28.44493,331.86876,176.97565,0.05063,114.70602,0.01441,19.66396,8.42683,4.36869,36.17567,15.46108,402.9973,36.39433,6.78872,405.11665,0.0156,0.23332,155.01973,0.62126,0.24158
std,20.62043,175.52979,107.73843,0.06518,61.25709,0.02899,11.97093,6.59899,4.9092,15.28027,6.40235,204.26726,18.57905,5.00701,425.0868,0.03643,0.27807,186.02701,0.33419,0.47315
min,5.95,190.5575,86.2227,0.0,67.15,0.0,9.5803,1.985,0.79819,18.82775,7.12442,198.085,22.9838,1.72994,58.50833,0.0,0.05391,0.0,0.23219,0.0
25%,14.04347,227.09988,117.3418,0.00497,71.74249,0.0,13.03798,3.82825,1.34672,22.70406,9.58787,238.43688,23.90248,4.0709,160.13387,0.0,0.0614,6.94487,0.37442,0.0
50%,18.70479,236.203,131.50655,0.0122,83.84475,0.0,14.61184,5.51012,2.88077,33.84019,15.31916,350.73045,26.22061,5.61383,202.2324,0.0036,0.10425,76.44315,0.5154,0.02083
75%,43.72512,412.58407,190.6126,0.08467,157.77746,0.01425,21.17918,13.34584,4.15801,49.74599,21.32081,548.3556,52.77548,7.78108,598.2415,0.00852,0.23585,305.53745,0.83336,0.12693
max,63.28278,710.579,450.4628,0.16583,218.69,0.08948,50.05142,20.586,16.5344,59.27167,24.10592,767.248,66.38522,19.09217,1418.206,0.11844,0.84263,494.7276,1.29291,1.44434


In [425]:
random_nut_normal = nutrition_db2.sort_values(by='Carbohydrates')

In [426]:
random_nut_normal[0:10].index.to_numpy()

array([ 94113,  24035,  21422, 239137,  26595,  25147,  13662, 240208,
         8690,  21749])

In [427]:
#  ------ Build and clean the nutrition frame for the first 10 rows of random_nut_normal

# Mask over the long-format nutrition rows: keep rows whose recipe id is among
# the first 10 index values of random_nut_normal as it is on entry to this cell.
id_overlap_mask = nutritions['id'].isin(random_nut_normal[0:10].index.to_numpy())

# Long-format nutrition rows of the selected recipes only.
nutritions_filt = nutritions.loc[id_overlap_mask]

# One row per recipe id, one column per nutrient name.
# This rebinds random_nut_normal: from here on it holds the pivoted frame
# built below, no longer the frame that supplied the ids above.
# NOTE(review): the reset_index/set_index round trip is kept from the original;
# it may re-infer the dtype of the id index, so check before removing it.
random_nut_normal = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
    .droplevel(0, axis=1)  # drop the outer 'amount' column level
    .dropna()              # discard recipes with any missing nutrient
)

In [428]:
random_nut_normal.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,28.44493,331.86876,176.97565,0.05063,114.70602,0.01441,19.66396,8.42683,4.36869,36.17567,15.46108,402.9973,36.39433,6.78872,405.11665,0.0156,0.23332,155.01973,0.62126,0.24158
std,20.62043,175.52979,107.73843,0.06518,61.25709,0.02899,11.97093,6.59899,4.9092,15.28027,6.40235,204.26726,18.57905,5.00701,425.0868,0.03643,0.27807,186.02701,0.33419,0.47315
min,5.95,190.5575,86.2227,0.0,67.15,0.0,9.5803,1.985,0.79819,18.82775,7.12442,198.085,22.9838,1.72994,58.50833,0.0,0.05391,0.0,0.23219,0.0
25%,14.04347,227.09988,117.3418,0.00497,71.74249,0.0,13.03798,3.82825,1.34672,22.70406,9.58787,238.43688,23.90248,4.0709,160.13387,0.0,0.0614,6.94487,0.37442,0.0
50%,18.70479,236.203,131.50655,0.0122,83.84475,0.0,14.61184,5.51012,2.88077,33.84019,15.31916,350.73045,26.22061,5.61383,202.2324,0.0036,0.10425,76.44315,0.5154,0.02083
75%,43.72512,412.58407,190.6126,0.08467,157.77746,0.01425,21.17918,13.34584,4.15801,49.74599,21.32081,548.3556,52.77548,7.78108,598.2415,0.00852,0.23585,305.53745,0.83336,0.12693
max,63.28278,710.579,450.4628,0.16583,218.69,0.08948,50.05142,20.586,16.5344,59.27167,24.10592,767.248,66.38522,19.09217,1418.206,0.11844,0.84263,494.7276,1.29291,1.44434


In [429]:
asd[0:50]

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
24035,20.09839,243.25,132.2323,0.0,82.3775,0.0,14.69248,1.985,8.76073,34.13264,11.2662,347.7757,25.87448,4.8282,714.588,0.0,0.84263,6.9475,0.55679,0.09925
94113,5.95,227.0917,113.7937,0.0,71.39999,0.0,12.64375,6.94167,1.82467,23.8,8.41925,358.9833,26.56675,4.9385,58.50833,0.0,0.07933,0.0,0.34708,0.0
21422,63.28278,536.2064,252.7438,0.00378,207.1225,0.0,28.08264,16.71306,4.22666,59.27167,24.10592,662.6006,66.38522,9.94302,197.2328,0.00378,0.13487,157.5758,0.96802,0.0
239137,15.165,227.1244,127.9861,0.00851,95.10313,0.0,14.22068,3.46563,0.79819,22.33875,15.03951,198.085,23.69579,8.05543,719.33,0.00851,0.05391,370.5056,0.45643,0.0
26595,48.208,710.579,450.4628,0.00852,218.69,0.0,50.05142,20.586,3.95205,51.804,23.22978,767.248,61.8999,19.09217,207.232,0.00852,0.26951,354.858,1.29291,0.0
25147,30.2765,269.6862,169.1788,0.01588,85.312,0.0,18.79765,5.8725,1.50648,18.82775,9.02842,353.6852,23.38481,6.95803,196.0045,0.0,0.63369,6.944,0.23219,0.6944
13662,57.96445,460.2167,197.7572,0.05369,178.6689,0.03567,21.97302,15.48056,3.93688,54.69722,22.24237,611.4797,61.23595,6.28917,148.177,0.00341,0.1242,5.7525,0.89435,0.04167
240208,12.52339,190.5575,86.2227,0.095,67.15,0.019,9.5803,3.16,16.5344,43.57196,15.59881,215.522,24.52255,1.72994,1418.206,0.0,0.0553,16.59,0.474,0.0
8690,13.66963,229.156,130.7808,0.15506,72.77,0.08948,14.5312,5.14774,1.29347,19.76499,7.12442,207.412,22.9838,3.81847,142.6859,0.01336,0.05543,136.2963,0.34048,0.13616
21749,17.31119,224.8197,108.5983,0.16583,68.46621,0.0,12.06648,4.91614,0.85337,33.54774,18.55613,307.1815,27.39408,2.23425,249.202,0.11844,0.0843,494.7276,0.65039,1.44434


In [430]:
asd[0:10].index.to_numpy()

array([ 24035,  94113,  21422, 239137,  26595,  25147,  13662, 240208,
         8690,  21749])

In [431]:
random_nut_normal.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,28.44493,331.86876,176.97565,0.05063,114.70602,0.01441,19.66396,8.42683,4.36869,36.17567,15.46108,402.9973,36.39433,6.78872,405.11665,0.0156,0.23332,155.01973,0.62126,0.24158
std,20.62043,175.52979,107.73843,0.06518,61.25709,0.02899,11.97093,6.59899,4.9092,15.28027,6.40235,204.26726,18.57905,5.00701,425.0868,0.03643,0.27807,186.02701,0.33419,0.47315
min,5.95,190.5575,86.2227,0.0,67.15,0.0,9.5803,1.985,0.79819,18.82775,7.12442,198.085,22.9838,1.72994,58.50833,0.0,0.05391,0.0,0.23219,0.0
25%,14.04347,227.09988,117.3418,0.00497,71.74249,0.0,13.03798,3.82825,1.34672,22.70406,9.58787,238.43688,23.90248,4.0709,160.13387,0.0,0.0614,6.94487,0.37442,0.0
50%,18.70479,236.203,131.50655,0.0122,83.84475,0.0,14.61184,5.51012,2.88077,33.84019,15.31916,350.73045,26.22061,5.61383,202.2324,0.0036,0.10425,76.44315,0.5154,0.02083
75%,43.72512,412.58407,190.6126,0.08467,157.77746,0.01425,21.17918,13.34584,4.15801,49.74599,21.32081,548.3556,52.77548,7.78108,598.2415,0.00852,0.23585,305.53745,0.83336,0.12693
max,63.28278,710.579,450.4628,0.16583,218.69,0.08948,50.05142,20.586,16.5344,59.27167,24.10592,767.248,66.38522,19.09217,1418.206,0.11844,0.84263,494.7276,1.29291,1.44434


In [432]:
# Draw 8 recipes whose fat content lies in [10, 15] (both bounds inclusive).
# The sample is seeded so that this cell and the statistics derived from it
# are reproducible on re-run; seed 0 matches the other seeded sample in this
# notebook.
subset_fat_asd = (
    nutrition_fat_naiv[nutrition_fat_naiv['Fat'].between(10, 15)]
    .sample(n=8, random_state=0)
    .copy()
)

In [433]:
subset_fat_asd.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,162.80304,334.50117,123.01987,32.78413,67.18158,4.00545,13.66888,91.52332,3.08246,48.43924,8.95656,608.06612,20.27056,5.84605,617.81806,6.73912,0.2728,1990.47048,0.43365,32.21983
std,118.70006,86.45515,11.0902,19.87579,11.43892,2.94701,1.23224,70.15791,1.11092,20.77865,2.33444,464.87719,4.59847,1.50548,349.37536,4.284,0.21137,4074.32996,0.31372,35.18379
min,42.2215,200.5096,96.83125,8.50724,53.31907,0.18479,10.75903,19.02514,1.61694,22.19329,6.48622,202.2782,11.25829,3.22836,132.1991,2.23212,0.06435,124.5866,0.16549,0.24869
25%,88.1635,287.39178,122.28388,19.57224,55.15042,2.41122,13.5871,54.77253,2.16691,37.15158,7.09761,287.17927,18.83971,5.27881,422.6006,3.92589,0.1452,405.76583,0.24251,1.50123
50%,117.79295,338.0464,127.1827,27.61447,71.05518,2.62511,14.13142,63.03995,3.1824,47.48286,8.56762,491.94015,21.3981,5.76511,665.69385,6.00521,0.20866,592.38995,0.35662,19.70184
75%,212.52058,394.0836,128.54217,54.38408,72.83828,6.15036,14.28247,106.04683,3.77064,54.7819,10.08627,733.3768,22.93564,6.74809,836.35553,8.35532,0.3526,786.9942,0.50485,61.95011
max,348.3654,451.0129,131.3139,55.97997,83.48,8.341,14.59043,232.0606,4.67316,88.1089,13.20469,1621.572,25.49099,8.03736,1054.259,15.31507,0.71992,12034.2,1.12891,85.117


In [434]:
new_df = subset_fat_asd[['Calories', 'Calories from Fat', 'Carbohydrates', 'Fat', 'Saturated Fat', 'Protein', 'Calcium', 'Magnesium', 'Iron' ]]

In [435]:
new_df.describe()

name,Calories,Calories from Fat,Carbohydrates,Fat,Saturated Fat,Protein,Calcium,Magnesium,Iron
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,334.50117,123.01987,32.78413,13.66888,5.84605,20.27056,162.80304,48.43924,3.08246
std,86.45515,11.0902,19.87579,1.23224,1.50548,4.59847,118.70006,20.77865,1.11092
min,200.5096,96.83125,8.50724,10.75903,3.22836,11.25829,42.2215,22.19329,1.61694
25%,287.39178,122.28388,19.57224,13.5871,5.27881,18.83971,88.1635,37.15158,2.16691
50%,338.0464,127.1827,27.61447,14.13142,5.76511,21.3981,117.79295,47.48286,3.1824
75%,394.0836,128.54217,54.38408,14.28247,6.74809,22.93564,212.52058,54.7819,3.77064
max,451.0129,131.3139,55.97997,14.59043,8.03736,25.49099,348.3654,88.1089,4.67316


In [436]:
#  ------ Build and clean the nutrition frame for 10 randomly drawn recipes

# Mask over the long-format nutrition rows: keep rows whose recipe id appears
# in the index of a seeded 10-row sample of new_recipe_db_wo_userrecipes_normal.
id_overlap_mask = nutritions['id'].isin(
    new_recipe_db_wo_userrecipes_normal.sample(n=10, random_state=0).index.to_numpy()
)

# Long-format nutrition rows of the selected recipes only.
nutritions_filt = nutritions.loc[id_overlap_mask]

# One row per recipe id, one column per nutrient name.
# NOTE(review): the reset_index/set_index round trip is kept from the original;
# it may re-infer the dtype of the id index, so check before removing it.
nutrition_random_10_normal = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
    .droplevel(0, axis=1)  # drop the outer 'amount' column level
    .dropna()              # discard recipes with any missing nutrient
)

In [437]:
nutrition_random_10_normal.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,97.28871,435.1929,209.15793,25.31728,121.74253,2.06578,23.23977,56.75383,3.1008,50.78209,14.13487,484.92607,29.764,7.03595,979.85422,5.79723,0.21942,2887.82834,0.52851,13.16883
std,78.7334,142.26302,157.57367,19.92788,54.34168,1.77159,17.50818,48.25503,1.33899,27.54426,6.05306,260.57846,8.83564,4.18763,1096.05323,9.15937,0.10799,7527.37123,0.24171,16.89636
min,21.47987,320.1004,87.80512,2.57644,41.49303,0.06054,9.75612,6.5138,1.79438,26.28246,8.50346,262.1083,22.02634,1.88752,254.6976,0.1345,0.08308,79.34875,0.22895,0.0
25%,56.13166,365.6666,134.46865,11.1253,82.39769,0.77895,14.94096,30.26054,1.97259,33.68068,9.86245,299.20815,25.25222,3.8111,315.61507,1.63075,0.17333,298.29765,0.39468,1.48278
50%,68.26243,381.7241,178.18075,20.2482,114.26835,1.67803,19.79786,48.06017,2.68679,39.11582,12.86435,352.3334,27.10081,6.05637,710.78105,2.42703,0.21086,526.89785,0.4732,5.72612
75%,100.03805,459.5137,194.8189,31.70747,173.22925,2.70763,21.64654,69.66164,4.10871,64.17368,16.34769,703.39065,30.76011,9.76885,1103.39802,6.2242,0.25496,795.3114,0.55954,17.27151
max,264.3886,808.1388,635.5994,62.10049,194.7796,5.4829,70.62215,172.2798,5.74535,114.6201,28.49606,893.3475,53.39647,13.92243,3930.284,30.8437,0.45015,24293.95,1.03521,43.643


In [438]:
# Narrow the random-sample nutrition table to the nine nutrient columns
# that are summarised in the next cell.
new_df = nutrition_random_10_normal[[
    'Calories',
    'Calories from Fat',
    'Carbohydrates',
    'Fat',
    'Saturated Fat',
    'Protein',
    'Calcium',
    'Magnesium',
    'Iron',
]]

In [439]:
# Summary statistics for the selected nutrient columns.
new_df.describe()

name,Calories,Calories from Fat,Carbohydrates,Fat,Saturated Fat,Protein,Calcium,Magnesium,Iron
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,435.1929,209.15793,25.31728,23.23977,7.03595,29.764,97.28871,50.78209,3.1008
std,142.26302,157.57367,19.92788,17.50818,4.18763,8.83564,78.7334,27.54426,1.33899
min,320.1004,87.80512,2.57644,9.75612,1.88752,22.02634,21.47987,26.28246,1.79438
25%,365.6666,134.46865,11.1253,14.94096,3.8111,25.25222,56.13166,33.68068,1.97259
50%,381.7241,178.18075,20.2482,19.79786,6.05637,27.10081,68.26243,39.11582,2.68679
75%,459.5137,194.8189,31.70747,21.64654,9.76885,30.76011,100.03805,64.17368,4.10871
max,808.1388,635.5994,62.10049,70.62215,13.92243,53.39647,264.3886,114.6201,5.74535


In [440]:
# Show the ids (index values) of the rows that remain in the random-sample table.
nutrition_random_10_normal.index.to_numpy()

array([  8600,  16862,  26637,  27987,  31045,  75790, 142951, 158429,
       177497, 216942])

In [441]:
#  ------ Build and clean the nutrition DataFrame (user recipes)

# Flag the nutrition rows whose recipe id appears in the index of
# user_recipes_normal.
id_overlap_mask = nutritions['id'].isin(user_recipes_normal.index.to_numpy())

# Keep only the flagged nutrition rows.
nutritions_filt = nutritions.loc[id_overlap_mask]

# One row per recipe id, one column per nutrient name.
# - pivot_table already returns 'id' as the index, so the former
#   reset_index()/set_index(inplace=True) round trip was a no-op and is gone;
#   duplicate (id, name) pairs are averaged (pivot_table's default aggfunc).
# - droplevel removes the constant outer 'amount' column level.
# - dropna discards every recipe that lacks a value for any nutrient.
nutrition_user_recipes_normal = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .droplevel(0, axis=1)
    .dropna()
)

In [448]:
# Narrow the user-recipe nutrition table to the seven nutrient columns
# that are summarised in the next cell.
new_df = nutrition_user_recipes_normal[[
    'Calories',
    'Carbohydrates',
    'Fat',
    'Protein',
    'Calcium',
    'Magnesium',
    'Iron',
]]

In [449]:
# Summary statistics for the selected nutrient columns.
new_df.describe()

name,Calories,Carbohydrates,Fat,Protein,Calcium,Magnesium,Iron
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,476.6057,30.19359,26.67188,29.13993,191.78426,49.42357,2.65442
std,195.5345,22.57396,16.94325,9.20861,177.61952,22.1068,1.63424
min,180.5461,2.309,1.71964,17.08666,14.10812,20.00892,0.44957
25%,335.87992,12.73448,13.65864,22.54001,33.60625,33.04839,1.66485
50%,450.42955,25.43967,22.82371,27.91984,117.12005,44.08357,2.35468
75%,649.80575,52.06238,40.61526,33.67842,315.03085,57.96935,2.77898
max,804.3319,79.48119,54.52096,49.44593,515.9617,100.5429,7.47058


In [444]:
#  ------ Build and clean the nutrition DataFrame (first 10 filtered results)

# Flag the nutrition rows whose recipe id appears in the index of the first
# 10 rows of result_w_filter_10_normal.
# NOTE(review): presumably these are the top-ranked Jaccard recommendations,
# judging by the target variable name -- confirm against the cell that
# builds result_w_filter_10_normal.
id_overlap_mask = nutritions['id'].isin(
    result_w_filter_10_normal[0:10].index.to_numpy()
)

# Keep only the flagged nutrition rows.
nutritions_filt = nutritions.loc[id_overlap_mask]

# One row per recipe id, one column per nutrient name.
# - pivot_table already returns 'id' as the index, so the former
#   reset_index()/set_index(inplace=True) round trip was a no-op and is gone;
#   duplicate (id, name) pairs are averaged (pivot_table's default aggfunc).
# - droplevel removes the constant outer 'amount' column level.
# - dropna discards every recipe that lacks a value for any nutrient.
nutrition_jaccard_normal = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .droplevel(0, axis=1)
    .dropna()
)