In [1]:
import json

import numpy as np
import pandas as pd
import pymongo
import scipy
from bson import ObjectId, json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient as Connection
from scipy import sparse, spatial
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# scientific notation disabled form smaller numbers
pd.options.display.float_format = '{:.5f}'.format

# alles resultate anzeigen und nicht nur das letzte
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [2]:
# display multiple outputs in one row
import pandas as pd
import numpy as np
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [3]:
connection = Connection()
db = connection.recipe_db
input_data = db.recipes_test_copy

data = json.loads(json_util.dumps(input_data.find()))

In [4]:
ing = pd.DataFrame(json_normalize(data, record_path='ingredients',
                             meta='id', record_prefix='ingredients_', errors='ignore'))


nutritions = pd.DataFrame(json_normalize(data, record_path='nutritions',
                            meta=['id', 'prep_time', 'rating', 'rating_count', 'ready_in_time', 'review_count']))


In [5]:
#  ------  erstellung und data cleansing - Ingredients

# schmeiss alle zutaten raus, die weniger als 5 mal verwendet werden
# setze multiinde auf 'id' und 'ingredients_id'
ingredients = ing.set_index(['id', 'ingredients_id'])

# filtere alle Zutaten samt ihrer rezepte id, die weniger gleich 5 mal vorkommen
ingredients_eqles_5_ing = ingredients.groupby(
    'ingredients_id').filter(lambda x: len(x) <= 5)

# droppe alle rezepte, die eine Zutate besitzen, die weniger gleich 5 Mal vorkommen
ingredients_filt = ingredients.drop(ingredients_eqles_5_ing.index.get_level_values('id').values, level=0)

# drop alls rows with ingredients_id == 0
ingredients_eqal_zero = ingredients_filt[ingredients_filt.index.get_level_values('ingredients_id') == 0]
ingredients_filt = ingredients_filt.drop(ingredients_eqal_zero.index.get_level_values('id').values, level=0)


In [6]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(ingredients_filt.index.get_level_values('id').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_db = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_db.columns = nutrition_db.columns.droplevel(0)

# entferne alle NA
nutrition_db = nutrition_db.dropna()

# gleiche nochmals die ids der beiden dataframe nutrition und ingredients ab, da der nutrition dataframe noch NA Werte hatt
id_overlap_mask = ingredients_filt.index.get_level_values('id').isin(nutrition_db.index)
ingredients_db = ingredients_filt[id_overlap_mask]

# abgleich ob anzahl der indizes von nutrition und zutaten dataframe gleich sind


In [7]:


ingredients_db.reset_index(inplace=True)

recipe_db = pd.get_dummies(ingredients_db['ingredients_id']).groupby(
    ingredients_db['id']).apply(max)



In [8]:
new_ingredients = ingredients_db.copy()
#new_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index()
gramms_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index().copy()

Q1 = gramms_ingredients.quantile(0.25)
Q3 = gramms_ingredients.quantile(0.75)
IQR = Q3 - Q1

#Filter out all recipes which are outlier by their weight (gramms)
df = gramms_ingredients[~((gramms_ingredients < (Q1 - 1.5 * IQR)) |(gramms_ingredients > (Q3 + 1.5 * IQR))).any(axis=1)].copy()

#filter out recipes by weight which are not in the range 500 - 2373.59 gramms
df_start_at_fivehundret = df[df['ingredients_grams'].between(500, 2373.58225, inclusive=False)].copy()

df_start_at_fivehundret.set_index('id', inplace=True)
id_overlap_mask = nutritions['id'].isin(df_start_at_fivehundret.index.get_level_values('id').values)


# erstelle datenframe auf basis der overlapliste
nutritions_filt_gramm = nutritions.loc[id_overlap_mask]

nutrition_db2 = nutritions_filt_gramm.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

#create new nutrition db based on the above filtering
nutrition_db2.set_index('id', inplace=True)
nutrition_db2.columns = nutrition_db2.columns.droplevel(0)



In [983]:
# random
subset_random_top10 = nutrition_db2.sample(n=10).copy()

In [801]:
#select 20 random recipes which are between the Fat-range of 35-40 gramms
subset_fat = nutrition_db2[(nutrition_db2['Fat'] >= 35) & (nutrition_db2['Fat'] <= 40)].sample(n=40).copy()
subset_fat_id = subset_fat.index.to_numpy()


In [802]:
subset_normal = nutrition_db2.sample(n=40).copy()
subset_normal_id = subset_normal.index.to_numpy()

In [803]:
subset_normal.index.to_numpy()

array([ 43272,  30794, 220214,  14622, 140135,  20618,  19335,  15378,
        88495,  21557,   8720,  18416,  16663,  15225,  88086, 202951,
       166506,  16741,   8994,   8949,  45580,  11720, 109782, 199382,
         8549,  79301, 217899,  59988,  36994,  24509, 150251,  22364,
        21140, 230735,  16650,  17869,  24080,  24368,  52608,  27946])

In [847]:
subset_fat.index.to_numpy()

array([ 17496, 221304,  62459,  86860, 214478, 125921,   8613,  21352,
        51653, 164208,   8679,  16563,  16966,  48921,  11737, 149738,
        15127, 102677,  52501, 177777,  40286,   8639,  11990,  14753,
       216026, 240522, 223596, 222509,  18397,  76763,   8556,  13420,
       237240, 142220,   8757,  16565,  86813,  14710,  16091,  86628])

In [804]:
# remove recipes which are missing in nutrition df from the ingredients tecipes dataframe
mask = recipe_db.index.isin(nutrition_db2.index.values)
recipe_db = recipe_db.loc[mask]


In [805]:
subset_fat.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,194.47972,601.49387,337.78535,29.16805,150.06314,2.43257,37.53171,71.26957,3.34659,60.95864,17.4506,608.96086,36.69111,15.08782,914.33674,7.99216,0.41397,1423.86849,0.68791,14.07225
std,213.33364,116.43957,12.79348,22.88873,66.48725,2.25996,1.4215,62.37675,1.83084,27.67582,8.99566,202.87367,14.00937,4.68803,573.48789,17.13806,0.41369,1996.0442,0.30829,17.56565
min,19.9788,390.7198,315.5125,1.27936,41.7375,0.05625,35.05695,6.08,0.52164,13.02275,2.13488,287.9466,10.86362,5.39091,46.13375,0.04464,0.02608,0.5625,0.10281,0.0225
25%,52.60393,510.59465,326.04705,10.45404,112.02998,0.90838,36.22745,19.68019,2.21709,45.24325,11.43623,463.71855,25.28096,12.19102,444.32873,1.7123,0.15462,330.0625,0.50037,2.22598
50%,105.94745,579.1378,339.542,27.12899,144.70575,1.74797,37.72689,61.49586,3.11904,56.69952,16.69467,571.8311,36.28499,13.76573,859.22625,3.07298,0.28202,843.58875,0.6297,7.31479
75%,253.69303,678.84852,348.27902,36.03892,179.11353,3.00643,38.69768,106.3195,4.10491,73.77128,21.56779,758.52978,44.61936,18.97431,1207.45275,6.30719,0.45723,1388.06,0.86224,23.1403
max,916.5097,905.355,356.9908,107.0733,402.3,8.19218,39.66564,265.0017,10.33637,148.3127,47.57578,1224.399,71.14468,23.67693,2757.834,104.8181,1.87098,10316.97,1.57042,74.77966


In [806]:
subset_normal.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,195.52469,450.54937,209.88841,30.74838,99.87757,2.49977,23.32093,66.09608,2.95025,54.97015,13.18617,518.5986,28.91198,10.13333,849.94854,6.94168,0.28208,1467.9428,0.49493,12.07557
std,178.75955,207.11369,142.26635,23.08855,55.73035,2.44707,15.80737,70.35193,1.70641,33.94836,7.43658,316.73942,12.44752,8.94448,591.67582,11.69328,0.28046,3016.64498,0.31869,14.37588
min,11.51977,179.9448,15.47674,2.309,15.76667,0.02917,1.71964,3.06332,0.44957,16.60563,1.83924,165.8669,8.88312,0.41385,48.28076,0.53162,0.00254,1.525,0.09412,0.061
25%,52.73812,264.57755,101.05217,11.46596,64.63125,0.91358,11.22802,18.61364,1.627,33.04839,6.90594,332.5021,21.31413,2.80919,399.14718,2.15192,0.09614,362.35582,0.25505,3.67745
50%,117.12005,434.04795,168.3798,24.72044,78.99882,1.81291,18.70887,34.61906,2.63547,45.39443,12.82867,417.18225,27.03035,7.71233,774.1739,3.97073,0.157,689.34775,0.43285,7.62298
75%,322.86605,621.1571,294.41987,49.87391,135.51625,3.11939,32.71331,106.00048,3.90386,63.81725,16.90159,540.35833,33.35592,13.28235,1124.38225,6.48547,0.3572,1055.53925,0.66613,11.86989
max,571.601,1081.281,590.675,94.90217,289.5188,10.19315,65.63055,362.7391,7.47058,177.7982,32.7074,1722.696,73.38993,37.4432,2854.161,72.41016,1.0548,17957.94,1.74218,57.76982


In [807]:
user_recipes_fat = recipe_db[recipe_db.index.isin(subset_fat_id)]
user_recipes_normal = recipe_db[recipe_db.index.isin(subset_normal_id)]

In [1169]:
####temp#######
temp_fat_20 = [16966,  14753,  62459,  18397,   8757,  52501,   8556, 240522,
        86628, 221304, 223596,  51653,  48921, 142220,  76763, 222509,
        17496,  14710,  86860,  86813]
####temp#######
user_recipes_fat = recipe_db[recipe_db.index.isin(temp_fat_20)]


In [1124]:
user_recipes_fat

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8757,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14710,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14753,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16966,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48921,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51653,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [808]:
#top10 most common ingredients
#16421    2125 salt
#4342     1506 garlic
#4397     1412 onion
#16406    1185 ground black pepper
#16157    1016 butter
#6307      944 olive oil
#6494      786 skinless bone less chicken breast
#2496      749 water
#16238     574 grated Parmesan cheese
#16317     538 eggs


drop_id_list = [16421, 4342, 4397, 16406, 16157, 6307, 6494, 2496, 16238, 16317]





In [809]:
subset_fat_id

array([ 17496, 221304,  62459,  86860, 214478, 125921,   8613,  21352,
        51653, 164208,   8679,  16563,  16966,  48921,  11737, 149738,
        15127, 102677,  52501, 177777,  40286,   8639,  11990,  14753,
       216026, 240522, 223596, 222509,  18397,  76763,   8556,  13420,
       237240, 142220,   8757,  16565,  86813,  14710,  16091,  86628])

In [810]:
subset_normal_id

array([ 43272,  30794, 220214,  14622, 140135,  20618,  19335,  15378,
        88495,  21557,   8720,  18416,  16663,  15225,  88086, 202951,
       166506,  16741,   8994,   8949,  45580,  11720, 109782, 199382,
         8549,  79301, 217899,  59988,  36994,  24509, 150251,  22364,
        21140, 230735,  16650,  17869,  24080,  24368,  52608,  27946])

In [1170]:
#filter out ingredients which are too common and the recipes in the user profile from the recipes database
new_recipe_db = recipe_db.drop(axis=1, labels=drop_id_list)

#original
#new_recipe_db_wo_userrecipes_fat = new_recipe_db.drop(axis=0, labels=subset_fat_id)
#original


####temp#######
new_recipe_db_wo_userrecipes_fat = new_recipe_db.drop(axis=0, labels=temp_fat_20)
####temp#######
new_recipe_db_wo_userrecipes_normal = new_recipe_db.drop(axis=0, labels=subset_normal_id)

In [1171]:
#remove basic ingredients like garlic, olive oil, skinless bone less chicken breast, butter, milk, salt and pepper, onion
new_user_recipes_fat = user_recipes_fat.drop(axis=1, labels=drop_id_list)
new_user_recipes_normal = user_recipes_normal.drop(axis=1, labels=drop_id_list)

In [1172]:
sample_fat = new_user_recipes_fat.sample(n=20).copy()


In [None]:
sample_normal = new_user_recipes_normal.sample(n=40).copy()

In [1173]:
sample_normal
sample_fat

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
220214,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24509,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45580,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150251,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18416,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
202951,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52608,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
140135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24368,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48921,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76763,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14710,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51653,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223596,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
221304,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86813,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1174]:
## Jaccard - fat

result_array = cdist(sample_fat, new_recipe_db_wo_userrecipes_fat,'jaccard')
result_w_filter_fat = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_fat.index.values, index=sample_fat.index.values)
# result_2

result_w_filter_10_fat = pd.DataFrame(result_w_filter_fat.sum(), columns=['jaccard_distance_sum'])
result_w_filter_10_fat = result_w_filter_10_fat.sort_values(by='jaccard_distance_sum')
result_w_filter_10_fat['jaccard_distance_sum'] = result_w_filter_10_fat['jaccard_distance_sum'].div(20)
result_w_filter_10_fat[0:10]


Unnamed: 0,jaccard_distance_sum
8547,0.92198
188706,0.93331
70513,0.94038
141125,0.94441
62256,0.94456
231808,0.94471
90160,0.94768
240773,0.94804
18442,0.94897
14614,0.94936


In [1175]:
## Jaccard - normal

result_array = cdist(sample_normal, new_recipe_db_wo_userrecipes_normal,'jaccard')
result_w_filter_normal = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_normal.index.values, index=sample_normal.index.values)
# result_2

result_w_filter_10_normal = pd.DataFrame(result_w_filter_normal.sum(), columns=['jaccard_distance_sum'])
result_w_filter_10_normal = result_w_filter_10_normal.sort_values(by='jaccard_distance_sum')
result_w_filter_10_normal['jaccard_distance_sum'] = result_w_filter_10_normal['jaccard_distance_sum'].div(20)
result_w_filter_10_normal[0:10]


Unnamed: 0,jaccard_distance_sum
24682,1.8672
31065,1.8774
86047,1.87842
15196,1.88124
159972,1.8844
11937,1.88443
9005,1.88669
30559,1.8867
20456,1.88828
231396,1.8893


In [1176]:
# userrecipes fat
sample_fat.index.to_numpy()

array([ 48921,  18397,  76763,  14710,  17496,  51653,  52501, 223596,
       221304,  86813,  62459,  86628,  16966,   8556,  86860, 142220,
        14753,   8757, 240522, 222509])

In [1177]:
# userrecipes normal
sample_normal.index.to_numpy()


array([220214,  24509,  45580, 150251,  18416, 202951,   8994,  52608,
       140135,  24368,  20618,  14622,  22364, 217899, 166506, 199382,
        17869,  36994,  15225,  59988, 230735,  88495,  16650,   8949,
        21557, 109782,  24080,  11720,  21140,  27946,  15378,   8720,
        30794,  79301,   8549,  16663,  16741,  43272,  88086,  19335])

In [1178]:
result_w_filter_10_fat[0:10].index.to_numpy()

array([  8547, 188706,  70513, 141125,  62256, 231808,  90160, 240773,
        18442,  14614])

In [1179]:
result_w_filter_10_normal[0:10].index.to_numpy()

array([ 24682,  31065,  86047,  15196, 159972,  11937,   9005,  30559,
        20456, 231396])

In [1180]:
result_w_filter_fat_short = result_w_filter_fat.copy()
result_w_filter_normal_short = result_w_filter_normal.copy()

In [1181]:
# top 10 fat short
fat_short = pd.DataFrame(np.sort(result_w_filter_fat_short.values, axis=0), index=result_w_filter_fat_short.index, columns=result_w_filter_fat_short.columns)

new_columns = fat_short.columns[fat_short.loc[fat_short.first_valid_index()].argsort()]
result_fat_short = fat_short[new_columns]
result_fat_short = result_fat_short.reset_index()
result_fat_short = result_fat_short.loc[0].to_frame()
result_fat_short[0:11].index.to_numpy()


array(['index', 8547, 188706, 34361, 206120, 220716, 16372, 14525, 235948,
       11678, 25884], dtype=object)

In [1182]:
fat_short[new_columns]

Unnamed: 0,8547,188706,34361,206120,220716,16372,14525,235948,11678,25884,...,24038,8933,17815,8938,13952,223218,232907,24160,87053,193219
48921,0.0,0.25,0.33333,0.33333,0.33333,0.46154,0.5,0.5,0.5,0.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
18397,0.9,0.81818,0.875,0.875,0.875,0.8,0.91667,0.90909,0.88889,0.9,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
76763,0.90909,0.875,1.0,1.0,1.0,0.88235,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
14710,0.90909,0.9,1.0,1.0,1.0,0.95238,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
17496,0.92308,0.9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
51653,0.92857,0.92308,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
52501,0.92857,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
223596,0.94118,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
221304,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
86813,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [1183]:
sample_fat

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48921,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76763,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14710,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51653,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223596,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
221304,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86813,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1184]:
# top 10 normal short
normal_short = pd.DataFrame(np.sort(result_w_filter_normal_short.values, axis=0), index=result_w_filter_normal_short.index, columns=result_w_filter_normal_short.columns)

new_columns = normal_short.columns[normal_short.loc[normal_short.first_valid_index()].argsort()]
result_normal_short = normal_short[new_columns]
result_normal_short = result_normal_short.reset_index()
result_normal_short = result_normal_short.loc[0].to_frame()
result_normal_short[0:11].index.to_numpy()


array(['index', 15502, 72007, 45361, 14773, 218070, 19400, 139948, 82693,
       178809, 86047], dtype=object)

In [1185]:
normal_short[new_columns]

Unnamed: 0,15502,72007,45361,14773,218070,19400,139948,82693,178809,86047,...,19582,239137,92528,19494,19423,94113,18377,13464,97646,56785
220214,0.0,0.375,0.4,0.4,0.42857,0.5,0.5,0.5,0.5,0.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
24509,0.8,0.83333,0.66667,0.875,0.81818,0.90909,0.83333,0.5,0.8,0.75,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
45580,0.85714,0.84615,0.75,0.875,0.85714,0.92857,0.875,0.66667,0.84615,0.75,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
150251,0.91667,0.86667,0.83333,0.88889,0.88889,0.92857,0.875,0.75,0.84615,0.77778,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
18416,0.91667,0.875,0.875,0.9,0.88889,0.9375,0.88889,0.83333,0.91667,0.81818,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
202951,0.92308,0.88889,0.88889,1.0,0.9,1.0,0.88889,0.85714,0.91667,0.84615,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8994,0.93333,0.9,0.88889,1.0,0.90909,1.0,0.90909,1.0,0.91667,0.875,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
52608,0.93333,0.90909,0.9,1.0,0.91667,1.0,1.0,1.0,0.92308,0.90909,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
140135,0.9375,0.90909,0.92308,1.0,0.93333,1.0,1.0,1.0,0.92857,0.90909,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
24368,0.94118,0.91667,1.0,1.0,1.0,1.0,1.0,1.0,0.92857,0.90909,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [1186]:
sample_fat.index.to_numpy()

array([ 48921,  18397,  76763,  14710,  17496,  51653,  52501, 223596,
       221304,  86813,  62459,  86628,  16966,   8556,  86860, 142220,
        14753,   8757, 240522, 222509])

In [1187]:
sample_normal.index.to_numpy()

array([220214,  24509,  45580, 150251,  18416, 202951,   8994,  52608,
       140135,  24368,  20618,  14622,  22364, 217899, 166506, 199382,
        17869,  36994,  15225,  59988, 230735,  88495,  16650,   8949,
        21557, 109782,  24080,  11720,  21140,  27946,  15378,   8720,
        30794,  79301,   8549,  16663,  16741,  43272,  88086,  19335])

In [1188]:
#create one dimensional ingredients vector from user recipes profile
user_one_vector_fat = sample_fat.copy()
user_one_vector_fat.loc['sum',:]= user_one_vector_fat.sum(axis=0)
user_one_vector_fat = user_one_vector_fat.drop(axis=0, labels=sample_fat.index)
user_one_vector_fat[user_one_vector_fat > 0] = 1

In [1189]:
#create one dimensional ingredients vector from user recipes profile
user_one_vector_normal = sample_normal.copy()
user_one_vector_normal.loc['sum',:]= user_one_vector_normal.sum(axis=0)
user_one_vector_normal = user_one_vector_normal.drop(axis=0, labels=sample_normal.index)
user_one_vector_normal[user_one_vector_normal > 0] = 1

In [1190]:
user_one_vector_normal
user_one_vector_normal.apply(pd.value_counts).count(axis=1)

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sum,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0.00000    562
1.00000    158
dtype: int64

In [1191]:
user_one_vector_fat
user_one_vector_fat.apply(pd.value_counts).count(axis=1)

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sum,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0.00000    626
1.00000     94
dtype: int64

In [1192]:
## Jaccard - Vector fat top 10

result_array = cdist(user_one_vector_fat, new_recipe_db_wo_userrecipes_fat,'jaccard')
result_w_filter_vector_fat = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_fat.index.values, index=user_one_vector_fat.index.values)


result_w_filter_vector_10_fat = pd.DataFrame(result_w_filter_vector_fat.sum(), columns=['jaccard_distance_sum'])
result_w_filter_vector_10_fat = result_w_filter_vector_10_fat.sort_values(by='jaccard_distance_sum')

result_w_filter_vector_10_fat[0:10]


Unnamed: 0,jaccard_distance_sum
51850,0.88889
231396,0.89796
135885,0.9
52299,0.9
145843,0.90426
16372,0.90526
83083,0.90909
180905,0.90909
24682,0.91489
70012,0.91579


In [1193]:
## Jaccard - Vector normal top 10

result_array = cdist(user_one_vector_normal, new_recipe_db_wo_userrecipes_normal,'jaccard')
result_w_filter_vector_normal = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_normal.index.values, index=user_one_vector_normal.index.values)


result_w_filter_vector_10_normal = pd.DataFrame(result_w_filter_vector_normal.sum(), columns=['jaccard_distance_sum'])
result_w_filter_vector_10_normal = result_w_filter_vector_10_normal.sort_values(by='jaccard_distance_sum')

result_w_filter_vector_10_normal[0:10]


Unnamed: 0,jaccard_distance_sum
11921,0.92405
9005,0.92453
231396,0.925
51850,0.92593
239867,0.92683
60598,0.93125
143105,0.93125
8550,0.93168
111905,0.9321
52299,0.93252


In [1194]:
result_w_filter_vector_10_normal[0:10].index.to_numpy()

array([ 11921,   9005, 231396,  51850, 239867,  60598, 143105,   8550,
       111905,  52299])

In [1195]:
####### results

In [1196]:
# userrecipes fat
sample_fat.index.to_numpy()

array([ 48921,  18397,  76763,  14710,  17496,  51653,  52501, 223596,
       221304,  86813,  62459,  86628,  16966,   8556,  86860, 142220,
        14753,   8757, 240522, 222509])

In [1197]:
###### jaccard fat top 10 naiv
result_w_filter_10_fat[0:10].index.to_numpy()

array([  8547, 188706,  70513, 141125,  62256, 231808,  90160, 240773,
        18442,  14614])

In [1198]:
###### jaccard kurz fat top 10
result_fat_short[0:11].index.to_numpy()


array(['index', 8547, 188706, 34361, 206120, 220716, 16372, 14525, 235948,
       11678, 25884], dtype=object)

In [1199]:
###### jaccard vector fat top 10
result_w_filter_vector_10_fat[0:10].index.to_numpy()

array([ 51850, 231396, 135885,  52299, 145843,  16372,  83083, 180905,
        24682,  70012])

In [1200]:
# userrecipes normal
sample_normal.index.to_numpy()

array([220214,  24509,  45580, 150251,  18416, 202951,   8994,  52608,
       140135,  24368,  20618,  14622,  22364, 217899, 166506, 199382,
        17869,  36994,  15225,  59988, 230735,  88495,  16650,   8949,
        21557, 109782,  24080,  11720,  21140,  27946,  15378,   8720,
        30794,  79301,   8549,  16663,  16741,  43272,  88086,  19335])

In [1201]:
###### jaccard normal top 10 naiv
result_w_filter_10_normal[0:10].index.to_numpy()

array([ 24682,  31065,  86047,  15196, 159972,  11937,   9005,  30559,
        20456, 231396])

In [1202]:
###### jaccard kurz normal top 10
result_normal_short[0:11].index.to_numpy()


array(['index', 15502, 72007, 45361, 14773, 218070, 19400, 139948, 82693,
       178809, 86047], dtype=object)

In [1203]:
###### jaccard vector normal top 10
result_w_filter_vector_10_normal[0:10].index.to_numpy()

array([ 11921,   9005, 231396,  51850, 239867,  60598, 143105,   8550,
       111905,  52299])

In [1204]:
##### random top 10
subset_random_top10.index.to_numpy()

array([232097,  22943, 212882, 222249,  43024,   8606, 219919,  72767,
       143375,  47667])

In [1216]:
result_w_filter_fat_50 = result_w_filter_10_fat[0:50].copy()
result_w_filter_fat_50['pos'] = np.arange(len(result_w_filter_fat_50))
result_w_filter_fat_50 = result_w_filter_fat_50.reset_index()
result_w_filter_fat_50 = result_w_filter_fat_50.set_index('index')

In [1236]:
result_w_filter_fat_50 = result_w_filter_vector_10_fat[0:50].copy()
result_w_filter_fat_50['pos'] = np.arange(len(result_w_filter_fat_50))
result_w_filter_fat_50 = result_w_filter_fat_50.reset_index()
result_w_filter_fat_50 = result_w_filter_fat_50.set_index('index')

In [1237]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(result_w_filter_fat_50.index.get_level_values('index').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_db3 = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db3.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_db3.columns = nutrition_db3.columns.droplevel(0)

# entferne alle NA
nutrition_db3 = nutrition_db3.dropna()

In [1238]:
# 19–30 empfohlene nährwerte pro tag #https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t4/?report=objectonly
recommenden_nut_low_fat = pd.DataFrame(data=None, columns=nutrition_db2.columns, index=None)
recommenden_nut_low_fat.loc['index'] = [
    # "Calcium":[1],
    333.3,
    # "Calories":[1],
    800,
    # "Calories from Fat":[1],
    0,
    # "Carbohydrates":[1],
    43.3,
    # "Cholesterol":[1],
    0,
    # "Dietary Fiber	":[1],
    8.3,
    # "Fat":[1],
    1,
    # "Folate":[1],
    133.3,
    # "Iron":[1],
    2.6,
    # "Magnesium":[1],
    133.3,
    # "Niacin Equivalents":[1],
    5.3,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    18.6,
    # "Saturated Fat":[1],
    0,
    # "Sodium":[1],
    0.5,
    # "Sugars":[1],
    0,
    # "Thiamin":[1],
    0.4,
    # "Vitamin A - IU":[1],
    300,
    # "Vitamin B6":[1],
    0.43,
    # "Vitamin C":[1]}
    30]


In [1239]:
no_weight = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    1,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]

weighted_fat = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    10000000,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]

weighted_carbs = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1000000,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    1,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]

In [1240]:
# euklidische distanz für nährwerte

result_array = cdist(recommenden_nut_low_fat, nutrition_db3, 'minkowski', p=2, w=weighted_fat)
euclid_distance = pd.DataFrame(
    result_array, columns=nutrition_db3.index.values, index=recommenden_nut_low_fat.index.values)

euclid_distance_sum_fat = pd.DataFrame(
    euclid_distance.sum(), columns=['euclid_distance_sum'])
euclid_distance_sum_fat = euclid_distance_sum_fat.sort_values(by='euclid_distance_sum')
euclid_distance_sum_fat[0:10]

Unnamed: 0,euclid_distance_sum
51039,11530.32013
22331,23518.12165
16330,26524.87854
51850,26566.83111
111905,28943.41405
22751,32658.76877
73964,39506.76667
218720,42338.13198
86515,45571.94446
72804,48045.81138


In [1241]:
jaccard_euclid_joined_fat = pd.merge(euclid_distance_sum_fat, result_w_filter_fat_50, left_index=True, right_index=True)

In [1242]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(jaccard_euclid_joined_fat[0:10].index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_top10_fat = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_top10_fat.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_top10_fat.columns = nutrition_top10_fat.columns.droplevel(0)

# entferne alle NA
nutrition_top10_fat = nutrition_top10_fat.dropna()

In [1243]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(user_recipes_fat.index.to_numpy())

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_user_recipes_fat = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_user_recipes_fat.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_user_recipes_fat.columns = nutrition_user_recipes_fat.columns.droplevel(0)

# entferne alle NA
nutrition_user_recipes_fat = nutrition_user_recipes_fat.dropna()

In [1244]:
nutrition_user_recipes_fat.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,144.61405,563.83977,334.64292,24.81982,148.59054,2.55403,37.18255,66.661,3.07888,62.11656,15.13814,623.81233,32.44829,15.10086,831.17217,5.65421,0.4484,1688.64223,0.62842,13.15263
std,126.89385,92.24917,14.74709,16.54758,73.82278,2.18785,1.63857,63.02347,1.33454,32.62118,5.77701,236.43041,10.22158,5.02136,509.40516,6.72385,0.46234,2520.36637,0.23017,17.42727
min,22.91706,390.7198,315.5125,1.27936,41.7375,0.09962,35.05695,6.08,0.58999,21.99075,7.48167,287.9466,14.9787,5.39091,46.13375,0.04464,0.096,92.58968,0.21252,0.87
25%,57.0225,500.82482,321.0012,13.55052,112.02998,1.2925,35.6668,23.66486,2.2158,44.04216,10.62018,450.36312,24.81829,12.2637,468.4517,1.8971,0.1503,253.70312,0.52699,2.74889
50%,112.8676,549.52315,335.2659,24.81457,137.62725,1.80051,37.25176,52.50934,3.00261,54.51962,13.48708,576.30155,32.584,14.09328,740.3836,2.21235,0.27884,806.5868,0.58379,8.96771
75%,159.3108,606.45907,347.85147,30.89026,167.07108,3.00643,38.65017,69.04147,4.16262,70.92839,20.04293,761.43085,38.42296,18.68514,1061.28575,7.69758,0.55754,1667.73675,0.73867,12.07114
max,448.9153,754.5421,356.9908,68.65463,402.3,8.02383,39.66564,265.0017,5.45481,148.3127,27.61897,1224.399,51.97705,22.90133,2130.983,22.38983,1.87098,10316.97,1.18235,74.77966


In [1245]:
jaccard_euclid_joined_fat[0:10]

Unnamed: 0,euclid_distance_sum,jaccard_distance_sum,pos
51039,11530.32013,0.91837,18
22331,23518.12165,0.91919,23
16330,26524.87854,0.92632,31
51850,26566.83111,0.88889,0
111905,28943.41405,0.92079,26
22751,32658.76877,0.92632,29
73964,39506.76667,0.92708,40
218720,42338.13198,0.92708,33
86515,45571.94446,0.92079,25
72804,48045.81138,0.91837,19


In [1246]:
jaccard_euclid_joined_fat[0:10].index.to_numpy()

array([ 51039,  22331,  16330,  51850, 111905,  22751,  73964, 218720,
        86515,  72804])

In [1247]:
nutrition_top10_fat.index.to_numpy()

array([ 16330,  22331,  22751,  51039,  51850,  72804,  73964,  86515,
       111905, 218720])

In [1248]:
nutrition_top10_fat.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,131.75451,302.88898,101.32971,28.40184,56.39162,2.85592,11.25886,62.2761,2.64841,45.69298,11.04583,494.85227,21.90511,3.45302,837.7216,11.36681,0.21913,1568.89711,0.46788,26.84125
std,109.05386,81.02762,32.49154,18.79912,18.25881,1.38868,3.61017,44.92809,0.60092,14.83878,5.73558,172.19809,6.17445,1.98677,554.44844,16.14545,0.14238,993.47436,0.23908,27.62678
min,34.35106,194.3627,40.95089,10.2685,15.40167,1.28899,4.5501,7.59145,1.8926,26.74829,6.38666,233.5959,13.60084,0.8855,251.829,1.75411,0.07396,50.63841,0.17184,4.29336
25%,42.49552,239.55507,84.41155,14.82348,48.027,1.79375,9.37906,36.9342,2.08343,34.2893,7.15479,373.82885,16.90826,1.8852,565.15812,2.96123,0.10868,1044.31535,0.32859,10.89144
50%,99.62789,299.0835,96.45348,23.17948,61.29291,2.53357,10.71705,53.48394,2.73235,43.40824,7.78749,482.7032,21.52152,3.14365,640.3308,5.83783,0.18512,1469.3995,0.44899,14.63942
75%,170.74938,340.16745,127.43638,30.44074,68.13024,3.49309,14.1596,81.36064,2.96001,57.63681,14.93306,626.79703,25.15311,4.8145,768.32225,11.96604,0.26448,1843.4485,0.53621,36.55744
max,348.3654,442.1721,145.6328,67.86639,76.508,5.93529,16.18143,155.8812,3.61423,71.90273,23.41771,746.4229,34.32086,6.58164,1930.57,55.54696,0.513,3368.707,1.00677,96.19368


In [1249]:
nutrition_db2.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0,3769.0
mean,147.65186,437.44356,206.11989,29.13837,97.10705,2.59745,22.90221,60.52532,3.15292,51.41953,12.44388,532.06962,28.05446,8.74674,851.15449,6.7255,0.30428,1169.01319,0.50992,16.28996
std,160.00437,183.08939,125.84039,22.82783,60.47445,2.51004,13.98227,59.87693,2.44362,29.11251,6.8939,267.49221,12.01243,6.80052,643.6215,8.81425,0.30165,2322.11124,0.31208,24.36018
min,2.247,51.34326,1.323,0.0,0.0,0.0,0.147,0.0,0.19784,0.99745,0.03511,21.35641,0.7505,0.01992,5.20206,0.0,0.00055,0.0,0.00497,0.0
25%,41.91497,302.9718,116.5374,10.22622,60.372,0.74534,12.9486,15.88967,1.81434,33.22178,7.40249,345.4748,19.97439,3.62848,418.6246,1.76465,0.10514,236.8913,0.2845,2.15387
50%,83.1685,407.2476,182.7628,24.77198,85.12875,1.94552,20.30698,38.70226,2.76635,45.93,11.16012,480.7832,26.86635,7.16552,734.007,3.90585,0.19772,581.4108,0.4463,6.66049
75%,193.6033,541.5573,271.8885,43.2574,126.737,3.66472,30.20983,88.10015,3.93263,63.26534,16.56103,654.2805,34.32086,12.13782,1093.027,7.83809,0.39695,1113.579,0.68318,19.57264
max,1264.326,1828.192,989.103,236.7205,708.85,27.83175,109.9003,519.5359,55.89075,590.3922,47.57578,1931.488,99.72639,59.31002,7322.289,104.8181,2.35457,38664.7,3.83736,270.4131
