In [None]:
import json

import numpy as np
import pandas as pd
import pymongo
import scipy
from bson import ObjectId, json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient as Connection
from scipy import sparse, spatial
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# Render floats with five fixed decimals instead of scientific notation.
pd.set_option('display.float_format', '{:.5f}'.format)

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [None]:
# Lay out multiple outputs of one cell side by side (in a row).
import pandas as pd
import numpy as np
from IPython.display import display, HTML

# Style rule applied to elements with the `.output` class.
CSS = """
.output {
    flex-direction: row;
}
"""

# The HTML object is the cell's last expression, so it is rendered and the rule takes effect.
_style_markup = '<style>' + CSS + '</style>'
HTML(_style_markup)

In [None]:
# Open a MongoDB client with its default connection settings and pick the
# `recipes_test_copy` collection of the `recipe_db` database.
connection = Connection()
db = connection['recipe_db']
input_data = db['recipes_test_copy']

# Serialise all documents with bson's json_util and parse them back with the
# standard json module, so `data` only holds plain JSON-compatible structures.
# NOTE(review): this loads the whole collection into memory at once.
data = json.loads(
    json_util.dumps(input_data.find())
)

In [None]:
# Flatten the nested 'ingredients' records: one row per (recipe, ingredient),
# ingredient fields prefixed with 'ingredients_', recipe 'id' carried along as meta.
# json_normalize already returns a DataFrame, so no extra wrapping is needed.
ing = json_normalize(
    data,
    record_path='ingredients',
    meta='id',
    record_prefix='ingredients_',
    errors='ignore',
)

# Flatten the nested 'nutritions' records together with recipe-level metadata.
_nutrition_meta = ['id', 'prep_time', 'rating', 'rating_count', 'ready_in_time', 'review_count']
nutritions = json_normalize(data, record_path='nutritions', meta=_nutrition_meta)


In [None]:
#  ------  build and cleanse the ingredients frame

# Index by recipe ('id') and ingredient ('ingredients_id') so that whole recipes
# can be dropped through index level 0.
ingredients = ing.set_index(['id', 'ingredients_id'])

# All rows of ingredients that occur 5 times or fewer over the whole data set.
ingredients_eqles_5_ing = ingredients.groupby('ingredients_id').filter(
    lambda rows: len(rows) <= 5
)

# Drop every recipe that uses at least one of those rare ingredients.
_rare_recipe_ids = ingredients_eqles_5_ing.index.get_level_values('id').values
ingredients_filt = ingredients.drop(_rare_recipe_ids, level=0)

# Drop every recipe that contains an ingredient with ingredients_id == 0.
_zero_id_rows = ingredients_filt.index.get_level_values('ingredients_id') == 0
ingredients_eqal_zero = ingredients_filt[_zero_id_rows]
_zero_recipe_ids = ingredients_eqal_zero.index.get_level_values('id').values
ingredients_filt = ingredients_filt.drop(_zero_recipe_ids, level=0)


In [None]:
#  ------ build and cleanse the nutrition frame

# Keep only nutrition rows of recipes that survived the ingredient filtering.
_kept_recipe_ids = ingredients_filt.index.get_level_values('id').values
id_overlap_mask = nutritions['id'].isin(_kept_recipe_ids)
nutritions_filt = nutritions.loc[id_overlap_mask]

# Wide table: one row per recipe id, one column per nutrient name.
nutrition_db = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()
nutrition_db = nutrition_db.set_index('id')

# Remove the outer 'amount' column level so only nutrient names remain.
nutrition_db.columns = nutrition_db.columns.droplevel(0)

# Discard recipes that miss a value for any nutrient.
nutrition_db = nutrition_db.dropna()

# Dropping incomplete recipes shrank the nutrition frame, so re-align the
# ingredients frame to the recipe ids that are still present.
id_overlap_mask = ingredients_filt.index.get_level_values('id').isin(nutrition_db.index)
ingredients_db = ingredients_filt[id_overlap_mask]

# TODO: compare the number of recipe ids in the nutrition and ingredients frames
# (the original note announces this check, but no code for it exists here).


In [None]:


# Turn the ('id', 'ingredients_id') index levels back into ordinary columns.
# Rebinding instead of `inplace=True` avoids mutating a filtered selection of
# `ingredients_filt` in place.
ingredients_db = ingredients_db.reset_index()

# Recipe x ingredient indicator matrix: one-hot encode each ingredient row, then
# take the column-wise maximum per recipe so a cell is 1 when the recipe contains
# that ingredient. `.max()` is used instead of `.apply(max)`, which only worked
# because pandas silently replaced the Python builtin with its NumPy counterpart.
recipe_db = pd.get_dummies(ingredients_db['ingredients_id']).groupby(
    ingredients_db['id']).max()



In [None]:
# Total ingredient weight (grams) per recipe, as a frame with columns 'id' and
# 'ingredients_grams'.
new_ingredients = ingredients_db.copy()
gramms_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index().copy()

# Tukey fences (1.5 * IQR) on the weight column ONLY. Previously the quantiles and
# the outlier test ran over every column of the frame, so the recipe 'id' could
# take part in the outlier decision.
_weight_only = gramms_ingredients[["ingredients_grams"]]
Q1 = _weight_only.quantile(0.25)
Q3 = _weight_only.quantile(0.75)
IQR = Q3 - Q1

# Filter out all recipes whose total weight is an outlier.
_is_weight_outlier = (
    (_weight_only < (Q1 - 1.5 * IQR)) | (_weight_only > (Q3 + 1.5 * IQR))
).any(axis=1)
df = gramms_ingredients[~_is_weight_outlier].copy()

# Keep recipes strictly between the lower and upper weight bound (both exclusive).
# Explicit comparisons replace `between(..., inclusive=False)`, whose boolean
# `inclusive` argument is not accepted by every pandas version.
# NOTE(review): the upper bound is a hard-coded value — presumably taken from an
# earlier run's statistics; confirm its origin.
MIN_GRAMS = 500
MAX_GRAMS = 2373.58225
_grams = df['ingredients_grams']
df_start_at_fivehundret = df[(_grams > MIN_GRAMS) & (_grams < MAX_GRAMS)].copy()

df_start_at_fivehundret = df_start_at_fivehundret.set_index('id')
id_overlap_mask = nutritions['id'].isin(df_start_at_fivehundret.index.get_level_values('id').values)

# Nutrition rows of the recipes that passed the weight filter.
nutritions_filt_gramm = nutritions.loc[id_overlap_mask]

# Wide table: one row per recipe id, one column per nutrient name.
nutrition_db2 = nutritions_filt_gramm.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

# Create the new nutrition db based on the above filtering.
# NOTE(review): unlike nutrition_db, rows with missing nutrients are not dropped
# here — confirm this is intended.
nutrition_db2 = nutrition_db2.set_index('id')
nutrition_db2.columns = nutrition_db2.columns.droplevel(0)



In [None]:
# Random baseline: 10 recipes drawn from nutrition_db2.
# No random_state is passed, so the draw differs between runs.
subset_random_top10 = nutrition_db2.sample(10).copy()

In [None]:
# Draw 40 random recipes whose 'Fat' value lies in the closed range 35-40.
# No random_state is passed, so the draw differs between runs.
_fat_in_range = nutrition_db2['Fat'].between(35, 40)
subset_fat = nutrition_db2[_fat_in_range].sample(40).copy()
subset_fat_id = subset_fat.index.to_numpy()


In [None]:
# Draw 40 random recipes without any nutrient restriction and remember their ids.
# No random_state is passed, so the draw differs between runs.
subset_normal = nutrition_db2.sample(40).copy()
subset_normal_id = subset_normal.index.to_numpy()

In [None]:
subset_normal.index.to_numpy()

In [None]:
subset_fat.index.to_numpy()

In [None]:
# Restrict the recipe/ingredient matrix to recipes that also exist in nutrition_db2.
mask = recipe_db.index.isin(nutrition_db2.index.values)
recipe_db = recipe_db[mask]


In [None]:
subset_fat.describe()

In [None]:
subset_normal.describe()

In [None]:
# Ingredient rows of the recipes that make up each user profile.
user_recipes_fat = recipe_db.loc[recipe_db.index.isin(subset_fat_id)]
user_recipes_normal = recipe_db.loc[recipe_db.index.isin(subset_normal_id)]

In [None]:
#### temporary override ####
# Fixed list of 20 recipe ids; it replaces the user_recipes_fat frame that was
# built from subset_fat_id.
temp_fat_20 = [
    16966, 14753, 62459, 18397, 8757,
    52501, 8556, 240522, 86628, 221304,
    223596, 51653, 48921, 142220, 76763,
    222509, 17496, 14710, 86860, 86813,
]
#### temporary override ####
user_recipes_fat = recipe_db.loc[recipe_db.index.isin(temp_fat_20)]


In [None]:
user_recipes_fat

In [None]:
# The ten most common ingredients: ingredient id -> name (usage count in the comment).
_most_common_ingredients = {
    16421: "salt",                               # 2125
    4342: "garlic",                              # 1506
    4397: "onion",                               # 1412
    16406: "ground black pepper",                # 1185
    16157: "butter",                             # 1016
    6307: "olive oil",                           # 944
    6494: "skinless bone less chicken breast",   # 786
    2496: "water",                               # 749
    16238: "grated Parmesan cheese",             # 574
    16317: "eggs",                               # 538
}

# Ingredient ids (matrix columns) to drop, in the order listed above.
drop_id_list = list(_most_common_ingredients)





In [None]:
subset_fat_id

In [None]:
subset_normal_id

In [None]:
# Remove the overly common ingredient columns from the recipe matrix.
new_recipe_db = recipe_db.drop(columns=drop_id_list)

# Candidate pools: the recipe matrix without the recipes of the respective user profile.
# The fat pool currently excludes the fixed temp_fat_20 ids; the original variant was
#   new_recipe_db_wo_userrecipes_fat = new_recipe_db.drop(axis=0, labels=subset_fat_id)
new_recipe_db_wo_userrecipes_fat = new_recipe_db.drop(index=temp_fat_20)
new_recipe_db_wo_userrecipes_normal = new_recipe_db.drop(index=subset_normal_id)

In [None]:
# Drop the columns of the common ingredients in drop_id_list from both user profiles.
new_user_recipes_fat = user_recipes_fat.drop(columns=drop_id_list)
new_user_recipes_normal = user_recipes_normal.drop(columns=drop_id_list)

In [None]:
# 20 randomly drawn recipes of the fat user profile (unseeded draw).
sample_fat = new_user_recipes_fat.sample(20).copy()


In [None]:
# 40 randomly drawn recipes of the normal user profile (unseeded draw).
sample_normal = new_user_recipes_normal.sample(40).copy()

In [None]:
sample_normal
sample_fat

In [None]:
## Jaccard - fat

# Pairwise Jaccard distance between every profile recipe (rows) and every
# candidate recipe (columns).
result_array = cdist(sample_fat, new_recipe_db_wo_userrecipes_fat, 'jaccard')
result_w_filter_fat = pd.DataFrame(
    result_array,
    columns=new_recipe_db_wo_userrecipes_fat.index.values,
    index=sample_fat.index.values,
)

# Mean distance of each candidate to the profile recipes, ascending (closest first).
# The divisor is taken from the profile size instead of a hard-coded 20, so the
# value stays a mean if the sample size changes. The column keeps its historical
# name 'jaccard_distance_sum' because other cells refer to it.
_n_profile_recipes_fat = len(sample_fat)
result_w_filter_10_fat = pd.DataFrame(result_w_filter_fat.sum(), columns=['jaccard_distance_sum'])
result_w_filter_10_fat = result_w_filter_10_fat.sort_values(by='jaccard_distance_sum')
result_w_filter_10_fat['jaccard_distance_sum'] = result_w_filter_10_fat['jaccard_distance_sum'].div(_n_profile_recipes_fat)
result_w_filter_10_fat[0:10]


In [None]:
## Jaccard - normal

# Pairwise Jaccard distance between every profile recipe (rows) and every
# candidate recipe (columns).
result_array = cdist(sample_normal, new_recipe_db_wo_userrecipes_normal, 'jaccard')
result_w_filter_normal = pd.DataFrame(
    result_array,
    columns=new_recipe_db_wo_userrecipes_normal.index.values,
    index=sample_normal.index.values,
)

# Mean distance of each candidate to the profile recipes, ascending (closest first).
# The sum was previously divided by a hard-coded 20 although this profile holds
# 40 recipes, which doubled the reported mean; the divisor now follows the actual
# profile size. The column keeps its historical name 'jaccard_distance_sum'
# because other cells refer to it.
_n_profile_recipes_normal = len(sample_normal)
result_w_filter_10_normal = pd.DataFrame(result_w_filter_normal.sum(), columns=['jaccard_distance_sum'])
result_w_filter_10_normal = result_w_filter_10_normal.sort_values(by='jaccard_distance_sum')
result_w_filter_10_normal['jaccard_distance_sum'] = result_w_filter_10_normal['jaccard_distance_sum'].div(_n_profile_recipes_normal)
result_w_filter_10_normal[0:10]


In [None]:
# userrecipes fat
sample_fat.index.to_numpy()

In [None]:
# userrecipes normal
sample_normal.index.to_numpy()


In [None]:
result_w_filter_10_fat[0:10].index.to_numpy()

In [None]:
result_w_filter_10_normal[0:10].index.to_numpy()

In [None]:
# Independent working copies of the pairwise distance matrices for the
# "short" (nearest-recipe) ranking.
result_w_filter_fat_short = result_w_filter_fat.copy(deep=True)
result_w_filter_normal_short = result_w_filter_normal.copy(deep=True)

In [None]:
# top 10 fat short

# Sort the distances inside every candidate column in ascending order. After this,
# row 0 holds each candidate's smallest distance to any profile recipe; the row
# labels are reused from the input but no longer identify a profile recipe.
fat_short = pd.DataFrame(np.sort(result_w_filter_fat_short.values, axis=0), index=result_w_filter_fat_short.index, columns=result_w_filter_fat_short.columns)

# Order the candidate columns by the values of the first row (smallest distance first).
new_columns = fat_short.columns[fat_short.loc[fat_short.first_valid_index()].argsort()]
result_fat_short = fat_short[new_columns]

# reset_index() inserts an 'index' column in front, so the resulting one-column
# frame starts with an 'index' entry followed by the ordered candidate ids.
result_fat_short = result_fat_short.reset_index()
# Report row 0 (the smallest distances), i.e. the row the ordering is based on and
# the row the normal profile reports as well. Row 1 (second-smallest distances)
# was taken here before; the order of the ids is the same either way.
result_fat_short = result_fat_short.loc[0].to_frame()

result_fat_short[0:11].index.to_numpy()


In [None]:
fat_short[new_columns]

In [None]:
sample_fat

In [None]:
# top 10 normal short

# Sort the distances inside every candidate column in ascending order; row 0 then
# holds each candidate's smallest distance to any profile recipe. Row labels are
# reused from the input and no longer identify a profile recipe.
_sorted_normal = np.sort(result_w_filter_normal_short.values, axis=0)
normal_short = pd.DataFrame(
    _sorted_normal,
    index=result_w_filter_normal_short.index,
    columns=result_w_filter_normal_short.columns,
)

# Order the candidate columns by the values of the first row (smallest distance first).
_first_row_normal = normal_short.loc[normal_short.first_valid_index()]
new_columns = normal_short.columns[_first_row_normal.argsort()]

# reset_index() puts an 'index' column in front; row 0 is then turned into a
# one-column frame whose index is 'index' followed by the ordered candidate ids.
result_normal_short = normal_short[new_columns].reset_index()
result_normal_short = result_normal_short.loc[0].to_frame()
result_normal_short[0:11].index.to_numpy()


In [None]:
normal_short[new_columns]

In [None]:
sample_fat.index.to_numpy()

In [None]:
sample_normal.index.to_numpy()

In [None]:
# Collapse the fat user profile into a single binary ingredient vector:
# an ingredient is 1 when at least one profile recipe contains it.
user_one_vector_fat = sample_fat.copy()
_fat_ingredient_counts = user_one_vector_fat.sum(axis=0)
user_one_vector_fat.loc['sum', :] = _fat_ingredient_counts
# Keep only the appended 'sum' row, then clip every positive count to 1.
user_one_vector_fat = user_one_vector_fat.drop(index=sample_fat.index)
user_one_vector_fat[user_one_vector_fat > 0] = 1

In [None]:
# Collapse the normal user profile into a single binary ingredient vector:
# an ingredient is 1 when at least one profile recipe contains it.
user_one_vector_normal = sample_normal.copy()
_normal_ingredient_counts = user_one_vector_normal.sum(axis=0)
user_one_vector_normal.loc['sum', :] = _normal_ingredient_counts
# Keep only the appended 'sum' row, then clip every positive count to 1.
user_one_vector_normal = user_one_vector_normal.drop(index=sample_normal.index)
user_one_vector_normal[user_one_vector_normal > 0] = 1

In [None]:
user_one_vector_normal
user_one_vector_normal.apply(pd.value_counts).count(axis=1)

In [None]:
user_one_vector_fat
user_one_vector_fat.apply(pd.value_counts).count(axis=1)

In [None]:
## Jaccard - Vector fat top 10

# Jaccard distance between the single profile vector and every candidate recipe.
result_array = cdist(user_one_vector_fat, new_recipe_db_wo_userrecipes_fat, 'jaccard')
result_w_filter_vector_fat = pd.DataFrame(
    result_array,
    index=user_one_vector_fat.index.values,
    columns=new_recipe_db_wo_userrecipes_fat.index.values,
)

# Column-wise sum (one value per candidate), sorted ascending so the closest
# candidates come first.
_vector_distance_fat = result_w_filter_vector_fat.sum()
result_w_filter_vector_10_fat = pd.DataFrame(
    _vector_distance_fat, columns=['jaccard_distance_sum']
).sort_values(by='jaccard_distance_sum')

result_w_filter_vector_10_fat[0:10]


In [None]:
## Jaccard - Vector normal top 10

# Jaccard distance between the single profile vector and every candidate recipe.
result_array = cdist(user_one_vector_normal, new_recipe_db_wo_userrecipes_normal, 'jaccard')
result_w_filter_vector_normal = pd.DataFrame(
    result_array,
    index=user_one_vector_normal.index.values,
    columns=new_recipe_db_wo_userrecipes_normal.index.values,
)

# Column-wise sum (one value per candidate), sorted ascending so the closest
# candidates come first.
_vector_distance_normal = result_w_filter_vector_normal.sum()
result_w_filter_vector_10_normal = pd.DataFrame(
    _vector_distance_normal, columns=['jaccard_distance_sum']
).sort_values(by='jaccard_distance_sum')

result_w_filter_vector_10_normal[0:10]


In [None]:
result_w_filter_vector_10_normal[0:10].index.to_numpy()

In [None]:
####### results

In [None]:
# userrecipes fat
sample_fat.index.to_numpy()

In [None]:
###### jaccard fat top 10 naiv
result_w_filter_10_fat[0:10].index.to_numpy()

In [None]:
###### jaccard kurz fat top 10
result_fat_short[1:11].index.to_numpy()


In [None]:
###### jaccard vector fat top 10
result_w_filter_vector_10_fat[0:10].index.to_numpy()

In [None]:
# userrecipes normal
sample_normal.index.to_numpy()

In [None]:
###### jaccard normal top 10 naiv
result_w_filter_10_normal[0:10].index.to_numpy()

In [None]:
###### jaccard kurz normal top 10
result_normal_short[1:11].index.to_numpy()


In [None]:
###### jaccard vector normal top 10
result_w_filter_vector_10_normal[0:10].index.to_numpy()

In [None]:
##### random top 10
subset_random_top10.index.to_numpy()

In [None]:
# Sources for the reference values:
#https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t4/?report=objectonly
#https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t1/?report=objectonly
#https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t5/?report=objectonly
# Men aged 31 - 50, per-day values divided by 3 meals.

# Empty frame with the nutrient columns of nutrition_db2.
recommenden_nut_low_fat = pd.DataFrame(data=None, columns=nutrition_db2.columns, index=None)

# Target value per nutrient. The values are assigned POSITIONALLY, so the order
# below must match the column order of nutrition_db2; the names only label the
# intended nutrient.
# NOTE(review): several entries (0, 1, 0.5) look like placeholders rather than
# reference intakes — confirm.
_low_fat_targets = [
    ("Calcium", 266.67),
    ("Calories", 500),
    ("Calories from Fat", 0),
    ("Carbohydrates", 43.33),
    ("Cholesterol", 0),
    ("Dietary Fiber", 12.67),
    ("Fat", 10),                  # the original note also lists 16.67
    ("Folate", 106.67),
    ("Iron", 2),
    ("Magnesium", 116.67),
    ("Niacin Equivalents", 4),
    ("Potassium", 1),
    ("Protein", 18.6),
    ("Saturated Fat", 1),
    ("Sodium", 0.5),
    ("Sugars", 1),
    ("Thiamin", 0.3),
    ("Vitamin A - IU", 208),
    ("Vitamin B6", 0.36),
    ("Vitamin C", 25),
]

# Single row labelled 'index' that holds the target profile.
recommenden_nut_low_fat.loc['index'] = [target for _, target in _low_fat_targets]

In [None]:
# Nutrients in the positional order used by all weight vectors below.
_weighted_nutrients = [
    "Calcium",
    "Calories",
    "Calories from Fat",
    "Carbohydrates",
    "Cholesterol",
    "Dietary Fiber",
    "Fat",
    "Folate",
    "Iron",
    "Magnesium",
    "Niacin Equivalents",
    "Potassium",
    "Protein",
    "Saturated Fat",
    "Sodium",
    "Sugars",
    "Thiamin",
    "Vitamin A - IU",
    "Vitamin B6",
    "Vitamin C",
]

# Every nutrient counts equally.
no_weight = [1 for _ in _weighted_nutrients]

# Emphasis on Fat, with smaller emphasis on Calcium and Magnesium; all others 1.
_fat_emphasis = {"Calcium": 1000, "Fat": 100000, "Magnesium": 100}
weighted_fat = [_fat_emphasis.get(nutrient, 1) for nutrient in _weighted_nutrients]

# Emphasis on Carbohydrates only; all others 1.
_carb_emphasis = {"Carbohydrates": 1000000}
weighted_carbs = [_carb_emphasis.get(nutrient, 1) for nutrient in _weighted_nutrients]

In [None]:
# First 100 rows of the vector-based ranking plus their position ('pos', 0-based).
_first_100_vector = result_w_filter_vector_10_fat[0:100].copy()
_first_100_vector['pos'] = np.arange(len(_first_100_vector))
# Round trip through a column so the index ends up named 'index'.
result_w_filter_fat_100_vector = _first_100_vector.reset_index().set_index('index')

In [None]:
# First 100 rows of the per-recipe (naive) ranking plus their position ('pos', 0-based).
_first_100_naive = result_w_filter_10_fat[0:100].copy()
_first_100_naive['pos'] = np.arange(len(_first_100_naive))
# Round trip through a column so the index ends up named 'index'.
result_w_filter_fat_100 = _first_100_naive.reset_index().set_index('index')

In [None]:
#  ------ build and cleanse the nutrition frame for the naive fat candidates

# Nutrition rows of the recipes listed in result_w_filter_fat_100.
id_overlap_mask = nutritions['id'].isin(result_w_filter_fat_100.index.to_numpy())
nutritions_filt = nutritions.loc[id_overlap_mask]

# Wide table: one row per recipe id, one column per nutrient name.
nutrition_fat_naiv = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
)

# Remove the outer 'amount' column level so only nutrient names remain.
nutrition_fat_naiv.columns = nutrition_fat_naiv.columns.droplevel(0)

# Discard recipes that miss a value for any nutrient.
nutrition_fat_naiv = nutrition_fat_naiv.dropna()

In [None]:
# Weighted Minkowski distance (p=2, i.e. Euclidean form) between the target
# nutrient profile and every candidate recipe.
result_array = cdist(
    recommenden_nut_low_fat, nutrition_fat_naiv, 'minkowski', p=2, w=weighted_fat
)
euclid_distance = pd.DataFrame(
    result_array,
    index=recommenden_nut_low_fat.index.values,
    columns=nutrition_fat_naiv.index.values,
)

# One distance per candidate, ascending (closest to the target first).
_distance_per_recipe = euclid_distance.sum()
euclid_distance_sum_fat_naiv = pd.DataFrame(
    _distance_per_recipe, columns=['euclid_distance_sum']
).sort_values(by='euclid_distance_sum')
euclid_distance_sum_fat_naiv.iloc[:10]

In [None]:
# Join nutrient distance and Jaccard ranking on the recipe id (index of both frames).
jaccard_euclid_joined_fat_naiv = euclid_distance_sum_fat_naiv.merge(
    result_w_filter_fat_100, left_index=True, right_index=True
)

In [None]:
jaccard_euclid_joined_fat_naiv[0:10]

In [None]:
#  ------ build and cleanse the nutrition frame for the naive top 10

# Nutrition rows of the first 10 recipes of the joined ranking.
id_overlap_mask = nutritions['id'].isin(jaccard_euclid_joined_fat_naiv[0:10].index.to_numpy())
nutritions_filt = nutritions.loc[id_overlap_mask]

# Wide table: one row per recipe id, one column per nutrient name.
nutrition_top10_fat_naiv = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
)

# Remove the outer 'amount' column level so only nutrient names remain.
nutrition_top10_fat_naiv.columns = nutrition_top10_fat_naiv.columns.droplevel(0)

# Discard recipes that miss a value for any nutrient.
nutrition_top10_fat_naiv = nutrition_top10_fat_naiv.dropna()

In [269]:
nutrition_top10_fat_naiv.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,212.15929,347.26522,140.72331,29.57564,75.88731,3.13171,15.63592,70.16376,3.28521,43.60824,9.33045,495.32276,21.98825,6.81862,726.05904,5.48924,0.26014,765.58346,0.36334,26.51701
std,82.36201,102.79235,25.5505,20.57749,22.26041,2.20438,2.83894,58.62786,1.81904,16.10101,3.46433,165.79725,5.96208,1.84304,269.68843,2.73489,0.18882,597.76996,0.15497,38.15255
min,115.4559,200.5096,96.83125,8.50724,53.31907,0.18479,10.75903,19.02514,1.61694,22.19329,6.48622,233.5959,11.25829,3.16377,135.44,1.53699,0.06435,173.6259,0.16549,0.24869
25%,169.32903,321.24892,128.54217,20.18204,56.19729,2.15752,14.28247,51.08488,2.39708,31.66494,7.25196,362.11325,19.86547,6.17503,583.50835,3.21599,0.12469,519.63588,0.2325,2.78399
50%,189.0762,343.44495,134.31735,24.76168,71.35308,2.62511,14.92415,56.93271,3.00156,43.27683,8.76698,520.37845,21.88125,6.95089,814.45765,5.27446,0.24354,603.88525,0.36174,7.66879
75%,250.67625,370.21308,154.89895,28.30338,94.22309,3.5301,17.211,64.72732,3.33173,47.02484,9.56176,624.49435,24.10793,8.07506,889.37787,7.74314,0.30506,745.44445,0.48236,28.78732
max,348.3654,587.8757,179.0071,75.33678,109.2033,7.9905,19.88968,232.0606,8.11861,74.57674,18.56014,707.6725,33.57063,9.21249,1034.152,9.49131,0.71992,2207.053,0.57304,106.7028


In [None]:
nutrition_top10_fat_naiv.index.to_numpy()

In [330]:
# Subset of nutrient columns to summarise for the naive top 10.
_summary_nutrients = [
    'Calories', 'Calories from Fat', 'Carbohydrates', 'Fat', 'Saturated Fat',
    'Protein', 'Calcium', 'Magnesium', 'Iron',
]
new_df = nutrition_top10_fat_naiv[_summary_nutrients]

In [332]:
new_df.describe()

name,Calories,Calories from Fat,Carbohydrates,Fat,Saturated Fat,Protein,Calcium,Magnesium,Iron
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,347.26522,140.72331,29.57564,15.63592,6.81862,21.98825,212.15929,43.60824,3.28521
std,102.79235,25.5505,20.57749,2.83894,1.84304,5.96208,82.36201,16.10101,1.81904
min,200.5096,96.83125,8.50724,10.75903,3.16377,11.25829,115.4559,22.19329,1.61694
25%,321.24892,128.54217,20.18204,14.28247,6.17503,19.86547,169.32903,31.66494,2.39708
50%,343.44495,134.31735,24.76168,14.92415,6.95089,21.88125,189.0762,43.27683,3.00156
75%,370.21308,154.89895,28.30338,17.211,8.07506,24.10793,250.67625,47.02484,3.33173
max,587.8757,179.0071,75.33678,19.88968,9.21249,33.57063,348.3654,74.57674,8.11861


In [261]:
# Labels at positions 1..100 of the short ranking (position 0 is skipped), as an
# independent array.
result_w_filter_fat_100_short = result_fat_short.index[1:101].to_numpy().copy()


In [264]:
#  ------ build and cleanse the nutrition frame for the short fat candidates

# Nutrition rows of the recipes at positions 1..100 of the short ranking.
id_overlap_mask = nutritions['id'].isin(result_fat_short[1:101].index.to_numpy())
nutritions_filt = nutritions.loc[id_overlap_mask]

# Wide table: one row per recipe id, one column per nutrient name.
nutrition_fat_short = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
)

# Remove the outer 'amount' column level so only nutrient names remain.
nutrition_fat_short.columns = nutrition_fat_short.columns.droplevel(0)

# Discard recipes that miss a value for any nutrient.
nutrition_fat_short = nutrition_fat_short.dropna()

In [266]:
# Weighted Minkowski distance (p=2, i.e. Euclidean form) between the target
# nutrient profile and every candidate recipe.
result_array = cdist(
    recommenden_nut_low_fat, nutrition_fat_short, 'minkowski', p=2, w=weighted_fat
)
euclid_distance = pd.DataFrame(
    result_array,
    index=recommenden_nut_low_fat.index.values,
    columns=nutrition_fat_short.index.values,
)

# One distance per candidate, ascending (closest to the target first).
_distance_per_recipe = euclid_distance.sum()
euclid_distance_sum_fat_short = pd.DataFrame(
    _distance_per_recipe, columns=['euclid_distance_sum']
).sort_values(by='euclid_distance_sum')
euclid_distance_sum_fat_short.iloc[:10]

Unnamed: 0,euclid_distance_sum
235323,2963.70113
18442,3126.81472
8965,3288.17839
17869,3301.38331
52148,3392.22542
73964,3654.74357
21297,4087.73266
11916,4453.50712
34361,4911.82372
14724,4969.35463


In [267]:
#  ------ build and cleanse the nutrition frame for the short top 10

# Nutrition rows of the 10 recipes closest to the target nutrient profile.
id_overlap_mask = nutritions['id'].isin(euclid_distance_sum_fat_short[0:10].index.to_numpy())
nutritions_filt = nutritions.loc[id_overlap_mask]

# Wide table: one row per recipe id, one column per nutrient name.
nutrition_top10_fat_short = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
)

# Remove the outer 'amount' column level so only nutrient names remain.
nutrition_top10_fat_short.columns = nutrition_top10_fat_short.columns.droplevel(0)

# Discard recipes that miss a value for any nutrient.
nutrition_top10_fat_short = nutrition_top10_fat_short.dropna()

In [268]:
nutrition_top10_fat_short.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,246.13638,376.5342,135.33025,34.87812,69.31935,4.13922,15.03669,99.12528,3.21327,56.35856,11.34105,617.69192,24.47273,6.29289,797.67408,6.75959,0.39576,1104.28851,0.47839,41.1223
std,93.92264,117.27508,27.33583,19.38038,37.53288,2.64418,3.03731,64.34844,1.02763,12.21409,5.31644,196.4877,9.16575,1.35276,273.34081,3.73947,0.35304,1035.22989,0.22483,42.60456
min,120.13,236.8492,75.64122,10.2685,22.48,0.6075,8.40458,17.745,1.85349,39.66,6.53378,417.7853,11.25829,4.18225,135.44,3.40015,0.06435,90.8075,0.23307,0.061
25%,176.58055,316.09415,127.64125,23.62964,55.34107,2.66736,14.18237,59.8402,2.44834,45.85537,7.18329,487.32785,19.06985,5.22905,740.28135,4.23192,0.1666,646.7083,0.31175,7.41081
50%,241.66255,326.81905,133.37395,31.10388,65.785,3.5866,14.81932,79.54997,3.17201,60.56481,9.33337,563.69635,22.24039,6.67609,864.6123,5.647,0.2284,835.2098,0.42745,24.16721
75%,337.26127,416.24435,152.42108,40.58051,75.97485,4.21726,16.93567,131.91579,3.56851,64.26294,13.28304,704.4456,31.09789,7.12775,981.51135,7.74314,0.60273,1068.84575,0.55425,76.6328
max,352.9491,649.576,170.7775,75.25994,153.695,9.46078,18.97528,232.0606,5.33774,74.38944,21.90084,1074.648,40.23982,8.03736,1030.996,15.84195,1.16597,3895.998,0.88844,106.7028


In [277]:
nutrition_top10_fat_short.index.to_numpy()

array([  8965,  11916,  14724,  17869,  18442,  21297,  34361,  52148,
        73964, 235323])

In [271]:
# First 100 rows of the vector-based ranking plus their position ('pos', 0-based).
_first_100_vector = result_w_filter_vector_10_fat[0:100].copy()
_first_100_vector['pos'] = np.arange(len(_first_100_vector))
# Round trip through a column so the index ends up named 'index'.
result_w_filter_fat_100_vector = _first_100_vector.reset_index().set_index('index')

In [272]:
#  ------ build and cleanse the nutrition frame for the vector fat candidates

# Nutrition rows of the recipes listed in result_w_filter_fat_100_vector.
id_overlap_mask = nutritions['id'].isin(result_w_filter_fat_100_vector.index.to_numpy())
nutritions_filt = nutritions.loc[id_overlap_mask]

# Wide table: one row per recipe id, one column per nutrient name.
nutrition_fat_vector = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
)

# Remove the outer 'amount' column level so only nutrient names remain.
nutrition_fat_vector.columns = nutrition_fat_vector.columns.droplevel(0)

# Discard recipes that miss a value for any nutrient.
nutrition_fat_vector = nutrition_fat_vector.dropna()

In [273]:
# Weighted Minkowski distance (p=2, i.e. Euclidean form) between the target
# nutrient profile and every candidate recipe.
result_array = cdist(
    recommenden_nut_low_fat, nutrition_fat_vector, 'minkowski', p=2, w=weighted_fat
)
euclid_distance = pd.DataFrame(
    result_array,
    index=recommenden_nut_low_fat.index.values,
    columns=nutrition_fat_vector.index.values,
)

# One distance per candidate, ascending (closest to the target first).
_distance_per_recipe = euclid_distance.sum()
euclid_distance_sum_fat_vector = pd.DataFrame(
    _distance_per_recipe, columns=['euclid_distance_sum']
).sort_values(by='euclid_distance_sum')
euclid_distance_sum_fat_vector.iloc[:10]

Unnamed: 0,euclid_distance_sum
222635,2656.70711
22751,2884.62922
218720,3205.85598
73964,3654.74357
47764,3694.71307
125646,3990.04083
51850,4309.87153
213108,4646.40433
213742,4690.02647
86515,4755.09357


In [274]:
#  ------ build and cleanse the nutrition frame for the vector top 10

# Nutrition rows of the 10 recipes closest to the target nutrient profile.
id_overlap_mask = nutritions['id'].isin(euclid_distance_sum_fat_vector[0:10].index.to_numpy())
nutritions_filt = nutritions.loc[id_overlap_mask]

# Wide table: one row per recipe id, one column per nutrient name.
nutrition_top10_fat_vector = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
)

# Remove the outer 'amount' column level so only nutrient names remain.
nutrition_top10_fat_vector.columns = nutrition_top10_fat_vector.columns.droplevel(0)

# Discard recipes that miss a value for any nutrient.
nutrition_top10_fat_vector = nutrition_top10_fat_vector.dropna()

In [275]:
nutrition_top10_fat_vector.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,220.52175,331.05631,132.62326,28.41211,63.10973,3.76223,14.73592,77.04844,3.3878,53.94445,8.25789,551.8501,21.78478,6.02939,635.90571,5.13299,0.23259,1543.57442,0.37947,27.14741
std,74.32726,59.43686,39.81566,15.33014,30.57142,2.63906,4.42396,38.64705,1.24772,21.86293,1.51348,147.3885,5.28264,2.71222,234.36754,2.16354,0.1082,1142.29737,0.11479,21.30309
min,141.8622,225.3194,69.70808,10.2685,13.125,1.54934,7.74534,30.26573,2.27139,26.74829,6.38666,233.5959,13.60084,1.43604,251.829,1.75411,0.09686,390.7916,0.17184,3.5675
25%,157.40613,322.22235,106.55367,16.8523,54.67246,2.15752,11.83929,51.66255,2.76399,45.23232,7.25584,502.0589,19.92655,4.59974,523.47958,3.64864,0.16514,657.50013,0.32859,6.43829
50%,205.8262,342.15345,133.92645,27.30739,69.31846,2.71036,14.88071,68.84002,3.00156,48.58591,7.63795,532.84835,22.02222,6.31056,623.1429,5.27446,0.22213,1021.4543,0.38966,32.3401
75%,264.83247,374.03538,155.57768,30.44074,75.77294,4.60132,17.28641,102.38766,3.52882,57.63681,9.00264,667.16242,24.22196,8.07046,834.67255,6.97998,0.28556,2611.8025,0.47795,39.1125
max,348.3654,406.0063,193.9669,54.42183,109.2033,10.10235,21.55188,155.8812,6.70979,110.1314,11.35271,742.503,31.03836,9.54182,939.8853,7.85019,0.40852,3368.707,0.49912,68.26264


In [276]:
nutrition_top10_fat_vector.index.to_numpy()

array([ 22751,  47764,  51850,  73964,  86515, 125646, 213108, 213742,
       218720, 222635])

In [302]:
#  ------ build and cleanse the nutrition frame for all naive fat candidates

# Nutrition rows of every recipe present in the naive Jaccard ranking.
id_overlap_mask = nutritions['id'].isin(result_w_filter_10_fat.index.to_numpy())
nutritions_filt = nutritions.loc[id_overlap_mask]

# Wide table: one row per recipe id, one column per nutrient name.
random_nut_fat_naiv = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
)

# Remove the outer 'amount' column level so only nutrient names remain.
random_nut_fat_naiv.columns = random_nut_fat_naiv.columns.droplevel(0)

# Discard recipes that miss a value for any nutrient.
random_nut_fat_naiv = random_nut_fat_naiv.dropna()


In [303]:
# Naive fat candidates ordered by ascending 'Fat' value.
asd = random_nut_fat_naiv.sort_values('Fat')

In [305]:
asd[0:10].describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,38.70887,186.72419,8.5929,29.76577,38.54453,2.73469,0.95477,44.79653,2.01912,39.79316,6.3333,482.15195,14.43233,0.18719,699.62336,7.82159,0.19012,950.15627,0.41897,17.61181
std,13.91447,88.90485,3.00284,25.51754,54.26492,2.79798,0.33365,47.34177,1.29271,22.17036,3.59982,462.21356,10.36769,0.07805,425.08936,10.32595,0.22016,1004.53875,0.32838,20.78962
min,22.2465,67.54216,1.323,1.9255,0.0,0.26294,0.147,4.35167,0.35016,8.685,0.54284,152.8917,0.7505,0.01992,183.8796,0.58219,0.00985,7.38,0.06808,1.40368
25%,29.03908,144.54335,8.60387,12.03016,0.0,1.03581,0.95599,12.82424,1.0943,29.74379,3.95787,185.0815,7.18936,0.14782,384.3189,3.15031,0.04068,332.61125,0.18691,6.52157
50%,37.62199,165.2826,9.6192,24.0673,21.98067,1.64616,1.0688,25.03194,1.9947,36.51286,6.34558,313.9214,11.93949,0.19185,606.5583,3.95583,0.0793,665.54215,0.31446,9.33288
75%,46.88089,215.7368,9.96918,42.23227,57.18543,2.73951,1.10769,64.98426,2.25283,42.8675,8.36965,537.11617,19.40237,0.24751,882.4669,5.24654,0.23163,1124.8705,0.56084,20.23434
max,68.01366,397.88,11.42801,88.31117,172.575,8.96175,1.26978,134.8421,4.33315,91.76,12.94783,1587.047,31.80096,0.27537,1469.935,34.79835,0.59393,3392.832,1.1023,72.693


In [297]:
# All recipes of nutrition_db2 ordered by ascending 'Fat' value.
random_nut_fat = nutrition_db2.sort_values('Fat')

In [298]:
random_nut_fat[0:10].index.to_numpy()

array([ 14725,  53194, 223269, 216688,  23444,  12768,  50939,  99480,
        13963,  19478])

In [299]:
#  ------ Build and clean the nutrition frame for the first ten rows of `random_nut_fat`

# Boolean mask over `nutritions`: True where the row's 'id' is one of the first
# ten index labels of `random_nut_fat`.
# NOTE: this cell rebinds `random_nut_fat` below, so on a re-run the mask is
# built from the pivoted frame instead of the sorted input.
id_overlap_mask = nutritions['id'].isin(random_nut_fat.index[:10])

# Long-format nutrition rows belonging to the selected ids.
nutritions_filt = nutritions.loc[id_overlap_mask]

# Wide table: one row per 'id', one column per 'name', cell = mean 'amount'
# (pivot_table's default aggregation). Scalar keys give a flat 'id' index and
# flat 'name' columns directly, so neither a reset_index/set_index round trip
# nor a droplevel on the columns is required.
random_nut_fat = nutritions_filt.pivot_table(
    index='id',
    columns='name',
    values='amount',
).dropna()  # keep only ids that have a value in every column

In [300]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
random_nut_fat.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,38.70887,186.72419,8.5929,29.76577,38.54453,2.73469,0.95477,44.79653,2.01912,39.79316,6.3333,482.15195,14.43233,0.18719,699.62336,7.82159,0.19012,950.15627,0.41897,17.61181
std,13.91447,88.90485,3.00284,25.51754,54.26492,2.79798,0.33365,47.34177,1.29271,22.17036,3.59982,462.21356,10.36769,0.07805,425.08936,10.32595,0.22016,1004.53875,0.32838,20.78962
min,22.2465,67.54216,1.323,1.9255,0.0,0.26294,0.147,4.35167,0.35016,8.685,0.54284,152.8917,0.7505,0.01992,183.8796,0.58219,0.00985,7.38,0.06808,1.40368
25%,29.03908,144.54335,8.60387,12.03016,0.0,1.03581,0.95599,12.82424,1.0943,29.74379,3.95787,185.0815,7.18936,0.14782,384.3189,3.15031,0.04068,332.61125,0.18691,6.52157
50%,37.62199,165.2826,9.6192,24.0673,21.98067,1.64616,1.0688,25.03194,1.9947,36.51286,6.34558,313.9214,11.93949,0.19185,606.5583,3.95583,0.0793,665.54215,0.31446,9.33288
75%,46.88089,215.7368,9.96918,42.23227,57.18543,2.73951,1.10769,64.98426,2.25283,42.8675,8.36965,537.11617,19.40237,0.24751,882.4669,5.24654,0.23163,1124.8705,0.56084,20.23434
max,68.01366,397.88,11.42801,88.31117,172.575,8.96175,1.26978,134.8421,4.33315,91.76,12.94783,1587.047,31.80096,0.27537,1469.935,34.79835,0.59393,3392.832,1.1023,72.693


In [None]:
# Show the first fifty rows of the frame sorted by 'Fat'.
asd.head(50)

In [None]:
# Index labels of the first ten rows of `asd`, as a NumPy array.
asd.index[:10].to_numpy()

In [312]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
nutrition_fat_naiv.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,193.58762,542.33307,280.92923,35.24085,126.84912,3.11108,31.21436,82.54184,3.74909,51.49306,12.97124,588.9095,30.01276,13.91145,878.25259,5.90811,0.34132,1067.70516,0.46588,20.47902
std,152.15803,233.04451,142.35312,28.64292,53.88096,3.21868,15.81701,66.17241,1.6952,21.31506,5.61848,280.87722,11.85509,8.19141,536.23257,4.45822,0.29503,1583.40654,0.23021,29.69788
min,21.16217,178.0237,81.26787,3.96258,42.622,0.0525,9.02976,5.91633,0.63185,14.54606,3.28438,97.71526,7.24539,2.95201,90.84255,0.35572,0.02349,24.77542,0.15045,0.05513
25%,86.60862,347.4664,163.6954,18.63982,77.00405,1.38853,18.18838,38.24905,2.52238,35.8792,9.08077,384.01743,21.36922,6.9111,559.1054,2.95941,0.14412,327.33435,0.28597,2.29055
50%,135.24265,521.0789,257.11545,29.01693,121.8922,2.31647,28.56839,61.06267,3.41979,48.47376,12.01238,544.8653,27.71758,12.81584,770.0264,4.64207,0.254,708.12,0.43604,6.28739
75%,270.9939,671.0417,358.3751,44.43137,162.93713,3.81326,39.81945,109.30135,4.77342,64.83919,16.13937,735.3778,37.10145,17.99064,1057.4165,7.94706,0.41822,1275.62625,0.57877,23.21081
max,805.7242,1828.192,669.4866,236.7205,255.2775,27.83175,74.38741,356.0612,11.15067,122.8545,34.42963,1621.572,73.59243,37.37759,2885.613,21.95344,1.56088,12034.2,1.15913,110.5156


In [318]:
# Draw eight rows whose 'Fat' value lies in the closed interval [10, 15].
# `between` is inclusive on both ends, matching `>= 10` and `<= 15`.
# A fixed random_state makes the draw reproducible on every run, consistent
# with the seeded sample used elsewhere in this notebook.
# Raises ValueError if fewer than eight rows fall inside the interval.
subset_fat_asd = (
    nutrition_fat_naiv[nutrition_fat_naiv['Fat'].between(10, 15)]
    .sample(n=8, random_state=0)
    .copy()
)

In [319]:
# Summary statistics of the eight sampled rows.
subset_fat_asd.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,162.80304,334.50117,123.01987,32.78413,67.18158,4.00545,13.66888,91.52332,3.08246,48.43924,8.95656,608.06612,20.27056,5.84605,617.81806,6.73912,0.2728,1990.47048,0.43365,32.21983
std,118.70006,86.45515,11.0902,19.87579,11.43892,2.94701,1.23224,70.15791,1.11092,20.77865,2.33444,464.87719,4.59847,1.50548,349.37536,4.284,0.21137,4074.32996,0.31372,35.18379
min,42.2215,200.5096,96.83125,8.50724,53.31907,0.18479,10.75903,19.02514,1.61694,22.19329,6.48622,202.2782,11.25829,3.22836,132.1991,2.23212,0.06435,124.5866,0.16549,0.24869
25%,88.1635,287.39178,122.28388,19.57224,55.15042,2.41122,13.5871,54.77253,2.16691,37.15158,7.09761,287.17927,18.83971,5.27881,422.6006,3.92589,0.1452,405.76583,0.24251,1.50123
50%,117.79295,338.0464,127.1827,27.61447,71.05518,2.62511,14.13142,63.03995,3.1824,47.48286,8.56762,491.94015,21.3981,5.76511,665.69385,6.00521,0.20866,592.38995,0.35662,19.70184
75%,212.52058,394.0836,128.54217,54.38408,72.83828,6.15036,14.28247,106.04683,3.77064,54.7819,10.08627,733.3768,22.93564,6.74809,836.35553,8.35532,0.3526,786.9942,0.50485,61.95011
max,348.3654,451.0129,131.3139,55.97997,83.48,8.341,14.59043,232.0606,4.67316,88.1089,13.20469,1621.572,25.49099,8.03736,1054.259,15.31507,0.71992,12034.2,1.12891,85.117


In [320]:
#  ------ Build and clean the nutrition frame for ten randomly sampled recipes

# Boolean mask over `nutritions`: True where the row's 'id' is one of the ten
# index labels sampled (with a fixed seed) from
# `new_recipe_db_wo_userrecipes_fat`.
id_overlap_mask = nutritions['id'].isin(
    new_recipe_db_wo_userrecipes_fat.sample(n=10, random_state=0).index
)

# Long-format nutrition rows belonging to the sampled ids.
nutritions_filt = nutritions.loc[id_overlap_mask]

# Wide table: one row per 'id', one column per 'name', cell = mean 'amount'
# (pivot_table's default aggregation). Scalar keys give a flat 'id' index and
# flat 'name' columns directly, so neither a reset_index/set_index round trip
# nor a droplevel on the columns is required.
nutrition_random_10 = nutritions_filt.pivot_table(
    index='id',
    columns='name',
    values='amount',
).dropna()  # keep only ids that have a value in every column

In [321]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
nutrition_random_10.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,137.90247,451.04469,211.85386,31.49681,82.84401,2.89383,23.53932,74.5543,3.44682,51.44868,12.6177,499.74915,28.10035,9.71581,719.19517,5.93412,0.27628,797.43848,0.47752,22.14323
std,128.19425,128.11832,112.91222,19.86946,23.58573,2.13513,12.5458,63.62439,1.83572,25.37871,4.65311,228.03533,6.53025,10.16515,447.6618,4.63905,0.22714,634.90664,0.12033,24.8011
min,32.86518,279.2421,108.2086,2.57644,41.49303,0.31548,12.02318,11.60304,1.22664,27.07817,7.11536,262.1083,21.15413,2.13052,82.90843,0.1345,0.08658,79.34875,0.27633,0.0
25%,45.73175,371.96933,135.81108,18.95888,68.72561,1.3943,15.09012,33.66391,1.97259,33.18492,8.49876,340.3263,22.94277,4.65956,304.59725,2.14126,0.16069,320.54398,0.38944,5.34257
50%,69.95123,428.61385,178.1548,29.86084,78.14209,2.72588,19.79498,53.02626,3.44418,38.86832,12.86435,400.11105,26.50746,7.84864,814.1019,6.20918,0.17573,718.8144,0.5167,11.51254
75%,195.9075,497.28538,240.62243,43.23842,99.89125,3.86521,26.73582,90.83728,4.3505,66.89249,15.29302,699.12432,31.21765,9.31209,1118.098,7.57461,0.32586,969.5833,0.55993,37.01577
max,376.1217,654.7633,490.6887,66.52579,118.39,7.56122,54.52096,199.0158,7.47058,100.5429,20.54744,874.2894,41.70995,37.4432,1285.509,13.47391,0.83562,2185.772,0.62032,72.18467


In [327]:
# Keep only the selected nutrient columns, in this display order.
new_df = nutrition_random_10.loc[:, [
    'Calories',
    'Calories from Fat',
    'Carbohydrates',
    'Fat',
    'Saturated Fat',
    'Protein',
    'Calcium',
    'Magnesium',
    'Iron',
]]

In [329]:
# Summary statistics for the selected subset of columns.
new_df.describe()

name,Calories,Calories from Fat,Carbohydrates,Fat,Saturated Fat,Protein,Calcium,Magnesium,Iron
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,451.04469,211.85386,31.49681,23.53932,9.71581,28.10035,137.90247,51.44868,3.44682
std,128.11832,112.91222,19.86946,12.5458,10.16515,6.53025,128.19425,25.37871,1.83572
min,279.2421,108.2086,2.57644,12.02318,2.13052,21.15413,32.86518,27.07817,1.22664
25%,371.96933,135.81108,18.95888,15.09012,4.65956,22.94277,45.73175,33.18492,1.97259
50%,428.61385,178.1548,29.86084,19.79498,7.84864,26.50746,69.95123,38.86832,3.44418
75%,497.28538,240.62243,43.23842,26.73582,9.31209,31.21765,195.9075,66.89249,4.3505
max,654.7633,490.6887,66.52579,54.52096,37.4432,41.70995,376.1217,100.5429,7.47058


In [322]:
# Index labels ('id' values) of the rows in `nutrition_random_10`, as a NumPy array.
nutrition_random_10.index.to_numpy()

array([  8600,  16849,  26615,  27819,  30794,  75672, 142951, 158429,
       177497, 216928])

In [333]:
#  ------ Build and clean the nutrition frame for the recipes in `user_recipes_fat`

# Boolean mask over `nutritions`: True where the row's 'id' is one of the
# index labels of `user_recipes_fat`.
id_overlap_mask = nutritions['id'].isin(user_recipes_fat.index)

# Long-format nutrition rows belonging to those ids.
nutritions_filt = nutritions.loc[id_overlap_mask]

# Wide table: one row per 'id', one column per 'name', cell = mean 'amount'
# (pivot_table's default aggregation). Scalar keys give a flat 'id' index and
# flat 'name' columns directly, so neither a reset_index/set_index round trip
# nor a droplevel on the columns is required.
nutrition_user_recipes_fat = nutritions_filt.pivot_table(
    index='id',
    columns='name',
    values='amount',
).dropna()  # keep only ids that have a value in every column

In [334]:
# Keep only the selected nutrient columns, in this display order.
# NOTE: rebinds `new_df`, which an earlier cell also assigns.
new_df = nutrition_user_recipes_fat.loc[:, [
    'Calories',
    'Calories from Fat',
    'Carbohydrates',
    'Fat',
    'Saturated Fat',
    'Protein',
    'Calcium',
    'Magnesium',
    'Iron',
]]

In [335]:
# Summary statistics for the selected subset of columns.
new_df.describe()

name,Calories,Calories from Fat,Carbohydrates,Fat,Saturated Fat,Protein,Calcium,Magnesium,Iron
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,563.83977,334.64292,24.81982,37.18255,15.10086,32.44829,144.61405,62.11656,3.07888
std,92.24917,14.74709,16.54758,1.63857,5.02136,10.22158,126.89385,32.62118,1.33454
min,390.7198,315.5125,1.27936,35.05695,5.39091,14.9787,22.91706,21.99075,0.58999
25%,500.82482,321.0012,13.55052,35.6668,12.2637,24.81829,57.0225,44.04216,2.2158
50%,549.52315,335.2659,24.81457,37.25176,14.09328,32.584,112.8676,54.51962,3.00261
75%,606.45907,347.85147,30.89026,38.65017,18.68514,38.42296,159.3108,70.92839,4.16262
max,754.5421,356.9908,68.65463,39.66564,22.90133,51.97705,448.9153,148.3127,5.45481


In [None]:
#  ------ Build and clean the nutrition frame for the first ten entries of `result_w_filter_10_fat`

# Boolean mask over `nutritions`: True where the row's 'id' is one of the first
# ten index labels of `result_w_filter_10_fat` (presumably the top ten results
# of the Jaccard-based recommender, judging by the target name - confirm).
id_overlap_mask = nutritions['id'].isin(result_w_filter_10_fat.index[:10])

# Long-format nutrition rows belonging to those ids.
nutritions_filt = nutritions.loc[id_overlap_mask]

# Wide table: one row per 'id', one column per 'name', cell = mean 'amount'
# (pivot_table's default aggregation). Scalar keys give a flat 'id' index and
# flat 'name' columns directly, so neither a reset_index/set_index round trip
# nor a droplevel on the columns is required.
nutrition_jaccard_fat = nutritions_filt.pivot_table(
    index='id',
    columns='name',
    values='amount',
).dropna()  # keep only ids that have a value in every column