In [1]:
import json

import numpy as np
import pandas as pd
import pymongo
import scipy
from bson import ObjectId, json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient as Connection
from scipy import sparse, spatial
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# scientific notation disabled form smaller numbers
pd.options.display.float_format = '{:.5f}'.format

# alles resultate anzeigen und nicht nur das letzte
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# display multiple outputs in one row
import pandas as pd
import numpy as np
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [3]:
connection = Connection()
db = connection.recipe_db
input_data = db.recipes_test_copy

data = json.loads(json_util.dumps(input_data.find()))
#norm_data = json_normalize(data)

In [4]:
category = json_normalize(data, record_path='categories',
                          meta=['id', 'name'],  record_prefix='cat_')

In [5]:
# category['cat_name'].loc["Health"]

cat_health = category[category['cat_name'].str.contains("Health")]

In [6]:
cat_health = cat_health.drop_duplicates(subset='id', keep='first')

In [7]:
cat_health.id.values

array([85452, 8847, 236609, 53729, 45833, 30522, 75861, 16362, 16723,
       8665, 92528, 51283, 220854, 16715, 12072, 8580, 8842, 90089, 8571,
       70163, 15559, 73757, 14713, 142614, 93666, 136476, 19125, 46982,
       22970, 14526, 17048, 20415, 84109, 11789, 14537, 17511, 216688,
       89321, 51997, 34689, 73208, 9025, 8501, 21256, 24372, 16611,
       103144, 11669, 50658, 16160, 218769, 25418, 62423, 18465, 54030,
       17407, 43719, 17784, 222134, 21430, 79521, 165783, 15486, 44816,
       81310, 14751, 17522, 50726, 195083, 84043, 31780, 22246, 17715,
       12043, 8606, 20744, 86775, 106030, 8722, 19880, 60111, 25184,
       13199, 125658, 46819, 11915, 42657, 25939, 218929, 230857, 15641,
       22629, 167195, 215012, 8650, 92486, 78131, 81441, 11710, 9011,
       8601, 21518, 41998, 13999, 87163, 43926, 31813, 14722, 215436,
       217151, 17868, 151593, 11777, 85158, 13995, 74345, 36124, 11691,
       91894, 221256, 201964, 40061, 80686, 217186, 8844, 151620, 22765,
   

In [8]:
ingredients = json_normalize(data, record_path='ingredients',
                             meta='id', record_prefix='ingredients_', errors='ignore')

ingredients = pd.DataFrame(ingredients)

nutritions = json_normalize(data, record_path='nutritions',
                            meta=['id', 'prep_time', 'rating', 'rating_count', 'ready_in_time', 'review_count'])

In [9]:
# data cleansing

# schmeiss alle zutaten raus, die weniger als 5 mal verwendet werden
ingredients_filt = ingredients.groupby(
    'ingredients_id').filter(lambda x: len(x) >= 5)

# drop alls rows with ingredients_id == 0
ingredients_filt = ingredients_filt[ingredients_filt.ingredients_id != 0]

# drop all duplicate
ingredients_filt = ingredients_filt.drop_duplicates(keep='first')

In [10]:
id_list = [215014, 8669, 16700, 16354, 12720, 8652, 8887, 51283, 45954, 213742, 14595,
           219164, 16348, 143082, 8665, 11758, 223042, 236609, 8778, 65896, 24264, 11679, 141678, 9023]

recipe_db = pd.get_dummies(ingredients_filt['ingredients_id']).groupby(
    ingredients_filt['id']).apply(max)

In [11]:
# filtere Rezepte aus den Metadaten anhand ihrerr Zutatenmenge raus.
not_wanted_recipes = recipe_db[recipe_db.mask(
    recipe_db == 0).count(axis=1) <= 4]

mask = nutritions['id'].isin(not_wanted_recipes.index.values)

nutritions_filt = nutritions.loc[~mask]

nutrition_db = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db.set_index('id', inplace=True)
# remove multiindex 'amount'

nutrition_db.columns = nutrition_db.columns.droplevel(0)

nutrition_db = nutrition_db.dropna()

In [12]:
ingredients_filt

Unnamed: 0,ingredients_id,ingredients_name,ingredients_grams,ingredients_type,id
0,16157,10 g butter,11.36000,Normal,59661
1,4405,40 g sliced green onions,41.80000,Normal,59661
2,4342,"1-1/2 cloves garlic, minced",4.80000,Normal,59661
3,4520,3/4 (10 ounce) package frozen chopped spinach ...,227.20000,Normal,59661
4,16243,180 g ricotta cheese,182.40001,Normal,59661
...,...,...,...,...,...
61195,20551,1 g ground cumin,1.05000,Normal,244188
61196,16403,0.8 g dried oregano,0.75000,Normal,244188
61197,16406,1 g freshly ground black pepper,1.05000,Normal,244188
61198,6307,15 ml olive oil,13.50000,Normal,244188


In [13]:
nutritions_health = nutrition_db[nutrition_db.index.isin(cat_health.id.values)]
asd = ingredients_filt.groupby(['id']).sum()
ingredients_filt
asd

Unnamed: 0,ingredients_id,ingredients_name,ingredients_grams,ingredients_type,id
0,16157,10 g butter,11.36000,Normal,59661
1,4405,40 g sliced green onions,41.80000,Normal,59661
2,4342,"1-1/2 cloves garlic, minced",4.80000,Normal,59661
3,4520,3/4 (10 ounce) package frozen chopped spinach ...,227.20000,Normal,59661
4,16243,180 g ricotta cheese,182.40001,Normal,59661
...,...,...,...,...,...
61195,20551,1 g ground cumin,1.05000,Normal,244188
61196,16403,0.8 g dried oregano,0.75000,Normal,244188
61197,16406,1 g freshly ground black pepper,1.05000,Normal,244188
61198,6307,15 ml olive oil,13.50000,Normal,244188


Unnamed: 0_level_0,ingredients_id,ingredients_grams
id,Unnamed: 1_level_1,Unnamed: 2_level_1
6806,76326,460.47999
6900,44576,184.35112
7198,137129,1234.50889
8493,78076,1280.50000
8494,99483,1671.56307
...,...,...
261608,69945,1546.33200
263611,164749,1175.86748
263744,62926,758.57852
263799,76888,785.59180


In [14]:
out1 = asd.hist(column='ingredients_grams', bins=200)
out1

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x123ddf750>]],
      dtype=object)

In [15]:
# nutritions_health
nutritions_health = nutritions_health.sort_values(by='Calories from Fat')
nutritions_health

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
14725,24.03237,67.54216,1.32300,18.46258,0.00000,0.93600,0.14700,4.35167,0.35016,8.68500,0.54284,152.89170,0.75050,0.01992,337.39490,15.72450,0.03643,285.30000,0.06808,18.40447
165532,12.98185,114.93330,2.32504,25.17798,0.04333,0.44817,0.25834,72.43833,1.35393,9.34270,1.74692,48.26603,2.29589,0.06748,66.40393,0.18982,0.17956,67.23751,0.05428,1.79992
35803,31.16767,61.83333,5.10690,5.12555,85.14999,1.04583,0.56743,8.63933,1.70515,21.50467,1.45319,174.52000,9.76845,0.14089,99.96799,1.55697,0.03980,283.92000,0.13160,44.87166
24372,8.92521,83.46584,5.70338,17.41987,0.00000,2.88225,0.63371,29.21729,1.03181,34.13521,2.55114,115.18500,3.52330,0.10776,196.03170,1.51910,0.14826,1.80583,0.10881,0.01050
12063,5.68380,65.77143,5.71100,12.74179,6.20000,0.45008,0.63456,31.28333,0.80872,3.87902,1.38326,20.09034,1.93133,0.11705,84.24599,0.05783,0.13187,8.11667,0.00972,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8754,45.55117,832.96520,330.41340,39.05152,254.01150,0.00000,36.71260,15.89833,2.95328,99.42744,55.00392,876.17760,82.75491,10.55735,276.51440,36.19400,0.25022,329.42120,2.10446,0.00000
40453,38.90650,579.22700,371.33980,3.56373,270.38750,0.37550,41.25998,8.98050,2.50524,50.08450,20.74376,561.53150,45.63825,13.48892,2024.52800,0.57236,0.14701,384.81400,0.67441,1.73025
232227,275.32630,617.48190,379.45950,30.78222,136.35440,2.35200,42.16217,90.80875,2.87025,47.59312,11.61752,470.46000,28.36177,24.26494,721.00810,6.00081,0.36326,1364.04100,0.31606,1.18469
125658,541.87190,797.30390,467.06560,33.23262,171.83180,4.34834,51.89617,43.92533,2.66490,89.94234,19.34673,510.91720,49.77515,24.63747,1013.36600,2.86032,0.18634,1492.52700,0.55105,17.25710


In [16]:
recipe_db_filt = recipe_db[recipe_db.mask(recipe_db == 0).count(axis=1) >= 5]

user_nutrition = nutrition_db[nutrition_db.index.isin(id_list)]

user_recipes = recipe_db_filt[recipe_db_filt.index.isin(id_list)]

In [17]:
nutrition_db

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
6806,90.66800,285.29200,99.71441,34.16718,28.97886,1.44297,11.07938,95.69036,2.29582,16.86080,6.63480,104.79320,11.28243,5.07081,623.46150,2.86116,0.39179,258.77250,0.07336,1.83020
6900,117.79860,105.27020,21.33330,17.98829,1.62667,0.56557,2.37037,39.16360,1.14346,7.30480,1.82102,54.07343,2.84282,0.59495,385.75980,1.72141,0.17169,116.75390,0.01709,0.04505
7198,244.16520,731.50920,325.65230,73.29574,88.69389,4.64147,36.18359,249.14210,5.76566,52.89296,13.51325,509.34610,27.58195,11.50691,1660.20300,2.83424,0.89150,699.95980,0.36319,67.32533
8493,415.52450,454.28300,177.75310,23.83120,203.86850,1.99500,19.75034,63.05025,3.11258,54.35775,23.43780,509.41500,44.07647,9.96063,1107.91100,3.45304,0.17406,578.21420,0.64563,1.91250
8494,58.08641,834.44480,514.68470,4.76954,283.75000,0.63600,57.18719,33.44816,3.72336,84.44925,39.18463,807.96980,71.10712,16.35430,567.97140,2.08741,0.02921,619.52510,1.37244,20.17546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261608,115.27280,324.28000,83.41859,53.91319,0.00000,13.90981,9.26873,233.83750,3.83645,90.64800,4.47335,1234.76200,12.64116,1.20919,1075.47200,5.41034,0.17184,1102.67700,1.27989,137.70680
263611,415.72130,612.45070,291.35620,52.63378,74.50077,3.43897,32.37291,112.82310,3.88642,32.95200,7.44909,214.92120,25.41999,14.22969,1486.61300,4.82439,0.61472,957.93720,0.17557,9.86299
263744,53.87948,297.14870,172.23010,8.85126,68.46000,1.81759,19.13668,42.33592,2.79045,40.50691,8.94936,465.41230,22.36788,6.61654,981.82850,4.86679,0.09926,37.37281,0.41305,24.50144
263799,243.66410,238.61440,140.21900,11.09269,47.81817,2.63082,15.57989,27.52521,1.27462,28.15549,4.92735,401.64490,14.76595,7.06539,734.54530,4.87067,0.09536,554.07250,0.38554,70.95989


In [18]:
# create a function called times100
def portion_to_6(x):
    a = x / 4
    a = a * 2
    x = x + a
    return x

In [19]:
ingredients_filt['ingredients_grams'] = ingredients_filt['ingredients_grams'].apply(
    portion_to_6)

In [20]:
ingredients_filt

Unnamed: 0,ingredients_id,ingredients_name,ingredients_grams,ingredients_type,id
0,16157,10 g butter,17.04000,Normal,59661
1,4405,40 g sliced green onions,62.70000,Normal,59661
2,4342,"1-1/2 cloves garlic, minced",7.20000,Normal,59661
3,4520,3/4 (10 ounce) package frozen chopped spinach ...,340.80000,Normal,59661
4,16243,180 g ricotta cheese,273.60001,Normal,59661
...,...,...,...,...,...
61195,20551,1 g ground cumin,1.57500,Normal,244188
61196,16403,0.8 g dried oregano,1.12500,Normal,244188
61197,16406,1 g freshly ground black pepper,1.57500,Normal,244188
61198,6307,15 ml olive oil,20.25000,Normal,244188


In [21]:
asd2 = ingredients_filt.groupby(['id']).sum()
asd2
recipe_summed_weight = asd2.drop(columns='ingredients_id')
recipe_summed_weight.describe()

Unnamed: 0_level_0,ingredients_id,ingredients_grams
id,Unnamed: 1_level_1,Unnamed: 2_level_1
6806,76326,690.71998
6900,44576,276.52667
7198,137129,1851.76333
8493,78076,1920.75000
8494,99483,2507.34460
...,...,...
261608,69945,2319.49800
263611,164749,1763.80122
263744,62926,1137.86779
263799,76888,1178.38770


Unnamed: 0,ingredients_grams
count,6305.0
mean,1688.27048
std,850.08323
min,7.7
25%,1121.00012
50%,1556.73721
75%,2094.94062
max,11862.73


In [22]:
test = nutrition_db.join(recipe_summed_weight, how='inner')
test

Unnamed: 0_level_0,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,...,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C,ingredients_grams
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6806,90.66800,285.29200,99.71441,34.16718,28.97886,1.44297,11.07938,95.69036,2.29582,16.86080,...,104.79320,11.28243,5.07081,623.46150,2.86116,0.39179,258.77250,0.07336,1.83020,690.71998
6900,117.79860,105.27020,21.33330,17.98829,1.62667,0.56557,2.37037,39.16360,1.14346,7.30480,...,54.07343,2.84282,0.59495,385.75980,1.72141,0.17169,116.75390,0.01709,0.04505,276.52667
7198,244.16520,731.50920,325.65230,73.29574,88.69389,4.64147,36.18359,249.14210,5.76566,52.89296,...,509.34610,27.58195,11.50691,1660.20300,2.83424,0.89150,699.95980,0.36319,67.32533,1851.76333
8493,415.52450,454.28300,177.75310,23.83120,203.86850,1.99500,19.75034,63.05025,3.11258,54.35775,...,509.41500,44.07647,9.96063,1107.91100,3.45304,0.17406,578.21420,0.64563,1.91250,1920.75000
8494,58.08641,834.44480,514.68470,4.76954,283.75000,0.63600,57.18719,33.44816,3.72336,84.44925,...,807.96980,71.10712,16.35430,567.97140,2.08741,0.02921,619.52510,1.37244,20.17546,2507.34460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261608,115.27280,324.28000,83.41859,53.91319,0.00000,13.90981,9.26873,233.83750,3.83645,90.64800,...,1234.76200,12.64116,1.20919,1075.47200,5.41034,0.17184,1102.67700,1.27989,137.70680,2319.49800
263611,415.72130,612.45070,291.35620,52.63378,74.50077,3.43897,32.37291,112.82310,3.88642,32.95200,...,214.92120,25.41999,14.22969,1486.61300,4.82439,0.61472,957.93720,0.17557,9.86299,1763.80122
263744,53.87948,297.14870,172.23010,8.85126,68.46000,1.81759,19.13668,42.33592,2.79045,40.50691,...,465.41230,22.36788,6.61654,981.82850,4.86679,0.09926,37.37281,0.41305,24.50144,1137.86779
263799,243.66410,238.61440,140.21900,11.09269,47.81817,2.63082,15.57989,27.52521,1.27462,28.15549,...,401.64490,14.76595,7.06539,734.54530,4.87067,0.09536,554.07250,0.38554,70.95989,1178.38770


In [23]:
def capitalizer(x): return x.upper()


# Cholesterol = mg
# Sodium = mg
# Folate = mcg
# Magnesium = mg
# Vitamin B6 = mg
# Niacin Equivalents = mg
# Thiamin = mg
# Iron = mg
# Calcium = mg
# Vitamin C = mg
# Potassium = mg
# Vitamin A - IU = 1IU = 0.6 mcg
test['Cholesterol'] = test['Cholesterol'].apply(lambda x: x / 1000)
test['Sodium'] = test['Sodium'].apply(lambda x: x / 1000)
test['Magnesium'] = test['Magnesium'].apply(lambda x: x / 1000)
test['Vitamin B6'] = test['Vitamin B6'].apply(lambda x: x / 1000)
test['Niacin Equivalents'] = test['Niacin Equivalents'].apply(lambda x: x / 1000)
test['Thiamin'] = test['Thiamin'].apply(lambda x: x / 1000)
test['Iron'] = test['Iron'].apply(lambda x: x / 1000)
test['Calcium'] = test['Calcium'].apply(lambda x: x / 1000)
test['Vitamin C'] = test['Vitamin C'].apply(lambda x: x / 1000)
test['Potassium'] = test['Potassium'].apply(lambda x: x / 1000)

test['Folate'] = test['Folate'].apply(lambda x: x / 1000000)
test['Vitamin A - IU'] = test['Vitamin A - IU'].apply(lambda x: x * 0,6/ 1000000)


In [24]:
test

Unnamed: 0_level_0,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,...,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C,ingredients_grams
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6806,0.09067,285.29200,99.71441,34.16718,0.02898,1.44297,11.07938,0.00010,0.00230,0.01686,...,0.10479,11.28243,5.07081,0.62346,2.86116,0.00039,0.00000,0.00007,0.00183,690.71998
6900,0.11780,105.27020,21.33330,17.98829,0.00163,0.56557,2.37037,0.00004,0.00114,0.00730,...,0.05407,2.84282,0.59495,0.38576,1.72141,0.00017,0.00000,0.00002,0.00005,276.52667
7198,0.24417,731.50920,325.65230,73.29574,0.08869,4.64147,36.18359,0.00025,0.00577,0.05289,...,0.50935,27.58195,11.50691,1.66020,2.83424,0.00089,0.00000,0.00036,0.06733,1851.76333
8493,0.41552,454.28300,177.75310,23.83120,0.20387,1.99500,19.75034,0.00006,0.00311,0.05436,...,0.50942,44.07647,9.96063,1.10791,3.45304,0.00017,0.00000,0.00065,0.00191,1920.75000
8494,0.05809,834.44480,514.68470,4.76954,0.28375,0.63600,57.18719,0.00003,0.00372,0.08445,...,0.80797,71.10712,16.35430,0.56797,2.08741,0.00003,0.00000,0.00137,0.02018,2507.34460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261608,0.11527,324.28000,83.41859,53.91319,0.00000,13.90981,9.26873,0.00023,0.00384,0.09065,...,1.23476,12.64116,1.20919,1.07547,5.41034,0.00017,0.00000,0.00128,0.13771,2319.49800
263611,0.41572,612.45070,291.35620,52.63378,0.07450,3.43897,32.37291,0.00011,0.00389,0.03295,...,0.21492,25.41999,14.22969,1.48661,4.82439,0.00061,0.00000,0.00018,0.00986,1763.80122
263744,0.05388,297.14870,172.23010,8.85126,0.06846,1.81759,19.13668,0.00004,0.00279,0.04051,...,0.46541,22.36788,6.61654,0.98183,4.86679,0.00010,0.00000,0.00041,0.02450,1137.86779
263799,0.24366,238.61440,140.21900,11.09269,0.04782,2.63082,15.57989,0.00003,0.00127,0.02816,...,0.40164,14.76595,7.06539,0.73455,4.87067,0.00010,0.00000,0.00039,0.07096,1178.38770


In [25]:
percent_test = test

In [26]:
percent_test[0:16]

Unnamed: 0_level_0,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,...,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C,ingredients_grams
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6806,0.09067,285.292,99.71441,34.16718,0.02898,1.44297,11.07938,0.0001,0.0023,0.01686,...,0.10479,11.28243,5.07081,0.62346,2.86116,0.00039,0.0,7e-05,0.00183,690.71998
6900,0.1178,105.2702,21.3333,17.98829,0.00163,0.56557,2.37037,4e-05,0.00114,0.0073,...,0.05407,2.84282,0.59495,0.38576,1.72141,0.00017,0.0,2e-05,5e-05,276.52667
7198,0.24417,731.5092,325.6523,73.29574,0.08869,4.64147,36.18359,0.00025,0.00577,0.05289,...,0.50935,27.58195,11.50691,1.6602,2.83424,0.00089,0.0,0.00036,0.06733,1851.76333
8493,0.41552,454.283,177.7531,23.8312,0.20387,1.995,19.75034,6e-05,0.00311,0.05436,...,0.50942,44.07647,9.96063,1.10791,3.45304,0.00017,0.0,0.00065,0.00191,1920.75
8494,0.05809,834.4448,514.6847,4.76954,0.28375,0.636,57.18719,3e-05,0.00372,0.08445,...,0.80797,71.10712,16.3543,0.56797,2.08741,3e-05,0.0,0.00137,0.02018,2507.3446
8495,0.37901,418.7534,174.2374,12.62089,0.12359,0.75239,19.35971,3e-05,0.00189,0.06048,...,0.45704,46.06678,10.03598,0.93158,1.42275,0.0003,0.0,0.00083,0.01173,1225.89375
8497,0.29699,568.1025,261.8146,34.10004,0.14045,1.19467,29.09051,7e-05,0.00287,0.06228,...,0.45927,40.69365,16.40447,1.17082,4.22334,0.00033,0.0,0.00073,0.00149,1581.60005
8498,0.08786,676.1752,516.6072,19.00183,0.06088,0.31272,57.40079,2e-05,0.00434,0.03188,...,0.77512,23.26001,7.79435,0.717,8.45308,5e-05,0.0,0.00031,0.03702,1816.165
8500,0.10103,506.5928,298.1711,20.03668,0.16616,4.55114,33.13012,5e-05,0.0034,0.06948,...,0.42226,34.40512,4.66727,0.25007,2.36547,0.00039,0.0,0.00063,0.00057,1272.8625
8501,0.02638,125.2633,18.23897,10.70957,0.03358,0.65665,2.02655,2e-05,0.00112,0.01815,...,0.14716,15.00359,0.53461,0.26284,0.80519,0.00014,0.0,0.00027,0.00023,505.05875


In [27]:
# create a function called times100
# def percent_to_1000(x):
#     a = x * 100 / x.ingredients_grams
#     if x == x.ingredients_grams:
#         return
# #     a = a * 2
# #     x = x + a
#     return a

# asd = percent_test.apply(percent_to_1000)

In [28]:
# # create a function called times100
# def percent_to_1000(x):
#     a = x * 100 / x.ingredients_grams
# #     if x == x.ingredients_grams:
# #         return
# #     a = a * 2
# #     x = x + a
#     return a
# #     print(x.ingredients_grams)

# asd = percent_test.apply(percent_to_1000, axis = 1)

In [29]:
# percent_test[:1].T
# asd[:1].T


In [30]:
# out1 = asd2.hist(column='ingredients_grams', bins=200)
# out1

In [31]:
# remove recipes which are missing in nutrition df from the ingredients tecipes dataframe
mask = recipe_db_filt.index.isin(nutrition_db.index.values)
recipe_db = recipe_db_filt.loc[mask]

recipe_db

Unnamed: 0_level_0,111,126,257,388,443,445,578,615,629,631,...,24850,24865,25518,25522,26269,26706,26934,27343,27388,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6806,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6900,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8493,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8494,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261608,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
263611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
263744,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
263799,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# jaccard Distanz - rezept zu rezept vergleich

result_array = cdist(user_recipes, recipe_db, 'jaccard')
jacc_distance = pd.DataFrame(
    result_array, columns=recipe_db.index.values, index=user_recipes.index.values)

jaccard_distance_sum = pd.DataFrame(
    jacc_distance.sum(), columns=['jaccard_distance_sum'])
jaccard_distance_sum = jaccard_distance_sum.sort_values(
    by='jaccard_distance_sum')

jaccard_distance_sum

Unnamed: 0,jaccard_distance_sum
102235,19.33806
11732,19.37608
25927,19.45397
16416,19.53403
8887,19.57872
...,...
15699,23.00000
82685,23.00000
236054,23.00000
24878,23.00000


In [33]:
# Euklidische Distanz - rezept zu rezept vergelich

gesund = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    1,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]

result_array = cdist(user_nutrition, nutrition_db, 'minkowski', p=2)
# result_array = cdist(user_nutrition, nutrition_db,'minkowski', p = 2)
euclid_distance = pd.DataFrame(
    result_array, columns=nutrition_db.index.values, index=user_nutrition.index.values)

euclid_distance_sum = pd.DataFrame(
    euclid_distance.sum(), columns=['euclid_distance_sum'])
euclid_distance_sum = euclid_distance_sum.sort_values(by='euclid_distance_sum')
euclid_distance_sum

Unnamed: 0,euclid_distance_sum
72845,16845.99911
180791,16895.05504
213742,16972.68289
8621,16977.63899
217080,17040.46949
...,...
89338,845812.46986
58942,848915.34215
132097,875255.57055
216035,915192.05417


In [34]:
# outliers = euclid_distance_sum[euclid_distance_sum['ingredients_grams'] >
#                                euclid_distance_sum['ingredients_grams'].mean() + 1 * euclid_distance_sum['ingredients_grams'].std()]

# euclid_distance_sum_no_outliers.drop(outliers.index, inplace=True)

# jaccard_distance_sum_no_outliers = jaccard_distance_sum_no_outliers[jaccard_distance_sum_no_outliers.index.isin(
#     euclid_distance_sum_no_outliers.index)]

In [35]:
# Euklidische Distanz - rezept zu rezept vergelich
# minkowski(u, v, p=2, w=None)
# rec = []
# rec = {
#     "Calcium":[1],
#     "Calories":[1],
#     "Calories from Fat":[1],
#     "Carbohydrates":[1],
#     "Cholesterol":[1],
#     "Dietary Fiber":[1],
#     "Fat":[1],
#     "Folate":[1],
#     "Iron":[1],
#     "Magnesium":[1],
#     "Niacin Equivalents":[1],
#     "Potassium":[1],
#     "Protein":[1],
#     "Saturated Fat":[1],
#     "Sodium":[1],
#     "Sugars":[1],
#     "Thiamin":[1],
#     "Vitamin A - IU":[1],
#     "Vitamin B6":[1],
#     "Vitamin C":[1]}

rec_2 = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    1,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]

result_array = cdist(user_nutrition, nutrition_db, 'minkowski', p=2, w=rec_2)
# result_array = cdist(user_nutrition, nutrition_db,'minkowski', p = 2)
euclid_distance = pd.DataFrame(
    result_array, columns=nutrition_db.index.values, index=user_nutrition.index.values)

euclid_distance_sum = pd.DataFrame(
    euclid_distance.sum(), columns=['euclid_distance_sum'])
euclid_distance_sum = euclid_distance_sum.sort_values(by='euclid_distance_sum')
euclid_distance_sum

Unnamed: 0,euclid_distance_sum
72845,16845.99911
180791,16895.05504
213742,16972.68289
8621,16977.63899
217080,17040.46949
...,...
89338,845812.46986
58942,848915.34215
132097,875255.57055
216035,915192.05417


In [36]:
euclid_distance_sum_no_outliers = euclid_distance_sum
jaccard_distance_sum_no_outliers = jaccard_distance_sum

In [37]:
outliers = euclid_distance_sum[euclid_distance_sum['euclid_distance_sum'] >
                               euclid_distance_sum['euclid_distance_sum'].mean() + 1 * euclid_distance_sum['euclid_distance_sum'].std()]

euclid_distance_sum_no_outliers.drop(outliers.index, inplace=True)

jaccard_distance_sum_no_outliers = jaccard_distance_sum_no_outliers[jaccard_distance_sum_no_outliers.index.isin(
    euclid_distance_sum_no_outliers.index)]

In [38]:
jaccard_distance_sum_no_outliers

Unnamed: 0,jaccard_distance_sum
102235,19.33806
11732,19.37608
25927,19.45397
16416,19.53403
8887,19.57872
...,...
15699,23.00000
82685,23.00000
236054,23.00000
24878,23.00000


In [39]:
euclid_distance_sum_no_outliers.describe()
jaccard_distance_sum_no_outliers.describe()

Unnamed: 0,euclid_distance_sum
count,5434.0
mean,29997.93034
std,15650.09376
min,16845.99911
25%,20858.03137
50%,24562.11088
75%,31162.62658
max,105596.43135


Unnamed: 0,jaccard_distance_sum
count,5434.0
mean,21.47487
std,0.62816
min,19.33806
25%,21.03498
50%,21.47187
75%,21.91128
max,23.0


In [40]:
# copy original data so it wont be altered
euclid_distance_sum_w_outliers = euclid_distance_sum
jaccard_distance_sum_w_outliers = jaccard_distance_sum

In [41]:
# copy original data back
euclid_distance_sum = euclid_distance_sum_w_outliers
jaccard_distance_sum = jaccard_distance_sum_w_outliers

In [42]:
# copy data with outliers
euclid_distance_sum = euclid_distance_sum_no_outliers
jaccard_distance_sum = jaccard_distance_sum_no_outliers

In [43]:
euclid_distance_sum.describe()
jaccard_distance_sum.describe()

Unnamed: 0,euclid_distance_sum
count,5434.0
mean,29997.93034
std,15650.09376
min,16845.99911
25%,20858.03137
50%,24562.11088
75%,31162.62658
max,105596.43135


Unnamed: 0,jaccard_distance_sum
count,5434.0
mean,21.47487
std,0.62816
min,19.33806
25%,21.03498
50%,21.47187
75%,21.91128
max,23.0


In [44]:
# minmax nomralisierung
from sklearn import preprocessing

nut_res = euclid_distance_sum
x = nut_res.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
nut_res = pd.DataFrame(x_scaled, columns=nut_res.columns, index=nut_res.index)


jac_res = jaccard_distance_sum
z = jac_res.values
min_max_scaler = preprocessing.MinMaxScaler()
z_scaled = min_max_scaler.fit_transform(z)
jac_res = pd.DataFrame(z_scaled, columns=jac_res.columns, index=jac_res.index)

min_max_nut_recipe = pd.DataFrame(
    nut_res.euclid_distance_sum + jac_res.jaccard_distance_sum, columns=jac_res.columns, index=jac_res.index)

min_max_nut_recipe = min_max_nut_recipe.sort_values(by='jaccard_distance_sum')

min_max_nut_recipe.rename(
    columns={'jaccard_distance_sum': 'min_max'}, inplace=True)

In [45]:
# robust-normalisierung
from sklearn import preprocessing

nut_res_robust = euclid_distance_sum
x = nut_res_robust.values
robust_scaler = preprocessing.RobustScaler()
x_scaled = robust_scaler.fit_transform(x)
nut_res_robust = pd.DataFrame(
    x_scaled, columns=nut_res_robust.columns, index=nut_res_robust.index)

jac_res_robust = jaccard_distance_sum
z = jac_res_robust.values
robust_scaler = preprocessing.RobustScaler()
z_scaled = robust_scaler.fit_transform(z)

jac_res_robust = pd.DataFrame(
    z_scaled, columns=jac_res_robust.columns, index=jac_res_robust.index)


robust_scaling_nut_recipe = pd.DataFrame(nut_res_robust.euclid_distance_sum +
                                         jac_res_robust.jaccard_distance_sum, columns=jac_res_robust.columns, index=jac_res_robust.index)

robust_scaling_nut_recipe = robust_scaling_nut_recipe.sort_values(
    by='jaccard_distance_sum')

robust_scaling_nut_recipe.rename(
    columns={'jaccard_distance_sum': 'robust_scaling'}, inplace=True)

In [46]:
# z-score-mormalisierung
z_min = (euclid_distance_sum - euclid_distance_sum.mean()) / \
    euclid_distance_sum.std()

z_jac = (jaccard_distance_sum - jaccard_distance_sum.mean()) / \
    jaccard_distance_sum.std()

z_score_nut_recipe = pd.DataFrame(z_min.euclid_distance_sum + z_jac.jaccard_distance_sum,
                                  columns=euclid_distance_sum.columns, index=z_min.index)

z_score_nut_recipe = z_score_nut_recipe.sort_values(by='euclid_distance_sum')

z_score_nut_recipe.rename(
    columns={'euclid_distance_sum': 'z_score'}, inplace=True)

In [47]:
display(z_score_nut_recipe, robust_scaling_nut_recipe, min_max_nut_recipe)

Unnamed: 0,z_score
25927,-3.94362
102235,-3.78660
16416,-3.77637
51372,-3.76155
8887,-3.72658
...,...
14670,5.79018
49769,5.81864
229874,5.95585
178054,6.10640


Unnamed: 0,robust_scaling
25927,-2.87850
51372,-2.78256
16416,-2.72670
8887,-2.70813
223042,-2.67911
...,...
14670,8.12104
160197,8.41404
20082,8.43923
178054,8.47966


Unnamed: 0,min_max
25927,0.05174
102235,0.08032
16416,0.08062
51372,0.08277
8887,0.08906
...,...
14670,1.74553
49769,1.74712
229874,1.77192
178054,1.80056
