In [1]:
import json

import numpy as np
import pandas as pd
import pymongo
import scipy
from bson import ObjectId, json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient as Connection
from scipy import sparse, spatial
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# scientific notation disabled form smaller numbers
pd.options.display.float_format = '{:.5f}'.format

# alles resultate anzeigen und nicht nur das letzte
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"




In [2]:
# display multiple outputs in one row
import pandas as pd
import numpy as np
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [3]:
connection = Connection()
db = connection.recipe_db
input_data = db.recipes_test_copy

data = json.loads(json_util.dumps(input_data.find()))

In [4]:
ing = pd.DataFrame(json_normalize(data, record_path='ingredients',
                             meta='id', record_prefix='ingredients_', errors='ignore'))


nutritions = pd.DataFrame(json_normalize(data, record_path='nutritions',
                            meta=['id', 'prep_time', 'rating', 'rating_count', 'ready_in_time', 'review_count']))


In [5]:
#  ------  erstellung und data cleansing - Ingredients

# schmeiss alle zutaten raus, die weniger als 5 mal verwendet werden
# setze multiinde auf 'id' und 'ingredients_id'
ingredients = ing.set_index(['id', 'ingredients_id'])

# filtere alle Zutaten samt ihrer rezepte id, die weniger gleich 5 mal vorkommen
ingredients_eqles_5_ing = ingredients.groupby(
    'ingredients_id').filter(lambda x: len(x) <= 5)

# droppe alle rezepte, die eine Zutate besitzen, die weniger gleich 5 Mal vorkommen
ingredients_filt = ingredients.drop(ingredients_eqles_5_ing.index.get_level_values('id').values, level=0)

# drop alls rows with ingredients_id == 0
ingredients_eqal_zero = ingredients_filt[ingredients_filt.index.get_level_values('ingredients_id') == 0]
ingredients_filt = ingredients_filt.drop(ingredients_eqal_zero.index.get_level_values('id').values, level=0)

#ingredients_filt

In [6]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(ingredients_filt.index.get_level_values('id').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_db = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_db.columns = nutrition_db.columns.droplevel(0)

# entferne alle NA
nutrition_db = nutrition_db.dropna()

# gleiche nochmals die ids der beiden dataframe nutrition und ingredients ab, da der nutrition dataframe noch NA Werte hat
id_overlap_mask = ingredients_filt.index.get_level_values('id').isin(nutrition_db.index)
ingredients_db = ingredients_filt[id_overlap_mask]

# abgleich ob anzahl der indizes von nutrition und zutaten dataframe gleich sind
nutrition_db.index.nunique()
ingredients_db.index.get_level_values('id').nunique()

4062

4062

In [7]:
id_list = [215014, 8669, 16700, 16354, 12720, 8652, 8887, 51283, 45954, 213742, 14595,
           219164, 16348, 143082, 8665, 11758, 223042, 236609, 8778, 65896, 24264, 11679, 141678, 9023]

ingredients_db.reset_index(inplace=True)

recipe_db = pd.get_dummies(ingredients_db['ingredients_id']).groupby(
    ingredients_db['id']).apply(max)



In [8]:
test = ingredients_db.copy()

In [9]:
# Funktion um das Gewicht der einzelnen Zutaten von vier portionen auf 1 portionen zu erhöhen
def portion_to_1(x):
    a = x / 4
#     a = a * 2
#     x = x + a
    return a


#rechne 
# test['ingredients_grams'] = test['ingredients_grams'].apply(
#     portion_to_1)


In [10]:
# recipe_summed_weight = test.groupby(['id']).sum()
# # asd2
# recipe_summed_weight = recipe_summed_weight.drop(columns='ingredients_id')

# # recipe_summed_weight
# # recipe_summed_weight

# recipe_summed_weight = recipe_summed_weight.sort_values(by='ingredients_grams')

# recipe_summed_weight

In [11]:
# rechne nährwerte auf 800 kcal pro rezept um
def nutrition_to_800(x):
    a = x / x.Calories
    b = a * 800
    return b



In [12]:
nutrition_db_800 = nutrition_db.apply(nutrition_to_800, axis = 1)


In [13]:
user_nutrition = nutrition_db_800[nutrition_db_800.index.isin(id_list)]
user_recipes = recipe_db[recipe_db.index.isin(id_list)]

In [14]:
print(user_recipes.values)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [15]:
user_recipes

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8652,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8665,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8669,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8778,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8887,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11679,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11758,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12720,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16348,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
nutrition_db
nutrition_db_1000

# data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]
# df = pd.DataFrame(data)
# print df

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
6900,117.79860,105.27020,21.33330,17.98829,1.62667,0.56557,2.37037,39.16360,1.14346,7.30480,1.82102,54.07343,2.84282,0.59495,385.75980,1.72141,0.17169,116.75390,0.01709,0.04505
7198,244.16520,731.50920,325.65230,73.29574,88.69389,4.64147,36.18359,249.14210,5.76566,52.89296,13.51325,509.34610,27.58195,11.50691,1660.20300,2.83424,0.89150,699.95980,0.36319,67.32533
8493,415.52450,454.28300,177.75310,23.83120,203.86850,1.99500,19.75034,63.05025,3.11258,54.35775,23.43780,509.41500,44.07647,9.96063,1107.91100,3.45304,0.17406,578.21420,0.64563,1.91250
8494,58.08641,834.44480,514.68470,4.76954,283.75000,0.63600,57.18719,33.44816,3.72336,84.44925,39.18463,807.96980,71.10712,16.35430,567.97140,2.08741,0.02921,619.52510,1.37244,20.17546
8495,379.01410,418.75340,174.23740,12.62089,123.58900,0.75239,19.35971,25.41156,1.89411,60.48306,24.62857,457.03550,46.06678,10.03598,931.58340,1.42275,0.29605,406.88370,0.82640,11.72893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258878,9.30933,189.09670,106.42770,0.12118,70.99000,0.01650,11.82530,5.34667,1.02767,16.89067,8.44069,171.30030,19.15713,3.30546,90.00200,0.04050,0.05192,125.95000,0.24153,0.03000
259870,116.87750,533.78500,254.83060,65.83649,0.00000,12.46288,28.31451,179.44000,4.86495,140.72750,7.50368,2441.57700,11.27299,4.00073,141.41000,16.18104,0.55845,2659.15000,1.63708,135.49670
260193,37.24250,300.08690,43.28884,13.86486,118.04670,1.03600,4.80987,7.15000,1.63545,46.19250,21.66052,476.54750,45.85212,1.41095,1545.54100,4.58523,0.08974,217.95000,0.61491,1.23025
261124,36.90501,287.12290,135.42690,3.59909,96.13005,0.67217,15.04743,18.83792,2.01199,33.50583,16.46910,372.90480,32.54562,3.21560,293.74010,1.22104,0.09603,527.56960,0.57170,9.79475


NameError: name 'nutrition_db_1000' is not defined

In [None]:
#####################  filtere nach ids die nur in der kategorie Main vorkommen. ##################################

In [None]:
category = json_normalize(data, record_path='categories',
                          meta=['id', 'name'],  record_prefix='cat_')

In [None]:
cat_main = category[category['cat_name'].str.contains("Main")]

In [None]:
nutrition_db_1000.Fat.describe()

In [None]:
df2 = pd.DataFrame(data=None, columns=nutrition_db_800.columns, index=None)
# df2 = df2.drop(columns='ingredients_grams')
df2.T

In [None]:
# 19–30 empfohlene nährwerte pro tag #https://www.ncbi.nlm.nih.gov/books/NBK56068/table/summarytables.t4/?report=objectonly
df2.loc['index'] = [
    # "Calcium":[1],
    333.3,
    # "Calories":[1],
    800,
    # "Calories from Fat":[1],
    0,
    # "Carbohydrates":[1],
    43.3,
    # "Cholesterol":[1],
    0,
    # "Dietary Fiber	":[1],
    8.3,
    # "Fat":[1],
    30,
    # "Folate":[1],
    133.3,
    # "Iron":[1],
    2.6,
    # "Magnesium":[1],
    133.3,
    # "Niacin Equivalents":[1],
    5.3,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    18.6,
    # "Saturated Fat":[1],
    0,
    # "Sodium":[1],
    0.5,
    # "Sugars":[1],
    0,
    # "Thiamin":[1],
    0.4,
    # "Vitamin A - IU":[1],
    300,
    # "Vitamin B6":[1],
    0.43,
    # "Vitamin C":[1]}
    30]
# df2 = df2.iloc[1:]
df2


In [None]:
# euklidische distanz für nährwerte
no_weight = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    1,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]

rec_2 = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1000,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    1000,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    100,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    100,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]

result_array = cdist(df2, nutrition_db, 'minkowski', p=2, w=rec_2)
# result_array = cdist(user_nutrition, nutrition_db,'minkowski', p = 2)
euclid_distance = pd.DataFrame(
    result_array, columns=nutrition_db.index.values, index=df2.index.values)

euclid_distance_sum = pd.DataFrame(
    euclid_distance.sum(), columns=['euclid_distance_sum'])
euclid_distance_sum = euclid_distance_sum.sort_values(by='euclid_distance_sum')
euclid_distance_sum

In [None]:
from scipy import stats
asd = euclid_distance_sum[(np.abs(stats.zscore(euclid_distance_sum)) < 3).all(axis=1)]
asd

In [None]:
nut_res_hist = asd.hist(column='euclid_distance_sum', bins=500)


In [None]:
euclid_distance_sum_no_outliers = euclid_distance_sum

outliers = euclid_distance_sum[euclid_distance_sum['euclid_distance_sum'] >
                               euclid_distance_sum['euclid_distance_sum'].mean() + 3 * euclid_distance_sum['euclid_distance_sum'].std()]


euclid_distance_sum_no_outliers.drop(outliers.index, inplace=True)
euclid_distance_sum_no_outliers.shape

In [None]:
euclid_distance_sum = euclid_distance_sum_no_outliers

In [None]:
# minmax nomralisierung
from sklearn import preprocessing

nut_res = euclid_distance_sum
x = nut_res.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
nut_res = pd.DataFrame(x_scaled, columns=nut_res.columns, index=nut_res.index)


# jac_res = jaccard_distance_sum
# z = jac_res.values
# min_max_scaler = preprocessing.MinMaxScaler()
# z_scaled = min_max_scaler.fit_transform(z)
# jac_res = pd.DataFrame(z_scaled, columns=jac_res.columns, index=jac_res.index)

In [None]:
nut_res

In [None]:

nut_res_hist = nut_res.hist(column='euclid_distance_sum', bins=500)


In [None]:
nut_res_hist = euclid_distance_sum_no_outliers.hist(column='euclid_distance_sum', bins=500)


In [None]:
top10 = euclid_distance_sum[:10].index.values
top10

In [None]:
# jaccard Distanz - rezept zu rezept vergleich

result_array = cdist(user_recipes, recipe_db, 'jaccard')
jacc_distance = pd.DataFrame(
    result_array, columns=recipe_db.index.values, index=user_recipes.index.values)

jaccard_distance_sum = pd.DataFrame(
    jacc_distance.sum(), columns=['jaccard_distance_sum'])
jaccard_distance_sum = jaccard_distance_sum.sort_values(
    by='jaccard_distance_sum')

jaccard_distance_sum

In [None]:
jacc_distance[[25927]]

In [None]:
from scipy import stats
asd = jaccard_distance_sum[(np.abs(stats.zscore(jaccard_distance_sum)) < 2).all(axis=1)]
asd

In [None]:
jac_res_hist = asd.hist(column='jaccard_distance_sum', bins=500)


In [None]:
# euclid_distance_sum_no_outliers = euclid_distance_sum
# jaccard_distance_sum_no_outliers = jaccard_distance_sum


outliers = (jaccard_distance_sum['jaccard_distance_sum'] - jaccard_distance_sum['jaccard_distance_sum'].mean()) / jaccard_distance_sum['jaccard_distance_sum'].std()

df2 = pd.DataFrame(outliers)
df2

# jaccard_distance_sum_no_outliers.drop(outliers.index, inplace=True)
# euclid_distance_sum_no_outliers.drop(outliers.index, inplace=True)

#jaccard_distance_sum_no_outliers = jaccard_distance_sum_no_outliers[jaccard_distance_sum_no_outliers.index.isin(euclid_distance_sum_no_outliers.index)]

In [None]:
euclid_distance_sum.describe()

In [None]:
euclid_distance_sum

In [None]:
# euclid_distance_sum_no_outliers = euclid_distance_sum
# jaccard_distance_sum_no_outliers = jaccard_distance_sum


outliers = (euclid_distance_sum['euclid_distance_sum'] - euclid_distance_sum['euclid_distance_sum'].mean()) / euclid_distance_sum['euclid_distance_sum'].std()
df = pd.DataFrame(outliers)
df
# jaccard_distance_sum_no_outliers.drop(outliers.index, inplace=True)
# euclid_distance_sum_no_outliers.drop(outliers.index, inplace=True)

#jaccard_distance_sum_no_outliers = jaccard_distance_sum_no_outliers[jaccard_distance_sum_no_outliers.index.isin(euclid_distance_sum_no_outliers.index)]

In [None]:
jaccard_distance_sum

In [None]:
jaccard_distance_sum.describe()
euclid_distance_sum.describe()

In [None]:
euclid_distance_sum_no_outliers.describe()
jaccard_distance_sum_no_outliers.describe()

In [None]:
# copy data with outliers
euclid_distance_sum = euclid_distance_sum_no_outliers
jaccard_distance_sum = jaccard_distance_sum_no_outliers

In [None]:
euclid_distance_sum_no_outliers = euclid_distance_sum
jaccard_distance_sum_no_outliers = jaccard_distance_sum


outliers = jaccard_distance_sum_no_outliers[jaccard_distance_sum_no_outliers['jaccard_distance_sum'] >
                               jaccard_distance_sum_no_outliers['jaccard_distance_sum'].mean() + 1 * jaccard_distance_sum_no_outliers['jaccard_distance_sum'].std()]

jaccard_distance_sum_no_outliers.drop(outliers.index, inplace=True)
euclid_distance_sum_no_outliers.drop(outliers.index, inplace=True)

#jaccard_distance_sum_no_outliers = jaccard_distance_sum_no_outliers[jaccard_distance_sum_no_outliers.index.isin(euclid_distance_sum_no_outliers.index)]

In [None]:
euclid_distance_sum_no_outliers.describe()
jaccard_distance_sum_no_outliers.describe()

In [None]:
# copy data with outliers
euclid_distance_sum = euclid_distance_sum_no_outliers
jaccard_distance_sum = jaccard_distance_sum_no_outliers

In [None]:
addition_nut_jac = pd.DataFrame(euclid_distance_sum.euclid_distance_sum *
                                         jaccard_distance_sum.jaccard_distance_sum, columns=jaccard_distance_sum.columns, index=jaccard_distance_sum.index)

addition_nut_jac = addition_nut_jac.sort_values(
    by='jaccard_distance_sum')

addition_nut_jac.rename(
    columns={'jaccard_distance_sum': 'robust_scaling'}, inplace=True)

addition_nut_jac

In [None]:
# robust-normalisierung
from sklearn import preprocessing

nut_res_robust = euclid_distance_sum
x = nut_res_robust.values
robust_scaler = preprocessing.RobustScaler()
x_scaled = robust_scaler.fit_transform(x)
nut_res_robust = pd.DataFrame(
    x_scaled, columns=nut_res_robust.columns, index=nut_res_robust.index)

jac_res_robust = jaccard_distance_sum
z = jac_res_robust.values
robust_scaler = preprocessing.RobustScaler()
z_scaled = robust_scaler.fit_transform(z)

jac_res_robust = pd.DataFrame(
    z_scaled, columns=jac_res_robust.columns, index=jac_res_robust.index)


robust_scaling_nut_recipe = pd.DataFrame(nut_res_robust.euclid_distance_sum +
                                         jac_res_robust.jaccard_distance_sum, columns=jac_res_robust.columns, index=jac_res_robust.index)

robust_scaling_nut_recipe = robust_scaling_nut_recipe.sort_values(
    by='jaccard_distance_sum')

robust_scaling_nut_recipe.rename(
    columns={'jaccard_distance_sum': 'robust_scaling'}, inplace=True)

In [None]:
# minmax nomralisierung
from sklearn import preprocessing

nut_res = euclid_distance_sum
x = nut_res.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
nut_res = pd.DataFrame(x_scaled, columns=nut_res.columns, index=nut_res.index)


jac_res = jaccard_distance_sum
z = jac_res.values
min_max_scaler = preprocessing.MinMaxScaler()
z_scaled = min_max_scaler.fit_transform(z)
jac_res = pd.DataFrame(z_scaled, columns=jac_res.columns, index=jac_res.index)

min_max_nut_recipe = pd.DataFrame(
    nut_res.euclid_distance_sum * jac_res.jaccard_distance_sum, columns=jac_res.columns, index=jac_res.index)

min_max_nut_recipe = min_max_nut_recipe.sort_values(by='jaccard_distance_sum')

min_max_nut_recipe.rename(
    columns={'jaccard_distance_sum': 'min_max'}, inplace=True)

In [None]:
nut_res
jac_res

In [None]:
min_max_nut_recipe

In [None]:
nut_res

In [None]:
nut_res

In [None]:
nut_res.loc[[24792]]

In [None]:
nut_res_hist = nut_res.hist(column='euclid_distance_sum', bins=20)


In [None]:
nut_res_hist = nut_res.hist(column='euclid_distance_sum', bins=20)


In [None]:
jac_res

In [None]:
jac_res

In [None]:
jac_res.loc[[24792]]

In [None]:
min_max_nut_recipe

In [None]:
jac_res_hist = jac_res.hist(column='jaccard_distance_sum', bins=20)


In [None]:
jac_res_hist = jac_res.hist(column='jaccard_distance_sum', bins=20)


In [None]:
jac_res_hist

In [None]:
# z-score-mormalisierung
z_min = (euclid_distance_sum - euclid_distance_sum.mean()) / \
    euclid_distance_sum.std()

z_jac = (jaccard_distance_sum - jaccard_distance_sum.mean()) / \
    jaccard_distance_sum.std()

z_score_nut_recipe = pd.DataFrame(z_min.euclid_distance_sum + z_jac.jaccard_distance_sum,
                                  columns=euclid_distance_sum.columns, index=z_min.index)

z_score_nut_recipe = z_score_nut_recipe.sort_values(by='euclid_distance_sum')

z_score_nut_recipe.rename(
    columns={'euclid_distance_sum': 'z_score'}, inplace=True)

In [None]:
z_min.index.get_loc(143082)
z_jac.index.get_loc(143082)

In [None]:
z_min
z_jac
z_score_nut_recipe

In [None]:
z_jac