In [None]:
import json

import numpy as np
import pandas as pd
import pymongo
import scipy
from bson import ObjectId, json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient as Connection
from scipy import sparse, spatial
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# Render floats with five fixed decimals so that small numbers are not shown
# in scientific notation.
pd.set_option('display.float_format', '{:.5f}'.format)

# Show the result of every expression in a cell, not only the last one.
# Several cells below rely on this to display more than one bare expression.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [None]:
# Display multiple outputs of a cell side by side in one row.
import pandas as pd
import numpy as np
from IPython.display import display, HTML

# Stylesheet override: the `.output` container gets `flex-direction: row`.
CSS = """
.output {
    flex-direction: row;
}
"""

# Wrap the rules in a <style> tag; the bare HTML(...) expression is the cell's
# displayed result, which is how the stylesheet gets into the page.
HTML('<style>{}</style>'.format(CSS))

In [None]:
# Open a MongoDB client with no explicit arguments (i.e. whatever pymongo's
# defaults are) and pick the `recipes_test_copy` collection of `recipe_db`.
connection = Connection()
db = connection.recipe_db
input_data = db.recipes_test_copy

# Fetch every document of the collection, serialise the cursor with bson's
# json_util and parse it back with the json module, so `data` holds plain
# Python containers; then flatten the documents into a single table.
data = json.loads(json_util.dumps(input_data.find()))
norm_data = json_normalize(data)

In [None]:
# Flatten the nested 'ingredients' records: every ingredient field is prefixed
# with 'ingredients_' and the owning document's 'id' is attached as metadata.
ingredients = pd.DataFrame(
    json_normalize(
        data,
        record_path='ingredients',
        meta='id',
        record_prefix='ingredients_',
        errors='ignore',
    )
)

# Flatten the nested 'nutritions' records and carry the listed document-level
# fields along with each record.
nutritions = json_normalize(
    data,
    record_path='nutritions',
    meta=[
        'id',
        'prep_time',
        'rating',
        'rating_count',
        'ready_in_time',
        'review_count',
    ],
)

In [None]:
# Data cleansing, expressed as one pipeline:
#   1. keep only ingredients that occur at least 5 times,
#   2. discard rows whose ingredients_id is 0,
#   3. remove exact duplicate rows, keeping the first occurrence.
ingredients_filt = (
    ingredients
    .groupby('ingredients_id')
    .filter(lambda group: len(group) >= 5)
    .loc[lambda frame: frame.ingredients_id != 0]
    .drop_duplicates(keep='first')
)

In [None]:
# Recipe ids used further down to select the "user" rows
# (see user_nutrition / user_recipes).
id_list = [215014, 8669, 16700, 16354, 12720, 8652, 8887, 51283, 45954, 213742, 14595,
           219164, 16348, 143082, 8665, 11758, 223042, 236609, 8778, 65896, 24264, 11679, 141678, 9023]

# One indicator column per ingredient id, collapsed to one row per recipe id by
# taking the column-wise maximum (a column is non-zero if any of the recipe's
# rows has that ingredient).
# GroupBy.max() is used instead of .apply(max): pandas replaces the Python
# builtin `max` with a NumPy reduction behind the scenes, which newer pandas
# versions warn about and plan to stop doing.
recipe_db = pd.get_dummies(ingredients_filt['ingredients_id']).groupby(
    ingredients_filt['id']).max()

In [None]:
# Filter recipes out of the nutrition metadata based on how many ingredients
# they have: recipes with at most 4 non-zero ingredient columns are unwanted.
not_wanted_recipes = recipe_db[
    recipe_db.mask(recipe_db == 0).count(axis=1) <= 4]

# Flag the nutrition rows belonging to unwanted recipes and keep the rest.
mask = nutritions['id'].isin(not_wanted_recipes.index.values)
nutritions_filt = nutritions[~mask]

# Recipe x nutrient table of amounts (pivot_table aggregates duplicates with
# its default aggregation), indexed by recipe id.
nutrition_db = (
    nutritions_filt
    .pivot_table(index=['id'], columns=['name'], values=['amount'])
    .reset_index()
    .set_index('id')
)

# Remove the outer 'amount' level of the column MultiIndex so that the columns
# are just the nutrient names.
nutrition_db.columns = nutrition_db.columns.droplevel(0)

# Only recipes with a value for every nutrient are kept.
nutrition_db = nutrition_db.dropna()

In [None]:
# Recipes with at least 5 non-zero ingredient columns.
recipe_db_filt = recipe_db.loc[
    recipe_db.mask(recipe_db == 0).count(axis=1) >= 5]

# Rows of the nutrition table and of the ingredient matrix whose recipe id is
# listed in id_list.
user_nutrition = nutrition_db.loc[nutrition_db.index.isin(id_list)]
user_recipes = recipe_db_filt.loc[recipe_db_filt.index.isin(id_list)]

In [None]:
# Restrict the ingredient matrix to recipes that also have a row in the
# nutrition table.
mask = recipe_db_filt.index.isin(nutrition_db.index.values)
recipe_db = recipe_db_filt[mask]

recipe_db

In [None]:
# Jaccard distance - recipe-to-recipe comparison on the ingredient matrix.
# Rows: the user's recipes; columns: every recipe in recipe_db.
result_array = cdist(user_recipes, recipe_db, metric='jaccard')
jacc_distance = pd.DataFrame(
    result_array,
    index=user_recipes.index.values,
    columns=recipe_db.index.values,
)

# Total distance of each recipe to all user recipes, smallest first.
jaccard_distance_sum = (
    jacc_distance
    .sum()
    .to_frame('jaccard_distance_sum')
    .sort_values(by='jaccard_distance_sum')
)

jaccard_distance_sum

In [None]:
## Euclidean distance - recipe-to-recipe comparison on the nutrition table

# Per-nutrient weights for the weighted Minkowski distance; 1 everywhere means
# an unweighted comparison. The labels follow the order given in the original
# notes.
# NOTE(review): assumes nutrition_db has exactly these 20 columns in this
# order - confirm against nutrition_db.columns before tuning any weight.
gesund = [
    1,  # Calcium
    1,  # Calories
    1,  # Calories from Fat
    1,  # Carbohydrates
    1,  # Cholesterol
    1,  # Dietary Fiber
    1,  # Fat
    1,  # Folate
    1,  # Iron
    1,  # Magnesium
    1,  # Niacin Equivalents
    1,  # Potassium
    1,  # Protein
    1,  # Saturated Fat
    1,  # Sodium
    1,  # Sugars
    1,  # Thiamin
    1,  # Vitamin A - IU
    1,  # Vitamin B6
    1,  # Vitamin C
]

# Use the weights defined in this cell. The previous code passed `rec_2`, which
# is only created in a later cell, so this cell failed with a NameError when
# the notebook was run top to bottom on a fresh kernel.
result_array = cdist(user_nutrition, nutrition_db, 'minkowski', p=2, w=gesund)
# Unweighted alternative:
# result_array = cdist(user_nutrition, nutrition_db, 'minkowski', p=2)
euclid_distance = pd.DataFrame(result_array, columns=nutrition_db.index.values, index=user_nutrition.index.values)

# Total distance of each recipe to all user recipes, smallest first.
euclid_distance_sum = pd.DataFrame(euclid_distance.sum(), columns=['euclid_distance_sum'])
euclid_distance_sum = euclid_distance_sum.sort_values(by='euclid_distance_sum')
euclid_distance_sum

In [None]:
## Euclidean distance - recipe-to-recipe comparison on the nutrition table
# Reference signature: minkowski(u, v, p=2, w=None)

# Per-nutrient weights passed as `w`; all ones, one entry per nutrient label.
rec_2 = [
    1,  # Calcium
    1,  # Calories
    1,  # Calories from Fat
    1,  # Carbohydrates
    1,  # Cholesterol
    1,  # Dietary Fiber
    1,  # Fat
    1,  # Folate
    1,  # Iron
    1,  # Magnesium
    1,  # Niacin Equivalents
    1,  # Potassium
    1,  # Protein
    1,  # Saturated Fat
    1,  # Sodium
    1,  # Sugars
    1,  # Thiamin
    1,  # Vitamin A - IU
    1,  # Vitamin B6
    1,  # Vitamin C
]

# Weighted Minkowski distance with p=2 between each user recipe (rows) and
# every recipe of the nutrition table (columns).
# Unweighted alternative:
# result_array = cdist(user_nutrition, nutrition_db, 'minkowski', p=2)
result_array = cdist(user_nutrition, nutrition_db, metric='minkowski', p=2, w=rec_2)
euclid_distance = pd.DataFrame(
    result_array,
    index=user_nutrition.index.values,
    columns=nutrition_db.index.values,
)

# Total distance of each recipe to all user recipes, smallest first.
euclid_distance_sum = (
    euclid_distance
    .sum()
    .to_frame('euclid_distance_sum')
    .sort_values(by='euclid_distance_sum')
)
euclid_distance_sum

In [None]:
# Start the outlier-free variants as independent copies of the distance sums.
# A plain assignment would only bind a second name to the same DataFrame, so
# any in-place change to the *_no_outliers frames would also alter
# euclid_distance_sum / jaccard_distance_sum.
euclid_distance_sum_no_outliers = euclid_distance_sum.copy()
jaccard_distance_sum_no_outliers = jaccard_distance_sum.copy()

In [None]:
# Outliers: recipes whose summed Euclidean distance lies more than one
# standard deviation above the mean.
outliers = euclid_distance_sum[
    euclid_distance_sum['euclid_distance_sum']
    > euclid_distance_sum['euclid_distance_sum'].mean()
    + 1 * euclid_distance_sum['euclid_distance_sum'].std()]

# Build the outlier-free frame as a new object instead of dropping in place:
# an in-place drop would also change euclid_distance_sum whenever both names
# refer to the same DataFrame, and would shrink the data again on every re-run
# of this cell.
euclid_distance_sum_no_outliers = euclid_distance_sum.drop(outliers.index)

# Keep only the Jaccard rows of recipes that survived the outlier filter.
jaccard_distance_sum_no_outliers = jaccard_distance_sum[
    jaccard_distance_sum.index.isin(euclid_distance_sum_no_outliers.index)]

In [None]:


# Inspect the Jaccard distance sums of the recipes kept after outlier removal.
jaccard_distance_sum_no_outliers

In [None]:
# Summary statistics of both outlier-free distance sums (both results are
# shown because the shell is configured to display every expression).
euclid_distance_sum_no_outliers.describe()
jaccard_distance_sum_no_outliers.describe()

In [None]:
# Snapshot the current distance sums so they can be restored later.
# .copy() creates independent DataFrames; a plain assignment would only add a
# second name for the same object and would not protect it from being altered.
euclid_distance_sum_w_outliers = euclid_distance_sum.copy()
jaccard_distance_sum_w_outliers = jaccard_distance_sum.copy()


In [None]:
# Point the working names back at the *_w_outliers frames.
euclid_distance_sum, jaccard_distance_sum = (
    euclid_distance_sum_w_outliers,
    jaccard_distance_sum_w_outliers,
)


In [None]:
# Point the working names at the outlier-free (*_no_outliers) frames; the
# cells below operate on whatever these two names refer to.
euclid_distance_sum, jaccard_distance_sum = (
    euclid_distance_sum_no_outliers,
    jaccard_distance_sum_no_outliers,
)

In [None]:
# Summary statistics of the distance sums currently bound to the working
# names, as a check of which variant (with / without outliers) is active.
euclid_distance_sum.describe()
jaccard_distance_sum.describe()

In [None]:
# Min-max normalisation: scale both distance sums with MinMaxScaler, add them
# up and rank the recipes by the combined score.
from sklearn import preprocessing

# Nutrition (Euclidean) distance sums, scaled.
x = euclid_distance_sum.values
x_scaled = preprocessing.MinMaxScaler().fit_transform(x)
nut_res = pd.DataFrame(
    x_scaled,
    index=euclid_distance_sum.index,
    columns=euclid_distance_sum.columns,
)

# Ingredient (Jaccard) distance sums, scaled with a separate scaler.
z = jaccard_distance_sum.values
min_max_scaler = preprocessing.MinMaxScaler()
z_scaled = min_max_scaler.fit_transform(z)
jac_res = pd.DataFrame(
    z_scaled,
    index=jaccard_distance_sum.index,
    columns=jaccard_distance_sum.columns,
)

# Combined score on the Jaccard frame's index, sorted ascending; the column is
# renamed to 'min_max' after sorting.
min_max_nut_recipe = (
    pd.DataFrame(
        nut_res.euclid_distance_sum + jac_res.jaccard_distance_sum,
        columns=jac_res.columns,
        index=jac_res.index,
    )
    .sort_values(by='jaccard_distance_sum')
    .rename(columns={'jaccard_distance_sum': 'min_max'})
)

In [None]:
# Robust normalisation: scale both distance sums with RobustScaler, add them
# up and rank the recipes by the combined score.
from sklearn import preprocessing

# Nutrition (Euclidean) distance sums, scaled.
x = euclid_distance_sum.values
x_scaled = preprocessing.RobustScaler().fit_transform(x)
nut_res_robust = pd.DataFrame(
    x_scaled,
    index=euclid_distance_sum.index,
    columns=euclid_distance_sum.columns,
)

# Ingredient (Jaccard) distance sums, scaled with a separate scaler.
z = jaccard_distance_sum.values
robust_scaler = preprocessing.RobustScaler()
z_scaled = robust_scaler.fit_transform(z)
jac_res_robust = pd.DataFrame(
    z_scaled,
    index=jaccard_distance_sum.index,
    columns=jaccard_distance_sum.columns,
)

# Combined score on the Jaccard frame's index, sorted ascending; the column is
# renamed to 'robust_scaling' after sorting.
robust_scaling_nut_recipe = (
    pd.DataFrame(
        nut_res_robust.euclid_distance_sum + jac_res_robust.jaccard_distance_sum,
        columns=jac_res_robust.columns,
        index=jac_res_robust.index,
    )
    .sort_values(by='jaccard_distance_sum')
    .rename(columns={'jaccard_distance_sum': 'robust_scaling'})
)

In [None]:
# Z-score normalisation: centre each distance sum on its mean, divide by its
# standard deviation, add both and rank the recipes by the combined score.
z_min = euclid_distance_sum.sub(euclid_distance_sum.mean()).div(
    euclid_distance_sum.std())

z_jac = jaccard_distance_sum.sub(jaccard_distance_sum.mean()).div(
    jaccard_distance_sum.std())

# Combined score on the Euclidean frame's index, sorted ascending; the column
# is renamed to 'z_score' after sorting.
z_score_nut_recipe = (
    pd.DataFrame(
        z_min.euclid_distance_sum + z_jac.jaccard_distance_sum,
        columns=euclid_distance_sum.columns,
        index=z_min.index,
    )
    .sort_values(by='euclid_distance_sum')
    .rename(columns={'euclid_distance_sum': 'z_score'})
)

In [None]:
# Show the three combined rankings (z-score, robust scaling, min-max) together.
display(z_score_nut_recipe, robust_scaling_nut_recipe, min_max_nut_recipe)