In [2]:
import json

import numpy as np
import pandas as pd
import pymongo
import scipy
from bson import ObjectId, json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient as Connection
from scipy import sparse, spatial
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# scientific notation disabled form smaller numbers
pd.options.display.float_format = '{:.5f}'.format

# alles resultate anzeigen und nicht nur das letzte
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [3]:
# display multiple outputs in one row
import pandas as pd
import numpy as np
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [4]:
connection = Connection()
db = connection.recipe_db
input_data = db.recipes_test_copy

data = json.loads(json_util.dumps(input_data.find()))

In [5]:
ing = pd.DataFrame(json_normalize(data, record_path='ingredients',
                             meta='id', record_prefix='ingredients_', errors='ignore'))


nutritions = pd.DataFrame(json_normalize(data, record_path='nutritions',
                            meta=['id', 'prep_time', 'rating', 'rating_count', 'ready_in_time', 'review_count']))


In [6]:
#ingredients
#ingredients.loc[ingredients['id'] == 9380]

In [7]:
#ingredients.loc[ingredients['ingredients_id'] == 2972]

In [8]:
#  ------  erstellung und data cleansing - Ingredients

# schmeiss alle zutaten raus, die weniger als 5 mal verwendet werden
# setze multiinde auf 'id' und 'ingredients_id'
ingredients = ing.set_index(['id', 'ingredients_id'])

# filtere alle Zutaten samt ihrer rezepte id, die weniger gleich 5 mal vorkommen
ingredients_eqles_5_ing = ingredients.groupby(
    'ingredients_id').filter(lambda x: len(x) <= 5)

# droppe alle rezepte, die eine Zutate besitzen, die weniger gleich 5 Mal vorkommen
ingredients_filt = ingredients.drop(ingredients_eqles_5_ing.index.get_level_values('id').values, level=0)

# drop alls rows with ingredients_id == 0
ingredients_eqal_zero = ingredients_filt[ingredients_filt.index.get_level_values('ingredients_id') == 0]
ingredients_filt = ingredients_filt.drop(ingredients_eqal_zero.index.get_level_values('id').values, level=0)


In [9]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(ingredients_filt.index.get_level_values('id').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_db = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_db.columns = nutrition_db.columns.droplevel(0)

# entferne alle NA
nutrition_db = nutrition_db.dropna()

# gleiche nochmals die ids der beiden dataframe nutrition und ingredients ab, da der nutrition dataframe noch NA Werte hatt
id_overlap_mask = ingredients_filt.index.get_level_values('id').isin(nutrition_db.index)
ingredients_db = ingredients_filt[id_overlap_mask]

# abgleich ob anzahl der indizes von nutrition und zutaten dataframe gleich sind


In [10]:
nutrition_db.index.nunique()
ingredients_db.index.get_level_values('id').nunique()


4061

4061

In [11]:


ingredients_db.reset_index(inplace=True)

recipe_db = pd.get_dummies(ingredients_db['ingredients_id']).groupby(
    ingredients_db['id']).apply(max)



In [12]:
new_ingredients = ingredients_db.copy()
#new_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index()
gramms_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index().copy()

Q1 = gramms_ingredients.quantile(0.25)
Q3 = gramms_ingredients.quantile(0.75)
IQR = Q3 - Q1

#Filter out all recipes which are outlier by their weight (gramms)
df = gramms_ingredients[~((gramms_ingredients < (Q1 - 1.5 * IQR)) |(gramms_ingredients > (Q3 + 1.5 * IQR))).any(axis=1)].copy()

#filter out recipes by weight which are not in the range 500 - 2373.59 gramms
df_start_at_fivehundret = df[df['ingredients_grams'].between(500, 2373.58225, inclusive=False)].copy()

df_start_at_fivehundret.set_index('id', inplace=True)
id_overlap_mask = nutritions['id'].isin(df_start_at_fivehundret.index.get_level_values('id').values)


# erstelle datenframe auf basis der overlapliste
nutritions_filt_gramm = nutritions.loc[id_overlap_mask]

nutrition_db2 = nutritions_filt_gramm.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

#create new nutrition db based on the above filtering
nutrition_db2.set_index('id', inplace=True)
nutrition_db2.columns = nutrition_db2.columns.droplevel(0)



In [13]:
#select 20 random recipes which are between the Fat-range of 35-40 gramms
subset_fat = nutrition_db2[(nutrition_db2['Fat'] >= 35) & (nutrition_db2['Fat'] <= 40)].sample(n=20).copy()

subset_fat_20 = subset_fat.index.to_numpy()
id_list = subset_fat_20.copy()

In [14]:
subset_random = nutrition_db2.sample(n=10).copy()
subset_random = subset_random.index.to_numpy()

In [15]:
# remove recipes which are missing in nutrition df from the ingredients tecipes dataframe
mask = recipe_db.index.isin(nutrition_db2.index.values)
recipe_db = recipe_db.loc[mask]


In [16]:
user_recipes = recipe_db[recipe_db.index.isin(id_list)]

In [17]:
drop_id_list = [4342, 6307, 6494, 16157, 16278, 16421, 4397]

#4342, 6307, 6494, 16157, 16278, 16421, 4397
#garlic, olive oil, skinless bone less chicken breast, butter, milk, salt and pepper, onion
#>= 5 mal vorkommen in der top 10 rezept liste

In [18]:
#filter out ingredients which are too common and the recipes in the user profile from the recipes database
new_recipe_db = recipe_db.drop(axis=1, labels=drop_id_list)
new_recipe_db_wo_userrecipes = new_recipe_db.drop(axis=0, labels=subset_fat_20)


In [19]:
#remove basic ingredients like garlic, olive oil, skinless bone less chicken breast, butter, milk, salt and pepper, onion
new_user_recipes = user_recipes.drop(axis=1, labels=drop_id_list)

In [20]:
## Jaccard - recipe to recipe similarity without any removal of basis ingredients and duplicate userrecipes

result_array = cdist(user_recipes, recipe_db,'jaccard')
result_wo_filter = pd.DataFrame(result_array, columns=recipe_db.index.values, index=user_recipes.index.values)
# result_2

result_wo_filter_10 = pd.DataFrame(result_wo_filter.sum(), columns=['jaccard_distance_sum'])
result_wo_filter_10 = result_wo_filter_10.sort_values(by='jaccard_distance_sum')
result_wo_filter_10['jaccard_distance_sum'] = result_wo_filter_10['jaccard_distance_sum'].div(20)
result_wo_filter_10[0:10]


Unnamed: 0,jaccard_distance_sum
84044,0.83899
71291,0.8413
23849,0.85383
135645,0.85572
52464,0.85727
22353,0.85916
16388,0.86233
229277,0.86454
162690,0.8661
37076,0.86629


In [21]:
pd.Series(np.intersect1d(id_list,result_wo_filter_10[0:10].index.to_numpy()))


0     23849
1     84044
2    229277
dtype: int64

In [22]:
## Jaccard - recipe to recipe similarity without any removal of basis ingredients 

result_array = cdist(new_user_recipes, new_recipe_db,'jaccard')
result_wo_basis_filter = pd.DataFrame(result_array, columns=new_recipe_db.index.values, index=new_user_recipes.index.values)
# result_2

result_wo_basis_filter_10 = pd.DataFrame(result_wo_basis_filter.sum(), columns=['jaccard_distance_sum'])
result_wo_basis_filter_10 = result_wo_basis_filter_10.sort_values(by='jaccard_distance_sum')
result_wo_basis_filter_10['jaccard_distance_sum'] = result_wo_basis_filter_10['jaccard_distance_sum'].div(20)
result_wo_basis_filter_10[0:10]


Unnamed: 0,jaccard_distance_sum
231233,0.89992
84044,0.9056
23849,0.90786
216026,0.91243
229277,0.91444
14735,0.9167
164208,0.91794
12562,0.91822
8693,0.91846
21541,0.91883


In [23]:
result_wo_basis_filter_10[0:10].index.to_numpy()

array([231233,  84044,  23849, 216026, 229277,  14735, 164208,  12562,
         8693,  21541])

In [24]:
id_list

array([ 86860,  18093,  18397,  20312, 231233, 164208,  23849,   8639,
         8693,  14735,  84044, 216026, 231537, 219766,  48873, 169322,
        12562,  70051, 229277, 237320])

In [25]:
pd.Series(np.intersect1d(id_list,result_wo_basis_filter_10[0:10].index.to_numpy()))


0      8693
1     12562
2     14735
3     23849
4     84044
5    164208
6    216026
7    229277
8    231233
dtype: int64

In [26]:
## Jaccard - recipe to recipe similarity with removal of basis ingredients and duplicate userrecipes

result_array = cdist(new_user_recipes, new_recipe_db_wo_userrecipes,'jaccard')
result_w_filter = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes.index.values, index=new_user_recipes.index.values)
# result_2

result_w_filter_10 = pd.DataFrame(result_w_filter.sum(), columns=['jaccard_distance_sum'])
result_w_filter_10 = result_w_filter_10.sort_values(by='jaccard_distance_sum')
result_w_filter_10['jaccard_distance_sum'] = result_w_filter_10['jaccard_distance_sum'].div(20)
result_w_filter_10[0:10]


Unnamed: 0,jaccard_distance_sum
21541,0.91883
87025,0.9202
15196,0.92378
71291,0.92413
23058,0.92473
240619,0.92544
11832,0.92674
229994,0.93067
8691,0.9309
37784,0.93174


In [27]:
result_w_filter_10[0:10].index.to_numpy()

array([ 21541,  87025,  15196,  71291,  23058, 240619,  11832, 229994,
         8691,  37784])

In [28]:
id_list

array([ 86860,  18093,  18397,  20312, 231233, 164208,  23849,   8639,
         8693,  14735,  84044, 216026, 231537, 219766,  48873, 169322,
        12562,  70051, 229277, 237320])

In [29]:
pd.Series(np.intersect1d(id_list,result_w_filter_10[0:10].index.to_numpy()))


Series([], dtype: int64)

In [30]:
#create one dimensional ingredients vector from user recipes profile
user_one_vector = new_user_recipes.copy()
user_one_vector.loc['sum',:]= user_one_vector.sum(axis=0)
user_one_vector = user_one_vector.drop(axis=0, labels=subset_fat_20)
user_one_vector[user_one_vector > 0] = 1

In [31]:
#show number of ingredients
user_one_vector.apply(pd.value_counts).count(axis=1)

0.00000    623
1.00000    100
dtype: int64

In [38]:
## Jaccard - user vector to recipe similarity with removal of basis ingredients and duplicate userrecipes

result_array = cdist(user_one_vector, new_recipe_db_wo_userrecipes,'jaccard')
result_w_filter_vector = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes.index.values, index=user_one_vector.index.values)


result_w_filter_vector_10 = pd.DataFrame(result_w_filter_vector.sum(), columns=['jaccard_distance_sum'])
result_w_filter_vector_10 = result_w_filter_vector_10.sort_values(by='jaccard_distance_sum')

result_w_filter_vector_10[0:20]


Unnamed: 0,jaccard_distance_sum
172958,0.88571
8697,0.89109
245362,0.89216
231396,0.89423
25300,0.90196
11688,0.90291
31988,0.90385
83083,0.90385
24085,0.90476
154183,0.90476


In [33]:
pd.Series(np.intersect1d(id_list, result_w_filter_vector_10[0:10].index.to_numpy()))


Series([], dtype: int64)

In [37]:
result_w_filter_vector_10[0:20].index.to_numpy()

array([172958,   8697, 245362, 231396,  25300,  11688,  31988,  83083,
        24085, 154183, 237807, 229289, 228446, 239867, 228680,   8758,
        22729, 255038,  34413,  13905])

In [35]:
id_list

array([ 86860,  18093,  18397,  20312, 231233, 164208,  23849,   8639,
         8693,  14735,  84044, 216026, 231537, 219766,  48873, 169322,
        12562,  70051, 229277, 237320])

In [36]:
# Euklidische Distanz - rezept zu rezept vergelich
# minkowski(u, v, p=2, w=None)
# rec = []
# rec = {
#     "Calcium":[1],
#     "Calories":[1],
#     "Calories from Fat":[1],
#     "Carbohydrates":[1],
#     "Cholesterol":[1],
#     "Dietary Fiber":[1],
#     "Fat":[1],
#     "Folate":[1],
#     "Iron":[1],
#     "Magnesium":[1],
#     "Niacin Equivalents":[1],
#     "Potassium":[1],
#     "Protein":[1],
#     "Saturated Fat":[1],
#     "Sodium":[1],
#     "Sugars":[1],
#     "Thiamin":[1],
#     "Vitamin A - IU":[1],
#     "Vitamin B6":[1],
#     "Vitamin C":[1]}

rec_2 = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    1,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    1,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    1]

result_array = cdist(user_nutrition, nutrition_db, 'minkowski', p=2, w=rec_2)
# result_array = cdist(user_nutrition, nutrition_db,'minkowski', p = 2)
euclid_distance = pd.DataFrame(
    result_array, columns=nutrition_db.index.values, index=user_nutrition.index.values)

euclid_distance_sum = pd.DataFrame(
    euclid_distance.sum(), columns=['euclid_distance_sum'])
euclid_distance_sum = euclid_distance_sum.sort_values(by='euclid_distance_sum')
euclid_distance_sum

NameError: name 'user_nutrition' is not defined