In [1]:
import json

import numpy as np
import pandas as pd
import pymongo
import scipy
from bson import ObjectId, json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient as Connection
from scipy import sparse, spatial
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# scientific notation disabled form smaller numbers
pd.options.display.float_format = '{:.5f}'.format

# alles resultate anzeigen und nicht nur das letzte
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [2]:
# display multiple outputs in one row
import pandas as pd
import numpy as np
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [3]:
connection = Connection()
db = connection.recipe_db
input_data = db.recipes_test_copy

data = json.loads(json_util.dumps(input_data.find()))

In [4]:
ing = pd.DataFrame(json_normalize(data, record_path='ingredients',
                             meta='id', record_prefix='ingredients_', errors='ignore'))


nutritions = pd.DataFrame(json_normalize(data, record_path='nutritions',
                            meta=['id', 'prep_time', 'rating', 'rating_count', 'ready_in_time', 'review_count']))


In [5]:
#  ------  erstellung und data cleansing - Ingredients

# schmeiss alle zutaten raus, die weniger als 5 mal verwendet werden
# setze multiinde auf 'id' und 'ingredients_id'
ingredients = ing.set_index(['id', 'ingredients_id'])

# filtere alle Zutaten samt ihrer rezepte id, die weniger gleich 5 mal vorkommen
ingredients_eqles_5_ing = ingredients.groupby(
    'ingredients_id').filter(lambda x: len(x) <= 5)

# droppe alle rezepte, die eine Zutate besitzen, die weniger gleich 5 Mal vorkommen
ingredients_filt = ingredients.drop(ingredients_eqles_5_ing.index.get_level_values('id').values, level=0)

# drop alls rows with ingredients_id == 0
ingredients_eqal_zero = ingredients_filt[ingredients_filt.index.get_level_values('ingredients_id') == 0]
ingredients_filt = ingredients_filt.drop(ingredients_eqal_zero.index.get_level_values('id').values, level=0)


In [6]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(ingredients_filt.index.get_level_values('id').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_db = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_db.columns = nutrition_db.columns.droplevel(0)

# entferne alle NA
nutrition_db = nutrition_db.dropna()

# gleiche nochmals die ids der beiden dataframe nutrition und ingredients ab, da der nutrition dataframe noch NA Werte hatt
id_overlap_mask = ingredients_filt.index.get_level_values('id').isin(nutrition_db.index)
ingredients_db = ingredients_filt[id_overlap_mask]

# abgleich ob anzahl der indizes von nutrition und zutaten dataframe gleich sind


In [7]:


ingredients_db.reset_index(inplace=True)

recipe_db = pd.get_dummies(ingredients_db['ingredients_id']).groupby(
    ingredients_db['id']).apply(max)



In [8]:
new_ingredients = ingredients_db.copy()
#new_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index()
gramms_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index().copy()

Q1 = gramms_ingredients.quantile(0.25)
Q3 = gramms_ingredients.quantile(0.75)
IQR = Q3 - Q1

#Filter out all recipes which are outlier by their weight (gramms)
df = gramms_ingredients[~((gramms_ingredients < (Q1 - 1.5 * IQR)) |(gramms_ingredients > (Q3 + 1.5 * IQR))).any(axis=1)].copy()

#filter out recipes by weight which are not in the range 500 - 2373.59 gramms
df_start_at_fivehundret = df[df['ingredients_grams'].between(500, 2373.58225, inclusive=False)].copy()

df_start_at_fivehundret.set_index('id', inplace=True)
id_overlap_mask = nutritions['id'].isin(df_start_at_fivehundret.index.get_level_values('id').values)


# erstelle datenframe auf basis der overlapliste
nutritions_filt_gramm = nutritions.loc[id_overlap_mask]

nutrition_db2 = nutritions_filt_gramm.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

#create new nutrition db based on the above filtering
nutrition_db2.set_index('id', inplace=True)
nutrition_db2.columns = nutrition_db2.columns.droplevel(0)



In [375]:
# random
subset_random_top10 = nutrition_db2.sample(n=10).copy()

In [376]:
#select 20 random recipes which are between the Fat-range of 35-40 gramms
subset_fat = nutrition_db2[(nutrition_db2['Fat'] >= 35) & (nutrition_db2['Fat'] <= 40)].sample(n=40).copy()
subset_fat_id = subset_fat.index.to_numpy()


In [377]:
subset_normal = nutrition_db2.sample(n=40).copy()
subset_normal_id = subset_normal.index.to_numpy()

In [378]:
subset_normal.index.to_numpy()

array([ 15408, 132703,  22943, 228915,   8887,   9043, 260193, 242134,
        24321, 160605, 236128,  23979, 166992,   8630,   8503,   9032,
        24798,  60598,  45782,  15744,  71378,  66404,  14572,  16819,
       104934,  12720, 214894, 213206,  14498, 230247,  27819,  24679,
        88186, 222182,   8857,   8825,  16907,  47076,  87144, 231030])

In [379]:
# remove recipes which are missing in nutrition df from the ingredients tecipes dataframe
mask = recipe_db.index.isin(nutrition_db2.index.values)
recipe_db = recipe_db.loc[mask]


In [380]:
subset_fat.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,278.84579,642.9026,335.66595,42.26027,128.54849,3.54856,37.29622,92.15904,3.83611,59.04993,13.80511,596.82064,34.55547,16.29679,1052.05414,6.59649,0.36822,1072.32985,0.51497,22.88373
std,249.45931,125.97656,13.55759,26.71355,69.54013,3.03595,1.5064,63.90179,1.88121,24.23083,7.61424,199.21026,11.62635,4.21322,740.61116,5.65135,0.3111,1169.65874,0.25374,30.68999
min,19.121,425.0375,315.8582,1.23348,0.0,0.0,35.09536,7.2666,0.52164,20.94894,2.05137,306.43,16.83116,9.92352,233.4636,0.09625,0.01345,1.0352,0.17936,0.07625
25%,71.22708,543.86608,324.39002,21.83227,85.27641,1.21571,36.04333,32.2672,2.63211,45.07391,9.3646,445.94878,25.1582,12.61949,706.10267,3.2933,0.13024,454.00743,0.34604,2.96769
50%,210.5312,639.53615,334.82795,38.98913,114.5299,2.95667,37.20311,85.68218,3.72566,57.06377,12.38319,562.4872,34.6981,15.8874,948.319,5.55381,0.21974,921.0486,0.48498,9.73817
75%,407.86145,717.53677,347.30235,56.67772,154.43465,4.64357,38.58915,151.4604,5.11019,67.38881,16.6823,734.96812,39.64098,19.51527,1313.6065,8.65065,0.61568,1224.667,0.64602,29.28499
max,916.5097,901.7867,359.9821,104.2743,402.3,11.88067,39.99801,207.9386,9.3898,138.7532,47.57578,1031.226,71.14468,26.64622,4604.161,28.13915,1.16834,6331.443,1.57042,129.7068


In [381]:
subset_normal.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,108.57185,398.95954,184.84897,23.7606,96.56104,1.90882,20.53877,46.20198,2.65621,47.15821,13.16269,449.33114,28.1297,7.65389,1022.07267,5.79592,0.22704,1170.7565,0.52554,10.01694
std,120.5612,157.98419,97.41025,21.70243,45.5086,2.56239,10.82336,45.38053,1.51332,21.2382,7.68099,188.05641,10.3986,5.36028,1405.74037,9.46419,0.18725,2247.6721,0.29302,14.61179
min,10.68667,148.1671,21.87967,0.82028,4.4,0.0,2.43107,2.96093,0.46062,12.9429,3.97625,185.5863,8.57564,0.67307,98.1412,0.03,0.03742,0.0,0.13703,0.0
25%,33.89906,299.64795,112.48982,6.20793,65.08656,0.45561,12.49887,9.93446,1.52812,30.59614,7.80524,333.60368,23.12123,4.31925,339.0488,1.44107,0.08892,225.6268,0.30661,1.86165
50%,76.85068,379.5373,171.9977,15.38088,89.23188,0.92057,19.11085,25.49709,2.28266,42.33866,11.06848,402.6791,28.13625,5.89127,622.57415,2.85649,0.14834,518.56985,0.50949,4.2054
75%,142.74542,456.69282,272.93095,39.96668,119.27528,2.69247,30.32567,73.87711,3.38355,63.38322,18.86803,551.6383,32.17254,10.24459,961.48963,5.84077,0.31153,872.83763,0.67548,12.41401
max,643.2615,802.3324,421.8061,92.48895,203.0231,12.80419,46.86734,155.9349,7.50701,104.0607,37.62593,1087.707,59.48167,20.92785,7156.844,56.0957,0.68318,13459.9,1.40227,72.5071


In [382]:
user_recipes_fat = recipe_db[recipe_db.index.isin(subset_fat_id)]
user_recipes_normal = recipe_db[recipe_db.index.isin(subset_normal_id)]

In [383]:
#top10 most common ingredients
#16421    2125 salt
#4342     1506 garlic
#4397     1412 onion
#16406    1185 ground black pepper
#16157    1016 butter
#6307      944 olive oil
#6494      786 skinless bone less chicken breast
#2496      749 water
#16238     574 grated Parmesan cheese
#16317     538 eggs


drop_id_list = [16421, 4342, 4397, 16406, 16157, 6307, 6494, 2496, 16238, 16317]





In [384]:
subset_fat_id

array([219766, 239896,  18059,  56412, 231233,  11990, 161869, 235171,
        23849,  75543, 169974, 158587, 147305, 229885, 235000,  18439,
        21352,  72191,  12066, 234797,  23434,  26299, 237320,  14668,
        18795, 206633,  82487,  36621,  40286,  38004, 221304,  16565,
        60096,  32116,  36766, 167052,  23985,  84774,   8998, 238538])

In [385]:
subset_normal_id

array([ 15408, 132703,  22943, 228915,   8887,   9043, 260193, 242134,
        24321, 160605, 236128,  23979, 166992,   8630,   8503,   9032,
        24798,  60598,  45782,  15744,  71378,  66404,  14572,  16819,
       104934,  12720, 214894, 213206,  14498, 230247,  27819,  24679,
        88186, 222182,   8857,   8825,  16907,  47076,  87144, 231030])

In [386]:
#filter out ingredients which are too common and the recipes in the user profile from the recipes database
new_recipe_db = recipe_db.drop(axis=1, labels=drop_id_list)
new_recipe_db_wo_userrecipes_fat = new_recipe_db.drop(axis=0, labels=subset_fat_id)
new_recipe_db_wo_userrecipes_normal = new_recipe_db.drop(axis=0, labels=subset_normal_id)

In [387]:
#remove basic ingredients like garlic, olive oil, skinless bone less chicken breast, butter, milk, salt and pepper, onion
new_user_recipes_fat = user_recipes_fat.drop(axis=1, labels=drop_id_list)
new_user_recipes_normal = user_recipes_normal.drop(axis=1, labels=drop_id_list)

In [667]:
sample_fat = new_user_recipes_fat.sample(n=40).copy()


In [734]:
sample_normal = new_user_recipes_normal.sample(n=20).copy()

In [735]:
sample_normal
sample_fat

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
104934,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23979,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14572,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22943,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60598,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12720,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9032,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
66404,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87144,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
236128,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12066,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
167052,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82487,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
161869,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26299,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
75543,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
238538,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [736]:
## Jaccard - fat

result_array = cdist(sample_fat, new_recipe_db_wo_userrecipes_fat,'jaccard')
result_w_filter_fat = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_fat.index.values, index=sample_fat.index.values)
# result_2

result_w_filter_10_fat = pd.DataFrame(result_w_filter_fat.sum(), columns=['jaccard_distance_sum'])
result_w_filter_10_fat = result_w_filter_10_fat.sort_values(by='jaccard_distance_sum')
result_w_filter_10_fat['jaccard_distance_sum'] = result_w_filter_10_fat['jaccard_distance_sum'].div(20)
result_w_filter_10_fat[0:10]


Unnamed: 0,jaccard_distance_sum
238691,1.85338
11735,1.85338
16998,1.85338
19422,1.85483
18805,1.8576
18349,1.86531
11751,1.86735
230283,1.86774
12016,1.87075
102393,1.87075


In [737]:
## Jaccard - normal

result_array = cdist(sample_normal, new_recipe_db_wo_userrecipes_normal,'jaccard')
result_w_filter_normal = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_normal.index.values, index=sample_normal.index.values)
# result_2

result_w_filter_10_normal = pd.DataFrame(result_w_filter_normal.sum(), columns=['jaccard_distance_sum'])
result_w_filter_10_normal = result_w_filter_10_normal.sort_values(by='jaccard_distance_sum')
result_w_filter_10_normal['jaccard_distance_sum'] = result_w_filter_10_normal['jaccard_distance_sum'].div(20)
result_w_filter_10_normal[0:10]


Unnamed: 0,jaccard_distance_sum
228546,0.92381
8668,0.92566
17392,0.9278
60631,0.92897
8901,0.93132
15181,0.93381
240287,0.93383
8796,0.93486
24793,0.93646
140829,0.9366


In [738]:
# userrecipes fat
sample_fat.index.to_numpy()

array([ 12066,   8998, 167052,  18795,  82487, 161869,  26299,  75543,
       238538,  21352, 158587, 235171,  23849, 221304,  18059, 169974,
        72191, 206633,  36621,  23985,  56412,  36766,  14668, 229885,
       239896, 237320, 234797,  40286, 231233, 219766,  38004,  16565,
        32116,  11990,  84774,  23434, 147305,  18439, 235000,  60096])

In [739]:
# userrecipes normal
sample_normal.index.to_numpy()


array([104934,  23979,  14572,  22943,  60598,  12720,   9032,  66404,
        87144, 236128,  47076,  88186,  16907,   8630,   8887, 242134,
        45782, 132703, 213206,   8825])

In [740]:
result_w_filter_10_fat[0:10].index.to_numpy()

array([238691,  11735,  16998,  19422,  18805,  18349,  11751, 230283,
        12016, 102393])

In [741]:
result_w_filter_10_normal[0:10].index.to_numpy()

array([228546,   8668,  17392,  60631,   8901,  15181, 240287,   8796,
        24793, 140829])

In [742]:
result_w_filter_fat_short = result_w_filter_fat.copy()
result_w_filter_normal_short = result_w_filter_normal.copy()

In [743]:
# top 10 fat short
fat_short = pd.DataFrame(np.sort(result_w_filter_fat_short.values, axis=0), index=result_w_filter_fat_short.index, columns=result_w_filter_fat_short.columns)

new_columns = fat_short.columns[fat_short.loc[fat_short.first_valid_index()].argsort()]
result_fat_short = fat_short[new_columns]
result_fat_short = result_fat_short.reset_index()
result_fat_short = result_fat_short.loc[0].to_frame()
result_fat_short[0:11].index.to_numpy()


array(['index', 89261, 19423, 195045, 87624, 216231, 8621, 22725, 8734,
       8715, 14140], dtype=object)

In [744]:
fat_short[new_columns]

Unnamed: 0,89261,19423,195045,87624,216231,8621,22725,8734,8715,14140,...,23380,21006,8680,230901,8935,45511,23562,23619,73110,17576
12066,0.0,0.25,0.33333,0.33333,0.4,0.4,0.4,0.4,0.4,0.4,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8998,0.875,0.85714,0.5,0.625,0.5,0.875,0.6,0.88889,0.875,0.6,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
167052,1.0,0.875,0.71429,0.75,0.66667,0.88889,0.625,0.88889,0.88889,0.625,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
18795,1.0,0.9,0.8,0.81818,0.77778,0.90909,0.66667,0.9,0.90909,0.66667,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
82487,1.0,0.92308,0.83333,0.9,0.875,0.91667,0.71429,1.0,0.91667,0.71429,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
161869,1.0,1.0,0.88889,0.90909,0.88889,0.92857,0.72727,1.0,0.92857,0.72727,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
26299,1.0,1.0,0.9,0.91667,0.91667,0.93333,0.875,1.0,0.93333,0.875,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75543,1.0,1.0,0.90909,0.92308,1.0,0.93333,0.9,1.0,0.93333,0.9,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
238538,1.0,1.0,0.94444,0.92857,1.0,1.0,0.90909,1.0,1.0,0.90909,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
21352,1.0,1.0,1.0,0.9375,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [745]:
sample_fat

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12066,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
167052,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82487,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
161869,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26299,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
75543,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
238538,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [746]:
# top 10 normal short
normal_short = pd.DataFrame(np.sort(result_w_filter_normal_short.values, axis=0), index=result_w_filter_normal_short.index, columns=result_w_filter_normal_short.columns)

new_columns = normal_short.columns[normal_short.loc[normal_short.first_valid_index()].argsort()]
result_normal_short = normal_short[new_columns]
result_normal_short = result_normal_short.reset_index()
result_normal_short = result_normal_short.loc[0].to_frame()
result_normal_short[0:11].index.to_numpy()


array(['index', 34116, 228546, 214413, 8974, 134137, 214479, 240287,
       218093, 16812, 54196], dtype=object)

In [747]:
normal_short[new_columns]

Unnamed: 0,34116,228546,214413,8974,134137,214479,240287,218093,16812,54196,...,23021,23039,228498,94894,23070,23097,228431,94113,25416,25432
104934,0.25,0.28571,0.33333,0.33333,0.33333,0.4,0.42857,0.44444,0.5,0.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
23979,0.88889,0.8,1.0,0.875,0.88889,0.9,0.77778,0.85714,0.875,0.875,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
14572,0.88889,0.875,1.0,0.88889,0.90909,0.9,0.86667,0.92308,0.93333,0.9,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
22943,1.0,0.875,1.0,0.9,1.0,0.91667,0.875,0.92308,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
60598,1.0,0.88889,1.0,0.91667,1.0,1.0,0.88889,0.9375,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
12720,1.0,0.9,1.0,0.92308,1.0,1.0,0.91667,0.94737,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9032,1.0,0.92308,1.0,0.9375,1.0,1.0,0.92308,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
66404,1.0,0.92857,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
87144,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
236128,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [748]:
sample_fat.index.to_numpy()

array([ 12066,   8998, 167052,  18795,  82487, 161869,  26299,  75543,
       238538,  21352, 158587, 235171,  23849, 221304,  18059, 169974,
        72191, 206633,  36621,  23985,  56412,  36766,  14668, 229885,
       239896, 237320, 234797,  40286, 231233, 219766,  38004,  16565,
        32116,  11990,  84774,  23434, 147305,  18439, 235000,  60096])

In [749]:
sample_normal.index.to_numpy()

array([104934,  23979,  14572,  22943,  60598,  12720,   9032,  66404,
        87144, 236128,  47076,  88186,  16907,   8630,   8887, 242134,
        45782, 132703, 213206,   8825])

In [750]:
#create one dimensional ingredients vector from user recipes profile
user_one_vector_fat = sample_fat.copy()
user_one_vector_fat.loc['sum',:]= user_one_vector_fat.sum(axis=0)
user_one_vector_fat = user_one_vector_fat.drop(axis=0, labels=sample_fat.index)
user_one_vector_fat[user_one_vector_fat > 0] = 1

In [751]:
#create one dimensional ingredients vector from user recipes profile
user_one_vector_normal = sample_normal.copy()
user_one_vector_normal.loc['sum',:]= user_one_vector_normal.sum(axis=0)
user_one_vector_normal = user_one_vector_normal.drop(axis=0, labels=sample_normal.index)
user_one_vector_normal[user_one_vector_normal > 0] = 1

In [752]:
user_one_vector_normal
user_one_vector_normal.apply(pd.value_counts).count(axis=1)

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0.00000    626
1.00000     94
dtype: int64

In [753]:
user_one_vector_fat
user_one_vector_fat.apply(pd.value_counts).count(axis=1)

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0.00000    559
1.00000    161
dtype: int64

In [754]:
## Jaccard - Vector fat top 10

result_array = cdist(user_one_vector_fat, new_recipe_db_wo_userrecipes_fat,'jaccard')
result_w_filter_vector_fat = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_fat.index.values, index=user_one_vector_fat.index.values)


result_w_filter_vector_10_fat = pd.DataFrame(result_w_filter_vector_fat.sum(), columns=['jaccard_distance_sum'])
result_w_filter_vector_10_fat = result_w_filter_vector_10_fat.sort_values(by='jaccard_distance_sum')

result_w_filter_vector_10_fat[0:10]


Unnamed: 0,jaccard_distance_sum
13905,0.92593
180905,0.92638
236805,0.92727
76373,0.92814
87137,0.93252
9005,0.93252
231396,0.93293
111905,0.93333
236781,0.93333
23981,0.93789


In [755]:
## Jaccard - Vector normal top 10

result_array = cdist(user_one_vector_normal, new_recipe_db_wo_userrecipes_normal,'jaccard')
result_w_filter_vector_normal = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_normal.index.values, index=user_one_vector_normal.index.values)


result_w_filter_vector_10_normal = pd.DataFrame(result_w_filter_vector_normal.sum(), columns=['jaccard_distance_sum'])
result_w_filter_vector_10_normal = result_w_filter_vector_10_normal.sort_values(by='jaccard_distance_sum')

result_w_filter_vector_10_normal[0:10]


Unnamed: 0,jaccard_distance_sum
60631,0.90426
31988,0.90722
13905,0.90816
16220,0.90816
52299,0.91089
231706,0.91579
145843,0.91579
228917,0.91667
41690,0.91667
139915,0.91667


In [756]:
result_w_filter_vector_10_normal[0:10].index.to_numpy()

array([ 60631,  31988,  13905,  16220,  52299, 231706, 145843, 228917,
        41690, 139915])

In [757]:
####### results

In [758]:
# userrecipes fat
sample_fat.index.to_numpy()

array([ 12066,   8998, 167052,  18795,  82487, 161869,  26299,  75543,
       238538,  21352, 158587, 235171,  23849, 221304,  18059, 169974,
        72191, 206633,  36621,  23985,  56412,  36766,  14668, 229885,
       239896, 237320, 234797,  40286, 231233, 219766,  38004,  16565,
        32116,  11990,  84774,  23434, 147305,  18439, 235000,  60096])

In [759]:
###### jaccard fat top 10 naiv
result_w_filter_10_fat[0:10].index.to_numpy()

array([238691,  11735,  16998,  19422,  18805,  18349,  11751, 230283,
        12016, 102393])

In [760]:
###### jaccard kurz fat top 10
result_fat_short[0:11].index.to_numpy()


array(['index', 89261, 19423, 195045, 87624, 216231, 8621, 22725, 8734,
       8715, 14140], dtype=object)

In [761]:
###### jaccard vector fat top 10
result_w_filter_vector_10_fat[0:10].index.to_numpy()

array([ 13905, 180905, 236805,  76373,  87137,   9005, 231396, 111905,
       236781,  23981])

In [762]:
# userrecipes normal
sample_normal.index.to_numpy()

array([104934,  23979,  14572,  22943,  60598,  12720,   9032,  66404,
        87144, 236128,  47076,  88186,  16907,   8630,   8887, 242134,
        45782, 132703, 213206,   8825])

In [763]:
###### jaccard normal top 10 naiv
result_w_filter_10_normal[0:10].index.to_numpy()

array([228546,   8668,  17392,  60631,   8901,  15181, 240287,   8796,
        24793, 140829])

In [764]:
###### jaccard kurz normal top 10
result_normal_short[0:11].index.to_numpy()


array(['index', 34116, 228546, 214413, 8974, 134137, 214479, 240287,
       218093, 16812, 54196], dtype=object)

In [765]:
###### jaccard vector normal top 10
result_w_filter_vector_10_normal[0:10].index.to_numpy()

array([ 60631,  31988,  13905,  16220,  52299, 231706, 145843, 228917,
        41690, 139915])

In [766]:
##### random top 10
subset_random_top10.index.to_numpy()

array([ 12043, 214780,   8543,  34357, 214479,  49925, 219064,   8823,
        16767,  46402])