In [251]:
import json

import numpy as np
import pandas as pd
import pymongo
import scipy
from bson import ObjectId, json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient as Connection
from scipy import sparse, spatial
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# scientific notation disabled form smaller numbers
pd.options.display.float_format = '{:.5f}'.format

# alles resultate anzeigen und nicht nur das letzte
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [252]:
# display multiple outputs in one row
import pandas as pd
import numpy as np
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [253]:
connection = Connection()
db = connection.recipe_db
input_data = db.recipes_test_copy

data = json.loads(json_util.dumps(input_data.find()))

In [254]:
ing = pd.DataFrame(json_normalize(data, record_path='ingredients',
                             meta='id', record_prefix='ingredients_', errors='ignore'))


nutritions = pd.DataFrame(json_normalize(data, record_path='nutritions',
                            meta=['id', 'prep_time', 'rating', 'rating_count', 'ready_in_time', 'review_count']))


In [255]:
#  ------  erstellung und data cleansing - Ingredients

# schmeiss alle zutaten raus, die weniger als 5 mal verwendet werden
# setze multiinde auf 'id' und 'ingredients_id'
ingredients = ing.set_index(['id', 'ingredients_id'])

# filtere alle Zutaten samt ihrer rezepte id, die weniger gleich 5 mal vorkommen
ingredients_eqles_5_ing = ingredients.groupby(
    'ingredients_id').filter(lambda x: len(x) <= 5)

# droppe alle rezepte, die eine Zutate besitzen, die weniger gleich 5 Mal vorkommen
ingredients_filt = ingredients.drop(ingredients_eqles_5_ing.index.get_level_values('id').values, level=0)

# drop alls rows with ingredients_id == 0
ingredients_eqal_zero = ingredients_filt[ingredients_filt.index.get_level_values('ingredients_id') == 0]
ingredients_filt = ingredients_filt.drop(ingredients_eqal_zero.index.get_level_values('id').values, level=0)


In [256]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(ingredients_filt.index.get_level_values('id').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_db = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_db.columns = nutrition_db.columns.droplevel(0)

# entferne alle NA
nutrition_db = nutrition_db.dropna()

# gleiche nochmals die ids der beiden dataframe nutrition und ingredients ab, da der nutrition dataframe noch NA Werte hatt
id_overlap_mask = ingredients_filt.index.get_level_values('id').isin(nutrition_db.index)
ingredients_db = ingredients_filt[id_overlap_mask]

# abgleich ob anzahl der indizes von nutrition und zutaten dataframe gleich sind


In [257]:


ingredients_db.reset_index(inplace=True)

recipe_db = pd.get_dummies(ingredients_db['ingredients_id']).groupby(
    ingredients_db['id']).apply(max)



In [258]:
new_ingredients = ingredients_db.copy()
#new_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index()
gramms_ingredients = new_ingredients.groupby("id")["ingredients_grams"].sum().reset_index().copy()

Q1 = gramms_ingredients.quantile(0.25)
Q3 = gramms_ingredients.quantile(0.75)
IQR = Q3 - Q1

#Filter out all recipes which are outlier by their weight (gramms)
df = gramms_ingredients[~((gramms_ingredients < (Q1 - 1.5 * IQR)) |(gramms_ingredients > (Q3 + 1.5 * IQR))).any(axis=1)].copy()

#filter out recipes by weight which are not in the range 500 - 2373.59 gramms
df_start_at_fivehundret = df[df['ingredients_grams'].between(500, 2373.58225, inclusive=False)].copy()

df_start_at_fivehundret.set_index('id', inplace=True)
id_overlap_mask = nutritions['id'].isin(df_start_at_fivehundret.index.get_level_values('id').values)


# erstelle datenframe auf basis der overlapliste
nutritions_filt_gramm = nutritions.loc[id_overlap_mask]

nutrition_db2 = nutritions_filt_gramm.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

#create new nutrition db based on the above filtering
nutrition_db2.set_index('id', inplace=True)
nutrition_db2.columns = nutrition_db2.columns.droplevel(0)



In [495]:
#select 20 random recipes which are between the Fat-range of 35-40 gramms
subset_fat = nutrition_db2[(nutrition_db2['Fat'] >= 35) & (nutrition_db2['Fat'] <= 40)].sample(n=40).copy()




In [497]:
subset_normal = nutrition_db2.sample(n=20).copy()
subset_normal_id = subset_normal.index.to_numpy()

In [498]:
subset_fat_id = subset_fat.index.to_numpy()
subset_normal.index.to_numpy()

array([ 81051,  84737,  85375,  71251, 222079, 214188,  13420,  34942,
        34361,  26601, 233391,   8886,  72133,  11739,  20921, 235158,
        68578, 185519,  24783, 213701])

In [499]:
# remove recipes which are missing in nutrition df from the ingredients tecipes dataframe
mask = recipe_db.index.isin(nutrition_db2.index.values)
recipe_db = recipe_db.loc[mask]


In [500]:
subset_fat.describe()

name,Calcium,Calories,Calories from Fat,Carbohydrates,Cholesterol,Dietary Fiber,Fat,Folate,Iron,Magnesium,Niacin Equivalents,Potassium,Protein,Saturated Fat,Sodium,Sugars,Thiamin,Vitamin A - IU,Vitamin B6,Vitamin C
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,187.05522,624.57142,335.74093,38.5982,119.09344,3.65197,37.30455,79.06782,3.83121,63.69323,14.77116,703.21881,33.39013,15.22493,905.67327,5.76267,0.39613,1557.70814,0.62261,23.62058
std,184.02054,149.19968,13.59677,30.90561,44.28798,3.23066,1.51075,69.85221,2.19255,30.08197,7.12187,290.60403,10.59393,5.14999,609.31445,6.87882,0.32872,1777.73681,0.24946,31.58551
min,19.9788,390.7198,316.8087,1.52447,41.7375,0.0,35.20097,3.39608,0.52164,3.57022,0.1561,287.9466,14.9787,5.39091,176.4401,0.09294,0.00777,1.0352,0.01814,1.468
25%,52.60393,499.92448,322.2759,7.63163,85.63534,1.31421,35.80844,23.38468,2.08828,40.90641,9.71164,489.90708,24.82938,11.87391,485.23282,2.20282,0.13024,506.5445,0.45036,4.15122
50%,114.51445,601.08895,337.3887,32.9774,115.6552,3.19061,37.48763,43.79744,3.57956,61.88055,13.09494,650.9013,33.88768,14.36562,737.89765,3.42023,0.30761,967.06095,0.58153,11.58257
75%,268.8609,729.82555,346.13813,61.61826,145.95542,5.12797,38.45979,150.71375,5.18045,84.08676,19.42007,911.7597,39.9685,18.68514,1199.235,6.03771,0.64958,1479.07775,0.7756,29.04101
max,754.3973,913.1966,358.1345,104.2743,235.151,12.12185,39.79272,220.8087,9.3898,134.5539,31.50643,1642.204,56.95351,26.64622,3117.706,35.81975,1.16834,7995.903,1.2931,147.357


In [501]:
user_recipes_fat = recipe_db[recipe_db.index.isin(subset_fat_id)]
user_recipes_normal = recipe_db[recipe_db.index.isin(subset_normal_id)]

In [502]:
#top10 most common ingredients
#16421    2125 salt
#4342     1506 garlic
#4397     1412 onion
#16406    1185 ground black pepper
#16157    1016 butter
#6307      944 olive oil
#6494      786 skinless bone less chicken breast
#2496      749 water
#16238     574 grated Parmesan cheese
#16317     538 eggs


drop_id_list = [16421, 4342, 4397, 16406, 16157, 6307, 6494, 2496, 16238, 16317]





In [503]:
subset_fat_id

array([ 48873,  16966,  56412,   8556,  16682,  21352,  14735,  42967,
        17496, 213140, 231537,  84774, 219173,  26299,   8679, 132703,
        30007, 234797, 235000,  38028,   8630,  76808, 202881,  82487,
        14668,  16260, 229247, 169322,  22702,  87137,  25137, 140135,
       149738,  76763,  72191,  62706, 214785,  40286,  18397, 164208])

In [504]:
subset_normal_id

array([ 81051,  84737,  85375,  71251, 222079, 214188,  13420,  34942,
        34361,  26601, 233391,   8886,  72133,  11739,  20921, 235158,
        68578, 185519,  24783, 213701])

In [505]:
#filter out ingredients which are too common and the recipes in the user profile from the recipes database
new_recipe_db = recipe_db.drop(axis=1, labels=drop_id_list)
new_recipe_db_wo_userrecipes_fat = new_recipe_db.drop(axis=0, labels=subset_fat_id)
new_recipe_db_wo_userrecipes_normal = new_recipe_db.drop(axis=0, labels=subset_normal_id)

In [506]:
#remove basic ingredients like garlic, olive oil, skinless bone less chicken breast, butter, milk, salt and pepper, onion
new_user_recipes_fat = user_recipes_fat.drop(axis=1, labels=drop_id_list)
new_user_recipes_normal = user_recipes_normal.drop(axis=1, labels=drop_id_list)

In [507]:
asd = new_user_recipes_fat

In [590]:
new_user_recipes_fat

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8630,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8679,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14668,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14735,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16260,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16682,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16966,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [621]:
sample = new_user_recipes_fat.sample(n=5)

In [592]:
sample

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16682,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16260,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25137,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87137,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22702,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
164208,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
169322,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
62706,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [593]:
## Jaccard - recipe to recipe similarity with removal of basis ingredients and duplicate userrecipes

result_array = cdist(sample, new_recipe_db_wo_userrecipes_fat,'jaccard')
result_w_filter = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_fat.index.values, index=sample.index.values)
# result_2

result_w_filter_10 = pd.DataFrame(result_w_filter.sum(), columns=['jaccard_distance_sum'])
result_w_filter_10 = result_w_filter_10.sort_values(by='jaccard_distance_sum')
result_w_filter_10['jaccard_distance_sum'] = result_w_filter_10['jaccard_distance_sum'].div(20)
result_w_filter_10[0:10]


Unnamed: 0,jaccard_distance_sum
8796,1.88945
231396,1.89496
255038,1.89589
22991,1.89667
11905,1.89903
18349,1.90008
23181,1.90011
19128,1.90161
228680,1.90295
17869,1.9061


In [594]:
sample.index.to_numpy()

array([ 16682,  17496,  16260,  25137,  87137,  22702, 164208, 169322,
        62706,  30007,   8679,  40286,  26299,  14668,   8630, 213140,
         8556,  84774,  38028,  56412,  48873, 214785,  14735, 132703,
       231537,  82487, 219173,  76808,  76763,  16966, 229247, 202881,
       234797,  72191,  42967,  18397,  21352, 235000, 149738, 140135])

In [595]:
new_user_recipes_fat[0:20].index.to_numpy()

array([ 8556,  8630,  8679, 14668, 14735, 16260, 16682, 16966, 17496,
       18397, 21352, 22702, 25137, 26299, 30007, 38028, 40286, 42967,
       48873, 56412])

In [609]:
result_w_filter_10[0:10].index.to_numpy()

array([  8796, 231396, 255038,  22991,  11905,  18349,  23181,  19128,
       228680,  17869])

In [597]:
result_w_filter_test = result_w_filter.copy()

In [598]:
result_w_filter_test

Unnamed: 0,7198,8493,8494,8495,8496,8497,8498,8500,8503,8506,...,254874,254940,255038,255263,255545,255936,257312,260193,261124,263813
16682,1.0,1.0,1.0,1.0,1.0,1.0,0.85714,1.0,1.0,0.9,...,1.0,1.0,0.92857,0.875,0.92308,1.0,1.0,1.0,1.0,0.90909
17496,1.0,1.0,1.0,1.0,1.0,0.90909,0.8,0.92308,1.0,1.0,...,1.0,0.90909,0.8125,0.91667,0.875,1.0,0.91667,1.0,1.0,0.93333
16260,1.0,1.0,1.0,1.0,1.0,1.0,0.875,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9,1.0
25137,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.92857,1.0,...,1.0,1.0,1.0,1.0,0.94444,1.0,0.92308,1.0,1.0,0.9375
87137,1.0,0.9375,1.0,1.0,1.0,1.0,0.9375,1.0,0.94444,0.94737,...,0.9375,1.0,0.95652,1.0,1.0,1.0,0.94118,1.0,0.94444,0.95
22702,1.0,0.875,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
164208,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9,...,1.0,1.0,1.0,1.0,1.0,0.90909,1.0,1.0,1.0,1.0
169322,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.9375,1.0,0.93333,1.0,1.0,1.0,1.0,1.0
62706,1.0,1.0,0.93333,1.0,1.0,1.0,1.0,0.8,1.0,1.0,...,1.0,1.0,0.9375,0.9,0.93333,1.0,1.0,1.0,1.0,1.0
30007,1.0,1.0,0.93333,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.92308


In [599]:
df1 = pd.DataFrame(np.sort(result_w_filter_test.values, axis=0), index=result_w_filter_test.index, columns=result_w_filter_test.columns)

In [600]:
new_columns = df1.columns[df1.ix[df1.first_valid_index()].argsort()]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [601]:
test = df1[new_columns]

In [602]:
test = test.reset_index()

In [603]:
test

Unnamed: 0,index,16470,228546,195045,229994,222582,240287,216231,23985,36944,...,229099,58431,8919,87053,8918,58831,22297,16446,8935,22204
0,16682,0.25,0.28571,0.33333,0.33333,0.42857,0.42857,0.5,0.5,0.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,17496,0.8,0.88889,0.8,0.8125,0.81818,0.875,0.77778,0.77778,0.9,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,16260,0.85714,0.9,0.88889,0.90476,0.88889,0.88889,0.85714,0.875,0.9,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,25137,0.91667,0.9,0.9,0.92308,0.9,0.88889,0.875,0.88889,0.92308,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,87137,0.91667,0.90909,0.90909,0.92308,0.90909,0.9,0.88889,0.91667,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,22702,0.92308,0.90909,0.91667,0.92857,0.91667,0.9,0.91667,0.92308,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,164208,0.92308,0.90909,0.92308,0.92857,0.92308,0.90909,0.92308,0.9375,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,169322,0.92308,0.91667,0.92857,0.92857,0.92857,0.91667,0.9375,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,62706,0.92857,0.91667,0.94118,0.92857,0.93333,0.91667,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,30007,0.93333,0.92308,0.94444,0.9375,0.94444,0.92857,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [604]:
asd[0:20]

Unnamed: 0,0
index,76763.0
195045,0.33333
222582,0.42857
13981,0.5
23985,0.5
36944,0.5
216231,0.5
223360,0.55556
11815,0.55556
87168,0.55556


In [605]:
asd = test.loc[0].to_frame()

In [606]:
asd[0:11].index.to_numpy()


array(['index', 16470, 228546, 195045, 229994, 222582, 240287, 216231,
       23985, 36944, 13981], dtype=object)

In [607]:
subset_normal_id[0:10]

array([ 81051,  84737,  85375,  71251, 222079, 214188,  13420,  34942,
        34361,  26601])

In [608]:
subset_fat_id

array([ 48873,  16966,  56412,   8556,  16682,  21352,  14735,  42967,
        17496, 213140, 231537,  84774, 219173,  26299,   8679, 132703,
        30007, 234797, 235000,  38028,   8630,  76808, 202881,  82487,
        14668,  16260, 229247, 169322,  22702,  87137,  25137, 140135,
       149738,  76763,  72191,  62706, 214785,  40286,  18397, 164208])

In [636]:
sample.index.to_numpy()

array([  8630, 202881, 164208,   8679,  84774])

In [631]:
#create one dimensional ingredients vector from user recipes profile
user_one_vector = sample.copy()
user_one_vector.loc['sum',:]= user_one_vector.sum(axis=0)
user_one_vector = user_one_vector.drop(axis=0, labels=sample.index)
user_one_vector[user_one_vector > 0] = 1

In [632]:
user_one_vector

Unnamed: 0_level_0,111,126,257,443,445,578,615,629,631,858,...,23047,23274,23383,24865,25518,25522,26269,26934,27343,27805
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [634]:
## Jaccard - user vector to recipe similarity with removal of basis ingredients and duplicate userrecipes

result_array = cdist(user_one_vector, new_recipe_db_wo_userrecipes_fat,'jaccard')
result_w_filter_vector = pd.DataFrame(result_array, columns=new_recipe_db_wo_userrecipes_fat.index.values, index=user_one_vector.index.values)


result_w_filter_vector_10 = pd.DataFrame(result_w_filter_vector.sum(), columns=['jaccard_distance_sum'])
result_w_filter_vector_10 = result_w_filter_vector_10.sort_values(by='jaccard_distance_sum')

result_w_filter_vector_10[0:10]


Unnamed: 0,jaccard_distance_sum
87627,0.78125
70447,0.8125
132801,0.83871
223360,0.83871
23849,0.84375
8796,0.84848
23981,0.85714
22478,0.86111
102831,0.86111
20611,0.86667


In [633]:
user_one_vector.apply(pd.value_counts).count(axis=1)

0.00000    690
1.00000     30
dtype: int64

In [635]:

result_w_filter_vector_10[0:10].index.to_numpy()

array([ 87627,  70447, 132801, 223360,  23849,   8796,  23981,  22478,
       102831,  20611])