In [None]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm_notebook as tqdm
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import correlation

In [None]:
# Colab-only: mount Google Drive so the dataset paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Beauty product subset: review records (ratings_path) and product metadata (meta_path).
# Both files are read line by line as JSON in the loading cells.
# Alternative locations relative to the working directory (disabled):
# ratings_path = './All_Beauty.json'
# meta_path = './meta_All_Beauty.json'
ratings_path = '/content/drive/MyDrive/Colab Notebooks/data/All_Beauty.json'
meta_path = '/content/drive/MyDrive/Colab Notebooks/data/meta_All_Beauty.json'

In [None]:
# Load product metadata: the file holds one JSON object per line.
meta = []
# Explicit encoding so decoding does not depend on the platform default.
with open(meta_path, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        # A blank line (e.g. a trailing newline at end of file) is not valid JSON; skip it.
        if line.strip():
            meta.append(json.loads(line))

In [None]:
len(meta)

32892

In [None]:
meta[0]  # Structure of metadata

{'also_buy': [],
 'also_view': [],
 'asin': '6546546450',
 'brand': 'idea village',
 'category': [],
 'date': '',
 'description': ["Loud 'N Clear Personal Sound Amplifier allows you to turn up the volume on what people around you are saying, listen at the level you want without disturbing others, hear a pin drop from across the room."],
 'details': {'ASIN: ': '6546546450'},
 'feature': [],
 'fit': '',
 'image': [],
 'main_cat': 'All Beauty',
 'price': '',
 'rank': '2,938,573 in Beauty & Personal Care (',
 'similar_item': '',
 'tech1': '',
 'tech2': '',
 'title': "Loud 'N Clear&trade; Personal Sound Amplifier"}

In [None]:
# Index metadata records by the ASIN value stored in their 'details' section.
# The first record seen for a given ASIN is kept; later ones are ignored.
meta_hash = {}
for record in meta:
    for detail_name, detail_value in record['details'].items():
        if not detail_name.startswith('ASIN: '):
            continue
        meta_hash.setdefault(detail_value, record)

In [None]:
# get product from product id
def get_product(id: str):
  """Look up a product by id and return it in display form.

  Returns the dict built by ``to_product`` from the record stored under ``id``
  in ``meta_hash``, or ``None`` when ``id`` is not a key of ``meta_hash``.
  """
  if id not in meta_hash:
    return None
  return to_product(meta_hash[id])

In [None]:
# product formatter
def to_product(item):
    """Return a new dict with only the 'title', 'brand' and 'main_cat' entries of ``item``.

    Raises ``KeyError`` if ``item`` lacks any of those keys.
    """
    return {field: item[field] for field in ('title', 'brand', 'main_cat')}

In [None]:
get_product('6546546450')

{'brand': 'idea village',
 'main_cat': 'All Beauty',
 'title': "Loud 'N Clear&trade; Personal Sound Amplifier"}

In [None]:
# Load review records: the file holds one JSON object per line.
data = []
# Explicit encoding so decoding does not depend on the platform default.
with open(ratings_path, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        # A blank line (e.g. a trailing newline at end of file) is not valid JSON; skip it.
        if line.strip():
            data.append(json.loads(line))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
len(data)

371345

In [None]:
# Structure of review data
# asin = product_id
data[0]

{'asin': '0143026860',
 'overall': 1.0,
 'reviewText': 'great',
 'reviewTime': '02 19, 2015',
 'reviewerID': 'A1V6B6TNIC10QE',
 'reviewerName': 'theodore j bigham',
 'summary': 'One Star',
 'unixReviewTime': 1424304000,
 'verified': True}

In [None]:
# Collect the three fields used for collaborative filtering into parallel column lists.
dictionary = {'overall': [], 'reviewerID': [], 'asin': []}
# NOTE(review): `mapping` is not used in the visible cells, and its last entry is keyed
# new-name -> source-field while the first two are source-field -> new-name.
mapping = {'overall': 'rating', 'reviewerID': 'reviewer', 'productID': 'asin'}
for d in data:
    # Take a review only when it carries every field. Appending field by field would let
    # a single incomplete record shift the lists out of alignment with each other.
    if all(key in d for key in dictionary):
        for key in dictionary:
            dictionary[key].append(d[key])

In [None]:
df = pd.DataFrame(dictionary)
df

Unnamed: 0,overall,reviewerID,asin
0,1.0,A1V6B6TNIC10QE,0143026860
1,4.0,A2F5GHSXFQ0W6J,0143026860
2,4.0,A1572GUYS7DGSR,0143026860
3,5.0,A1PSGLFK1NSVO,0143026860
4,5.0,A6IKXKZMTKGSC,0143026860
...,...,...,...
371340,1.0,A202DCI7TV1022,B01HJEGTYK
371341,5.0,A3FSOR5IJOFIBE,B01HJEGTYK
371342,5.0,A1B5DK6CTP2P24,B01HJEGTYK
371343,2.0,A23OUYS5IRMJS9,B01HJEGTYK


In [None]:
user_criteria = True
product_criteria = True
if user_criteria:
    user_threshold = 1   # only keep users with more than `user_threshold` reviews
    n = df.groupby(['reviewerID']).count()
    reviewer_ids = n[n['asin'] > user_threshold].index
    df = df[df['reviewerID'].isin(reviewer_ids)]
# Last top-level expression so the notebook actually displays the resulting shape
# (an expression nested inside the `if` is evaluated but never shown).
df.shape

In [None]:
if product_criteria:
    product_threshold = 2  # only keep products with more than `product_threshold` remaining reviews
    m = df.groupby(['asin']).count()
    product_ids = m[m['reviewerID'] > product_threshold].index
    df = df[df['asin'].isin(product_ids)]
# Last top-level expression so the notebook actually displays the resulting shape
# (an expression nested inside the `if` is evaluated but never shown).
df.shape

In [None]:
df

Unnamed: 0,overall,reviewerID,asin
45,2.0,A1Z8A548Z31SUB,1620213982
56,5.0,A1Z7KJ7SBYTDA8,1620213982
69,5.0,A1FYN0MFZ6MQL3,1620213982
76,5.0,A1T2B5PFIP9TY1,1620213982
78,5.0,A2RL2YV966PEF8,1620213982
...,...,...,...
370984,5.0,A1J0BZEQL31JD9,B01HD23OJG
371133,5.0,A33TMCS1GN2716,B01HFUXIGA
371137,5.0,A2YB19E1XIL7Y4,B01HFUXIGA
371138,1.0,A1G6J2NIA6Q6I8,B01HFUXIGA


In [None]:
# average_rating_per_user = df.groupby('reviewerID').mean()
# average_rating_per_user

In [None]:
# def average(row):
#     rid = row['reviewerID']
#     avg = average_rating_per_user.loc[rid]['overall']
#     return avg

In [None]:
# df['average'] = df.apply(average, axis = 1)
# df['normalized'] = (df['overall'] - df['average'])

In [None]:
df

Unnamed: 0,overall,reviewerID,asin
45,2.0,A1Z8A548Z31SUB,1620213982
56,5.0,A1Z7KJ7SBYTDA8,1620213982
69,5.0,A1FYN0MFZ6MQL3,1620213982
76,5.0,A1T2B5PFIP9TY1,1620213982
78,5.0,A2RL2YV966PEF8,1620213982
...,...,...,...
370984,5.0,A1J0BZEQL31JD9,B01HD23OJG
371133,5.0,A33TMCS1GN2716,B01HFUXIGA
371137,5.0,A2YB19E1XIL7Y4,B01HFUXIGA
371138,1.0,A1G6J2NIA6Q6I8,B01HFUXIGA


In [None]:
# Reviewer x product rating matrix. No aggfunc is passed, so pivot_table applies its
# default aggregation (mean) when a reviewer has several ratings for the same product;
# reviewer/product pairs with no rating are filled with 0.
pt = df.pivot_table(values="overall", index=['reviewerID'], columns=['asin'], fill_value=0)
pt

NameError: ignored

In [None]:
# Hold out 2% of the reviewers (rows of pt) for evaluation. The fixed random_state keeps
# the split, and every figure measured on it, identical across runs.
pt_train, pt_test = train_test_split(pt, test_size=0.020, random_state=42)

In [None]:
pt_test

asin,1620213982,9742121109,9790787006,B000050B63,B000050B65,B000050B6B,B000050B6H,B000050FDT,B000050FDY,B000052YAN,B000052YD8,B00005308B,B0000530HU,B00005355V,B00005JS5C,B00005U8U8,B000065AB1,B000067E30,B000068PBJ,B000068PBL,B000068PBM,B000068PBP,B00006IGL2,B00006ISDE,B00006L9LC,B00009RB0X,B00009RB0Y,B00009RB0Z,B00009RB10,B00009RB11,B00009RB1C,B00009RB1E,B00009RB1I,B0000C4COX,B0000C4KJ9,B0000C4KJX,B0000Y3DD2,B0000Y8IOQ,B0000ZHGZ2,B00011QUDE,...,B01GR1U0FK,B01GR3AU12,B01GR4IGSU,B01GR53BYI,B01GS9PKJW,B01GSXET1S,B01GW09XRW,B01GW54IIG,B01GX3DTFU,B01GY3H018,B01GZWUSYY,B01H0Q7AXG,B01H0TJCZM,B01H1JX0U4,B01H2ERF72,B01H2L415O,B01H3IVSAC,B01H3KUIY2,B01H3ZQ2NI,B01H43AIES,B01H43EMF4,B01H5MUJSI,B01H640HTG,B01H6W0UH2,B01H71ND58,B01H71ND76,B01H71NDT4,B01H71NG0K,B01H7QSCVS,B01HAH5ZVY,B01HATTFWW,B01HB4BS1C,B01HBSH2EK,B01HBXID8Y,B01HC6G4D6,B01HC9ONI6,B01HCPNYR6,B01HD23OJG,B01HFUXIGA,B01HI5VPW6
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
AVMJ7CVFSFD3D,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A1U6HDJVU93RP0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A1FWRXN3S3AN8D,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
AJKAA23DBCZE4,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
AADQU89B15IUU,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A3DRVAX8TC5NWR,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A3S5MHB0P2RTI5,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A2Y3JW8L3R0AD6,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A3GYAQD39V16YO,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
pt_csr = csr_matrix(pt_train)

In [None]:
pt_csr

<34162x3059 sparse matrix of type '<class 'numpy.float64'>'
	with 62804 stored elements in Compressed Sparse Row format>

In [None]:
# Starting Point

In [None]:
# Neighbour search configuration: brute-force search with manhattan distance,
# returning k neighbours per query; n_jobs=-1 is passed through to scikit-learn.
k = 15
model_knn = NearestNeighbors(metric='manhattan', algorithm='brute', n_neighbors=k, n_jobs=-1)

In [None]:
model_knn.fit(pt_csr)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='manhattan',
                 metric_params=None, n_jobs=-1, n_neighbors=15, p=2,
                 radius=1.0)

In [None]:
# Test on first item in train set. Just for verification

In [None]:
distances, indices = model_knn.kneighbors(pt_csr[0], k)

In [None]:
distances

array([[ 0.,  9.,  9., 14., 14., 15., 15., 15., 15., 15., 15., 15., 15.,
        15., 15.]])

In [None]:
indices

array([[    0, 10532, 10093, 18885, 30675, 12958, 12955,  1564, 19270,
        20348, 16066, 32421, 19278,  6398, 12926]])

In [None]:
def get_non_zero_values(sample):
    """Return ``sample`` restricted to the columns holding at least one value above zero.

    ``sample`` is a DataFrame; a column is kept when any of its rows is > 0.
    All rows are returned, only the column set shrinks.
    """
    has_positive = (sample > 0).any()
    kept_columns = has_positive[has_positive].index
    return sample[kept_columns]

In [None]:
# Trying on random test data

In [None]:
# Pick a random held-out user by row position; _input is that user's full rating row.
# The draw uses the unseeded global NumPy RNG, so a different user is shown on every run.
query_index = np.random.choice(pt_test.shape[0])
print(query_index)
_input = pt_test.iloc[query_index]
_input

337


asin
1620213982    0.0
9742121109    0.0
9790787006    0.0
B000050B63    0.0
B000050B65    0.0
             ... 
B01HC9ONI6    0.0
B01HCPNYR6    0.0
B01HD23OJG    0.0
B01HFUXIGA    0.0
B01HI5VPW6    0.0
Name: A22J92A80WI059, Length: 3059, dtype: float64

In [None]:
sample = _input.to_frame().T
sample

asin,1620213982,9742121109,9790787006,B000050B63,B000050B65,B000050B6B,B000050B6H,B000050FDT,B000050FDY,B000052YAN,B000052YD8,B00005308B,B0000530HU,B00005355V,B00005JS5C,B00005U8U8,B000065AB1,B000067E30,B000068PBJ,B000068PBL,B000068PBM,B000068PBP,B00006IGL2,B00006ISDE,B00006L9LC,B00009RB0X,B00009RB0Y,B00009RB0Z,B00009RB10,B00009RB11,B00009RB1C,B00009RB1E,B00009RB1I,B0000C4COX,B0000C4KJ9,B0000C4KJX,B0000Y3DD2,B0000Y8IOQ,B0000ZHGZ2,B00011QUDE,...,B01GR1U0FK,B01GR3AU12,B01GR4IGSU,B01GR53BYI,B01GS9PKJW,B01GSXET1S,B01GW09XRW,B01GW54IIG,B01GX3DTFU,B01GY3H018,B01GZWUSYY,B01H0Q7AXG,B01H0TJCZM,B01H1JX0U4,B01H2ERF72,B01H2L415O,B01H3IVSAC,B01H3KUIY2,B01H3ZQ2NI,B01H43AIES,B01H43EMF4,B01H5MUJSI,B01H640HTG,B01H6W0UH2,B01H71ND58,B01H71ND76,B01H71NDT4,B01H71NG0K,B01H7QSCVS,B01HAH5ZVY,B01HATTFWW,B01HB4BS1C,B01HBSH2EK,B01HBXID8Y,B01HC6G4D6,B01HC9ONI6,B01HCPNYR6,B01HD23OJG,B01HFUXIGA,B01HI5VPW6
A22J92A80WI059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# non zero entries
nz_sample = get_non_zero_values(sample)
nz_sample

asin,B001F51RAG,B002O2JXWS
A22J92A80WI059,5.0,5.0


In [None]:
# Pick one of the user's rated products at random; hide_column is its product id.
rated_columns = nz_sample.columns
hide_index = np.random.randint(len(rated_columns))
hide_column = rated_columns[hide_index]
hide_column

'B002O2JXWS'

In [None]:
# Copy the user's row and zero out the hidden product; _input itself is left unchanged.
_input_test = _input.copy()
_input_test.loc[hide_column] = 0

In [None]:
# Turn the masked row back into a one-row frame and keep only what is still rated.
sample_test = _input_test.to_frame().T
nz_sample_test = get_non_zero_values(sample_test)
nz_sample_test

asin,B001F51RAG
A22J92A80WI059,5.0


In [None]:
distances, indices = model_knn.kneighbors(_input_test.values.reshape(1, -1), k)
distances, indices

(array([[0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 2. , 2. , 2. ,
         2. , 3. ]]),
 array([[13573, 31432,  8609,  7166, 23273, 13016, 29214, 12285, 16727,
         12095, 12264, 28245,  9673, 21811, 21386]]))

In [None]:
result = pt_train.iloc[indices[0]]
result

asin,1620213982,9742121109,9790787006,B000050B63,B000050B65,B000050B6B,B000050B6H,B000050FDT,B000050FDY,B000052YAN,B000052YD8,B00005308B,B0000530HU,B00005355V,B00005JS5C,B00005U8U8,B000065AB1,B000067E30,B000068PBJ,B000068PBL,B000068PBM,B000068PBP,B00006IGL2,B00006ISDE,B00006L9LC,B00009RB0X,B00009RB0Y,B00009RB0Z,B00009RB10,B00009RB11,B00009RB1C,B00009RB1E,B00009RB1I,B0000C4COX,B0000C4KJ9,B0000C4KJX,B0000Y3DD2,B0000Y8IOQ,B0000ZHGZ2,B00011QUDE,...,B01GR1U0FK,B01GR3AU12,B01GR4IGSU,B01GR53BYI,B01GS9PKJW,B01GSXET1S,B01GW09XRW,B01GW54IIG,B01GX3DTFU,B01GY3H018,B01GZWUSYY,B01H0Q7AXG,B01H0TJCZM,B01H1JX0U4,B01H2ERF72,B01H2L415O,B01H3IVSAC,B01H3KUIY2,B01H3ZQ2NI,B01H43AIES,B01H43EMF4,B01H5MUJSI,B01H640HTG,B01H6W0UH2,B01H71ND58,B01H71ND76,B01H71NDT4,B01H71NG0K,B01H7QSCVS,B01HAH5ZVY,B01HATTFWW,B01HB4BS1C,B01HBSH2EK,B01HBXID8Y,B01HC6G4D6,B01HC9ONI6,B01HCPNYR6,B01HD23OJG,B01HFUXIGA,B01HI5VPW6
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
A2QKTEBIMJIWCK,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A32D1MNV0928XZ,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A1O1D3OU5BP96X,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A10FO9BPP977EC,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A1PEXYDB1H25O7,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A3H9GGCYCTPN9C,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A8O3R240GUVY3,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A1TIQNQJZ2LDNW,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A274G6XNMXR4VO,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
AE97MPCYL5LRN,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
nz_result = get_non_zero_values(result)
nz_result

asin,B000FOI48G,B000PARERW,B000X7ST9Y,B001F51RAG,B0062N624M
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A2QKTEBIMJIWCK,0.0,0,0,5.0,0
A32D1MNV0928XZ,0.0,0,0,5.0,0
A1O1D3OU5BP96X,0.0,0,0,5.0,0
A10FO9BPP977EC,0.0,0,0,5.0,0
A1PEXYDB1H25O7,0.0,0,0,5.0,0
A3H9GGCYCTPN9C,0.0,0,0,5.0,0
A8O3R240GUVY3,0.0,0,0,5.0,0
A1TIQNQJZ2LDNW,0.0,0,0,5.0,0
A274G6XNMXR4VO,0.0,0,0,5.0,0
AE97MPCYL5LRN,0.0,0,0,4.5,0


In [None]:
# Building top N recommendation

In [None]:
# For each neighbour (a row of `result`), list the products it rated above zero,
# keyed by the neighbour's row label.
recommendations = {
    reviewer: list(ratings[ratings > 0].index)
    for reviewer, ratings in result.iterrows()
}

In [None]:
list(recommendations.values())

[['B001F51RAG'],
 ['B001F51RAG'],
 ['B001F51RAG'],
 ['B001F51RAG'],
 ['B001F51RAG'],
 ['B001F51RAG'],
 ['B001F51RAG'],
 ['B001F51RAG'],
 ['B001F51RAG'],
 ['B001F51RAG'],
 ['B001F51RAG'],
 ['B000X7ST9Y', 'B001F51RAG'],
 ['B001F51RAG', 'B0062N624M'],
 ['B000PARERW', 'B001F51RAG'],
 ['B000FOI48G', 'B001F51RAG']]

In [None]:
nz_sample

asin,B001F51RAG,B002O2JXWS
A22J92A80WI059,5.0,5.0


In [None]:
# Union of every neighbour's products, de-duplicated through a set
# (so the order of `flat` is arbitrary).
unique_items = set()
for neighbour_items in recommendations.values():
    unique_items.update(neighbour_items)
flat = list(unique_items)
flat, len(flat)

(['B000X7ST9Y', 'B000FOI48G', 'B001F51RAG', 'B0062N624M', 'B000PARERW'], 5)

In [None]:
nz_sample

asin,B001F51RAG,B002O2JXWS
A22J92A80WI059,5.0,5.0


In [None]:
nz_sample_test

asin,B001F51RAG
A22J92A80WI059,5.0


In [None]:
hide_column

'B002O2JXWS'

In [None]:
hide_column in flat

False

In [None]:
# For each product in the user's full rating row: T if it is in `flat`, F otherwise.
for i in nz_sample.columns:
    print('T' if i in flat else 'F')

T
F


In [None]:
# For each product still visible in the masked query: T if it is in `flat`, F otherwise.
for i in nz_sample_test.columns:
    print('T' if i in flat else 'F')

T


In [None]:
# Leave-one-out hit-rate evaluation of the user-based kNN recommender.
# For every k: fit on the training users, then for each held-out user hide one rated
# product, query the k nearest training users with the masked row, and count a hit when
# the hidden product is among the first TOP_N recommended products.
METRIC = 'cosine'  # distance metric handed to NearestNeighbors
TOP_N = 10         # length of the recommendation list that is scored
RANDOM_SEED = 0    # re-applied for every k so each k hides the same product per user


def collect_recommendations(neighbours, already_rated):
    """Return the products rated above zero by ``neighbours``, first neighbour first.

    ``neighbours`` is a DataFrame with one row per retrieved user, in retrieval order.
    Products contained in ``already_rated`` are left out. Every product appears once,
    at the position of its first occurrence, so slicing the returned list is
    deterministic (slicing ``list(set(...))`` is not, because set order is arbitrary).
    """
    ranked = {}  # dict used as an insertion-ordered set
    for _, ratings in neighbours.iterrows():
        for asin in ratings[ratings > 0].index:
            if asin not in already_rated:
                ranked.setdefault(asin, None)
    return list(ranked)


def print_products(heading, product_ids):
    """Print ``heading``, then one line for each id that ``get_product`` can resolve."""
    print(heading)
    for product_id in product_ids:
        x = get_product(product_id)
        if x:
            print(f"{product_id} - {x['title']} - #{x['brand']}")


for k in (list(range(5, 61, 5)) + list(range(70, 101, 10))):
    print('running for k:', k)
    model_knn = NearestNeighbors(metric=METRIC, algorithm='brute', n_neighbors=k, n_jobs=-1)
    model_knn.fit(pt_csr)
    rng = np.random.RandomState(RANDOM_SEED)
    hits = 0
    evaluated = 0
    for query_index in range(pt_test.shape[0]):
        if query_index % 300 == 0:
            print('processing:', query_index)
        _input = pt_test.iloc[query_index]
        rated = _input[_input > 0].index
        if len(rated) == 0:
            # Nothing can be hidden for a user without ratings, so it is not scored.
            continue
        evaluated += 1
        hide_column = rated[rng.randint(len(rated))]
        _input_test = _input.copy()
        _input_test.loc[hide_column] = 0
        still_rated = [asin for asin in rated if asin != hide_column]
        # kneighbors returns the neighbours sorted by increasing distance.
        _, indices = model_knn.kneighbors(_input_test.values.reshape(1, -1), k)
        result = pt_train.iloc[indices[0]]
        flat = collect_recommendations(result, set(still_rated))
        if query_index % 10 == 0:
            print_products("User Has Rated:", still_rated)
            print_products("Items Recommended:", flat[:TOP_N])
        if hide_column in flat[:TOP_N]:
            hits += 1
    # Guard the division so an empty test set reports 0.0 instead of failing.
    hit_rate = hits / evaluated if evaluated else 0.0
    print(f"k: {k}, test_size: {evaluated}, hits : {hits}, hit_rate: {hit_rate}")

running for k: 15
processing: 0
User Has Rated:
B006WYJM8Y - NARS Blush, Gaiety - #NARS
Items Recommended:
B00021DJ32 - NARS Blush, Taj Mahal - #NARS
User Has Rated:
B01DDA2UJW - Makeup Brush, Toraway 5PC/Set Professional Eyebrow Brush Foundation Eyeliner Makeup Brushes Toothbrush - #
Items Recommended:
B00028LYO6 - J.R. Liggett Bar Shampoo, Tea Tree Oil Formula, 3.5 Ounce - #J.R. Liggett
B001QY8QXM - Astra Platinum Double Edge Safety Razor Blades ,100 Blades (20 x 5) - #Astra
B000X2FPXC - Dr. Woods Pure Almond Liquid Castile Soap, 32 Ounce - #Dr. Woods
B011F6EP1M - The Regenerator Anti Cellulite Cream with 12% Liposomal Vitamin C, 4 fl oz (120 ml) - #
B00YLLHQT2 - Korean Hair Booster Complete Protein Keratin Treatment Replenisher Therapy For All Types Of Damaged Hair - 25ml - #
B001J9R93W - Graham Jumbo End Papers For Hair Perms- 2.5 x 4- 1000ct" " - #Graham Professional
B01E950PCE - Glamglow Gravitymud Firming Treatment Deluxe Travel Size ~ 0.24 oz - #Glamglow
B000GLRREU - Waterpik U

In [None]:
# Hand-entered hit rates, one Series per distance metric.
# NOTE(review): presumably copied from runs of the evaluation loop, one value per k — confirm.
cosine = pd.Series(
    [
        0.30659025787965616, 0.4054441260744986, 0.46131805157593125, 0.4283667621776504,
        0.4355300859598854, 0.42550143266475643, 0.4140401146131805, 0.41260744985673353,
        0.39255014326647564, 0.3968481375358166, 0.3997134670487106, 0.38825214899713467,
        0.37965616045845274, 0.38108882521489973, 0.3868194842406877, 0.37965616045845274,
    ],
    name='Cosine',
)
euclidean = pd.Series(
    [
        0.28796561604584525, 0.4197707736389685, 0.4154727793696275, 0.4040114613180516,
        0.41117478510028654, 0.37965616045845274, 0.4040114613180516, 0.3939828080229226,
        0.39255014326647564, 0.38968481375358166, 0.38108882521489973, 0.3939828080229226,
        0.38968481375358166, 0.38825214899713467, 0.3825214899713467, 0.37392550143266473,
    ],
    name='Euclidean',
)
manhattan = pd.Series(
    [
        0.3137, 0.42406876790830944, 0.44412607449856734, 0.4340974212034384,
        0.37679083094555876, 0.4283667621776504, 0.40974212034383956, 0.4140401146131805,
        0.4154727793696275, 0.4154727793696275, 0.4169054441260745, 0.39255014326647564,
        0.4197707736389685, 0.3997134670487106, 0.39255014326647564, 0.3624641833810888,
    ],
    name='Manhattan',
)

In [None]:
# Comparison table: the first ten recorded hit rates per metric, as percentages,
# labelled with k = 5, 10, ..., 50.
index = list(range(5, 51, 5))
first_rows = slice(0, 10)
final = pd.DataFrame({
    'K': index,
    'Cosine': cosine.iloc[first_rows] * 100,
    'Euclidean': euclidean.iloc[first_rows] * 100,
    'Manhattan': manhattan.iloc[first_rows] * 100,
})
final.round(2)

Unnamed: 0,K,Cosine,Euclidean,Manhattan
0,5,30.66,28.8,31.37
1,10,40.54,41.98,42.41
2,15,46.13,41.55,44.41
3,20,42.84,40.4,43.41
4,25,43.55,41.12,37.68
5,30,42.55,37.97,42.84
6,35,41.4,40.4,40.97
7,40,41.26,39.4,41.4
8,45,39.26,39.26,41.55
9,50,39.68,38.97,41.55


In [None]:
# Best hit rate per metric and the k at which it occurs. The numbers are literals typed
# in by hand, so they must be updated manually whenever the comparison table changes.
pd.DataFrame({'Max Hit_Rate %': [46.13, 41.98, 44.41], 'k':[15, 10, 15]}, index=['Cosine', 'Euclidean', 'Manhattan'])

Unnamed: 0,Max Hit_Rate %,k
Cosine,46.13,15
Euclidean,41.98,10
Manhattan,44.41,15


In [None]:
knn_mo