In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazon-ratings/ratings_Beauty.csv


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

In [3]:
data = pd.read_csv("/kaggle/input/amazon-ratings/ratings_Beauty.csv")
data.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200
2,A1Z513UWSAAO0F,558925278,5.0,1404691200
3,A1WMRR494NWEWV,733001998,4.0,1382572800
4,A3IAAVS479H7M7,737104473,1.0,1274227200


In [4]:
train_df, test_df = train_test_split(data, train_size=0.8)

In [5]:
index_labels = train_df.index
subset_index = np.random.choice(index_labels, size=10000, replace=False)
train_subset = train_df.loc[subset_index]

# **user-item-matrix**

In [6]:
user_item_matrix = train_subset.pivot_table(index='UserId', columns='ProductId', values='Rating', fill_value=0)
user_item_matrix

ProductId,1304651088,5357955905,9575295714,9748776093,9788071538,9788071597,9788072216,9788073840,9790782594,9790786948,...,B00KHH2VOY,B00KIRR02G,B00KLZO2JE,B00KQ4PEBU,B00KTLBDYM,B00KWFDBKE,B00KXNYG9K,B00L1OKV1W,B00L2K53BS,B00L5JHZJO
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01043533995GF77S33JK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A01884683H3F0505B7RAB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A02319972Z43K4PFMZIBM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A02734382TCW13I4YD1LE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0351533X23EPP6SMRN6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZYXKS6KJ0T0F,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZZ7SCDDZR0R3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZZEK6JAYP3HX,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZZLR8HHKHBCZ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# **neighbor_finder**

In [7]:
neighbor_finder = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1)

# **user-based**

In [8]:
neighbor_finder.fit(user_item_matrix)

user_id = np.random.choice(user_item_matrix.index)
user_vector = user_item_matrix.loc[user_id].values.reshape(1, -1)

distances, indices = neighbor_finder.kneighbors(user_vector, n_neighbors=5)
distances, indices = distances.squeeze().tolist(), indices.squeeze().tolist()



In [9]:
aggregated_ratings = {}
# Aggregate ratings from similar users
for i, distance in zip(indices, distances):
    similar_user_id, similarity_score = user_item_matrix.index[i], 1 - distance
    for item_id, rating in user_item_matrix.loc[similar_user_id].items():
        if (rating != 0) and (user_item_matrix.loc[user_id][item_id] == 0):
            aggregated_ratings.setdefault(item_id, []).append((rating, similarity_score))

In [10]:
predicted_ratings = {}
# Compute weighted average of ratings from similar users
for item_id, ratings in aggregated_ratings.items():
    weighted_sum = sum(rating * similarity for rating, similarity in ratings)
    sum_of_weights = sum(similarity for _, similarity in ratings)
    predicted_ratings[item_id] = weighted_sum / sum_of_weights if sum_of_weights != 0 else 0

In [11]:
N=3
recommended_products = sorted(predicted_ratings, key=predicted_ratings.get, reverse=True)[:N]
recommended_products

['B00DQUVXPW', 'B001V9LUZI', 'B002XHACJW']

# **item-based**

## method1

In [12]:
neighbor_finder.fit(user_item_matrix.T)

item_id = np.random.choice(user_item_matrix.columns)
item_vector = user_item_matrix[item_id].values.reshape(1, -1)

distances, indices = neighbor_finder.kneighbors(item_vector, n_neighbors=5)
distances, indices = distances.squeeze().tolist(), indices.squeeze().tolist()



In [13]:
aggregated_ratings = {}
# Aggregate ratings from similar items
for i, distance in zip(indices, distances):
    similar_item_id, similarity_score = user_item_matrix.columns[i], 1 - distance
    for user_id, rating in user_item_matrix[similar_item_id].items():
        if (rating != 0) and (user_item_matrix.loc[user_id][item_id] == 0):
            aggregated_ratings.setdefault(user_id, []).append((rating, similarity_score))

In [14]:
predicted_ratings = {}
# Compute weighted average of ratings from similar items
for user_id, ratings in aggregated_ratings.items():
    weighted_sum = sum(rating * similarity for rating, similarity in ratings)
    sum_of_weights = sum(similarity for _, similarity in ratings)
    predicted_ratings[user_id] = weighted_sum / sum_of_weights if sum_of_weights != 0 else 0

In [15]:
N=3
recommended_users = sorted(predicted_ratings, key=predicted_ratings.get, reverse=True)[:N]
recommended_users

['A1ELHG0LNQT7P8', 'AHO0EM3FIG4JN', 'A2C0Q7ALQ22IRK']

## method2

In [16]:
user_id = np.random.choice(user_item_matrix.index)
item_id = user_item_matrix.columns[np.argmax(user_item_matrix.loc[user_id])]

In [17]:
item_neighbor_finder = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1).fit(user_item_matrix.T)
item_vector = user_item_matrix[item_id].values.reshape(1, -1)

_, indices = neighbor_finder.kneighbors(item_vector, n_neighbors=5)
indices = indices.squeeze().tolist()



In [18]:
user_item_matrix.columns[indices].values

array(['B0001ZA4CS', 'B005C1C02S', 'B005C2NBFW', 'B005C3ZYWE',
       'B005C2NAKS'], dtype=object)