In [1]:
import pandas as pd
import numpy as np

In [2]:
customers = pd.read_json("customers.json")
customers.head()

Unnamed: 0,Id,NickName
0,103603,1000kgthanh
1,103760,999999999ok
2,103829,ac7ive
3,1,admin
4,103839,ahkk.nguyen


In [3]:
products = pd.read_json("products.json")
products.head()

Unnamed: 0,Id,Name,UnitPrice
0,1,Build your own computer,1200.0
1,2,Digital Storm VANQUISH 3 Custom Performance PC,1259.0
2,3,Lenovo IdeaCentre 600 All-in-One PC,500.0
3,4,Apple MacBook Pro 13-inch,1800.0
4,5,Asus N551JK-XO076H Laptop,1500.0


In [4]:
ratings = pd.read_json("ratings.json")
ratings.head()

Unnamed: 0,CustomerID,ProductID,Rate,CreateDate
0,103416,619,1,2018/01/01 01:36:30
1,103654,411,1,2018/01/01 01:36:35
2,103954,298,3,2018/01/01 01:36:38
3,103672,361,5,2018/01/01 01:37:15
4,103960,536,5,2018/01/01 02:36:25


In [5]:
customers.shape

(678, 2)

In [6]:
products.shape

(691, 3)

In [7]:
ratings.shape

(130754, 4)

In [8]:
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 678 entries, 0 to 677
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Id        678 non-null    int64 
 1   NickName  678 non-null    object
dtypes: int64(1), object(1)
memory usage: 10.7+ KB


In [9]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 691 entries, 0 to 690
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         691 non-null    int64  
 1   Name       691 non-null    object 
 2   UnitPrice  691 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 16.3+ KB


In [10]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130754 entries, 0 to 130753
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   CustomerID  130754 non-null  int64 
 1   ProductID   130754 non-null  int64 
 2   Rate        130754 non-null  int64 
 3   CreateDate  130754 non-null  object
dtypes: int64(3), object(1)
memory usage: 4.0+ MB


In [11]:
customers.isna().sum()

Id          0
NickName    0
dtype: int64

In [12]:
products.isna().sum()

Id           0
Name         0
UnitPrice    0
dtype: int64

In [13]:
ratings.isna().sum()

CustomerID    0
ProductID     0
Rate          0
CreateDate    0
dtype: int64

In [14]:
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go

In [15]:
figure = px.histogram(x=ratings['Rate'], title="Number of each rating")
figure.update_layout(bargap=0.2)
figure.show()

In [16]:
ratings.columns

Index(['CustomerID', 'ProductID', 'Rate', 'CreateDate'], dtype='object')

In [17]:
ratings['CreateDate'] = pd.to_datetime(ratings['CreateDate'])

In [18]:
product_df = pd.DataFrame({'ratings_no': ratings.groupby('ProductID').count()['Rate'], 'ratings_mean': ratings.groupby('ProductID').mean()['Rate']})
product_df = product_df.sort_values('ratings_no', ascending=False)
product_df

Unnamed: 0_level_0,ratings_no,ratings_mean
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1
326,222,3.130631
364,222,3.234234
292,217,3.262673
486,216,3.240741
313,216,3.250000
...,...,...
278,162,3.148148
247,160,3.187500
205,159,3.157233
308,156,3.102564


In [19]:
fig = sp.make_subplots(rows=1, cols=2, subplot_titles=('Distribution of Number of Ratings', 'Distribution of Mean Ratings'))

hist_ratings_no = go.Histogram(x=product_df['ratings_no'], name='Number of Rating')
fig.add_trace(hist_ratings_no, row=1, col=1)

hist_ratings_mean = go.Histogram(x=product_df['ratings_mean'], name='Mean Rating')
fig.add_trace(hist_ratings_mean, row=1, col=2)

fig.update_layout(width=800,title_text='Distribution of Ratings', showlegend=False)

fig.show()

In [20]:
figure = px.scatter(product_df,
                    x='ratings_no',
                    y='ratings_mean',
                    trendline='ols')

figure.show()

In [21]:
ratings.columns

Index(['CustomerID', 'ProductID', 'Rate', 'CreateDate'], dtype='object')

In [22]:
unique_users = ratings['CustomerID'].nunique()

print("Number of unique users: ", unique_users)

Number of unique users:  344


In [23]:
unique_products = ratings['ProductID'].nunique()

print("Number of unique products: ", unique_products)

Number of unique products:  691


In [24]:
print("Rating matrix will have: ", unique_users * unique_products, 'elements')

Rating matrix will have:  237704 elements


In [25]:
# Advantages of matrix factorization: as number of products and users increase matrix size increases by n**2
# Thus storing a full matrix in memory would be a challenge, but matrix factorization realizes rating matrix implicitly thus we don't need to store all the data
print("Number of ratings: ", len(ratings))
print(len(ratings)/(unique_users * unique_products) * 100, '% of the matrix is filled.')

Number of ratings:  130754
55.0070676135025 % of the matrix is filled.


In [26]:
products.columns

Index(['Id', 'Name', 'UnitPrice'], dtype='object')

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors

user_ratings = ratings.merge(customers, left_on='CustomerID', right_on='Id', how="left")

user_item_matrix = user_ratings.pivot_table(index='CustomerID', columns='ProductID', values='Rate', fill_value=0)

print(user_ratings.shape, ratings.shape, customers.shape)


(130754, 6) (130754, 4) (678, 2)


In [28]:
X_train, X_test = train_test_split(user_item_matrix, test_size=0.2, random_state=42)

knn_model = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute', n_jobs=-1)
knn_model.fit(X_train)

def get_recommendations(user_id, num_recommendations=6):
    user_index = X_train.index.get_loc(user_id)
    distances, indices = knn_model.kneighbors(X_train.iloc[user_index, :].values.reshape(1, -1), n_neighbors=num_recommendations + 1)
    
    recommended_products = []
    for i in range(1, num_recommendations + 1):
        product_id = X_train.columns[indices.flatten()[i]]
        recommended_products.append(product_id)
    
    return recommended_products

user_id = 103829
recommended_product_ids = get_recommendations(user_id)

# Display the recommended product names
recommended_products = products[products['Id'].isin(recommended_product_ids)]

In [29]:
import pickle

# Save to file in the current working directory
pkl_filename = "knn_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(knn_model, file)

# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)


##loading the model from the saved file
pkl_filename = "knn_model.pkl"
with open(pkl_filename, 'rb') as f_in:
    model = pickle.load(f_in)

In [30]:
predictValue = get_recommendations( 103981)
predictValue

[61, 98, 246, 247, 218, 89]

In [31]:
pickle.dump(products,open('products.pkl','wb'))
pickle.dump(products.to_dict(),open('product_dict.pkl','wb'))

In [32]:
pickle.dump(customers,open('customers.pkl','wb'))
pickle.dump(customers.to_dict(),open('customer_dict.pkl','wb'))

In [33]:
pickle.dump(X_train,open('X_train.pkl','wb'))