## **Importing the libraries**

In [1]:
import pandas as pd
import numpy as np

## **Importing the dataset**

In [2]:
# Read the ratings CSV (looked up relative to the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer = "ratings_Beauty.csv")

## **Inspecting and visualizing the dataset**

### General properties of the dataset

In [3]:
# Report the dimensions of the dataset as (rows, columns)
dataset_shape = data.shape
print("Dataset shape: ", dataset_shape)

# Preview the leading rows to see the columns and the value formats
dataset_preview = data.head()
print("\nFirst few rows of the dataset: \n")
print(dataset_preview)

Dataset shape:  (2023070, 4)

First few rows of the dataset: 

           UserId   ProductId  Rating   Timestamp
0  A39HTATAQ9V7YF  0205616461     5.0  1369699200
1  A3JM6GV9MNOF9X  0558925278     3.0  1355443200
2  A1Z513UWSAAO0F  0558925278     5.0  1404691200
3  A1WMRR494NWEWV  0733001998     4.0  1382572800
4  A3IAAVS479H7M7  0737104473     1.0  1274227200


In [4]:
# Count the rows that repeat an earlier row on all four listed columns
key_columns = ["UserId", "ProductId", "Rating", "Timestamp"]
duplicates = data.duplicated(subset = key_columns).sum()
print("Number of duplicated entries: ", duplicates)

# Number of distinct users, number of distinct products, and total number of rows
# (dropna = False keeps the count identical to len(<column>.unique()))
unique_user_count = data["UserId"].nunique(dropna = False)
unique_product_count = data["ProductId"].nunique(dropna = False)
print("\nNumber of unique users: ", unique_user_count)
print("Number of unique products: ", unique_product_count)
print("Number of total ratings: ", len(data))

Number of duplicated entries:  0

Number of unique users:  1210271
Number of unique products:  249274
Number of total ratings:  2023070


In [5]:
# For every column, flag whether at least one value is missing
print("Are there any null entries?\n")
column_has_missing = data.isna().any()
print(column_has_missing)

Are there any null entries?

UserId       False
ProductId    False
Rating       False
Timestamp    False
dtype: bool


In [6]:
# Bar chart comparing the total number of entries with the number of unique users and products
import plotly.graph_objects as graph

# Each category label mapped to its count; insertion order fixes the bar order
count_by_category = {
    "Total number of entries": data.shape[0],
    "Number of unique users": data["UserId"].nunique(dropna = False),
    "Number of unique products": data["ProductId"].nunique(dropna = False),
}
x_axis = list(count_by_category.keys())
y_axis = list(count_by_category.values())

bar_trace = graph.Bar(x = x_axis, y = y_axis, textposition = "auto")
plot = graph.Figure([bar_trace])
plot.update_layout(
    title_text = "Total number of entries, number of unique users, and number of unique products",
    xaxis_title = "Category",
    yaxis_title = "Number",
)

plot.show()

### About the ratings

In [7]:
# Tally how often each rating value occurs (value_counts orders by frequency, highest first)
rating_counts = data["Rating"].value_counts()

print("Rating values and the corresponding number of ratings:\n")
print(rating_counts)

# One bar per rating value, with the tally as the bar height
bar_trace = graph.Bar(x = rating_counts.index, y = list(rating_counts), textposition = "auto")
plot = graph.Figure([bar_trace])

plot.update_layout(
    title_text = "Number of ratings grouped by rating value",
    xaxis_title = "Rating value",
    yaxis_title = "Number of ratings",
)

plot.show()

Rating values and the corresponding number of ratings:

5.0    1248721
4.0     307740
1.0     183784
3.0     169791
2.0     113034
Name: Rating, dtype: int64


### About the users

In [8]:
# Count the ProductId entries per user, then order the users from most to fewest
products_per_user = data.groupby("UserId")["ProductId"].count()
product_users = products_per_user.sort_values(ascending = False)

print("Number of products rated by each user:\n")
print(product_users)

Number of products rated by each user:

UserId
A3KEZLJ59C1JVH           389
A281NPSIMI1C2R           336
A3M174IC0VXOS2           326
A2V5R832QCSOMX           278
A3LJLRIZL38GG3           276
                        ... 
A3BQ47C773YMU1             1
A3BQ3Y37XL049D             1
A3BQ3NGQ3JJBR3             1
A3BQ3BW37JKZZ4             1
A00008821J0F472NDY6A2      1
Name: ProductId, Length: 1210271, dtype: int64


In [9]:
# Count the ratings per user, most active users first.
# If every user rates a product at most once, this matches the per-user product count.
rated_users = (
    data.groupby("UserId")["Rating"]
    .count()
    .sort_values(ascending = False)
)

print("Number of ratings given by each user:\n")
print(rated_users)

# Histogram of how many users gave each number of ratings
user_histogram = graph.Histogram(x = rated_users)
plot = graph.Figure(data = [user_histogram])

plot.update_layout(
    title_text = "Number of ratings and number of users who give that amount of ratings",
    xaxis_title = "Number of ratings",
    yaxis_title = "Number of users",
)

plot.show()

Number of ratings given by each user:

UserId
A3KEZLJ59C1JVH           389
A281NPSIMI1C2R           336
A3M174IC0VXOS2           326
A2V5R832QCSOMX           278
A3LJLRIZL38GG3           276
                        ... 
A3BQ47C773YMU1             1
A3BQ3Y37XL049D             1
A3BQ3NGQ3JJBR3             1
A3BQ3BW37JKZZ4             1
A00008821J0F472NDY6A2      1
Name: Rating, Length: 1210271, dtype: int64


### About the products

In [10]:
# Count the ratings per product, most-rated products first
rated_products = (
    data.groupby("ProductId")["Rating"]
    .count()
    .sort_values(ascending = False)
)

print("Number of ratings received by each product:\n")
print(rated_products)

# Histogram of how many products received each number of ratings
product_histogram = graph.Histogram(x = rated_products)
plot = graph.Figure(data = [product_histogram])

plot.update_layout(
    title_text = "Number of ratings and number of products that receive that amount of ratings",
    xaxis_title = "Number of ratings ",
    yaxis_title = "Number of products",
)

plot.show()

Number of ratings received by each product:

ProductId
B001MA0QY2    7533
B0009V1YR8    2869
B0043OYFKU    2477
B0000YUXI0    2143
B003V265QW    2088
              ... 
B005KEH11C       1
B005KECH48       1
B005KDU5XO       1
B005KDRZCS       1
0205616461       1
Name: Rating, Length: 249274, dtype: int64


In [11]:
# Count the UserId entries per product, then order the products from most to fewest.
# If every user rates a product at most once, this matches the per-product rating count.
users_per_product = data.groupby("ProductId")["UserId"].count()
user_products = users_per_product.sort_values(ascending = False)

print("Number of users that rate each product:\n")
print(user_products)

Number of users that rate each product:

ProductId
B001MA0QY2    7533
B0009V1YR8    2869
B0043OYFKU    2477
B0000YUXI0    2143
B003V265QW    2088
              ... 
B005KEH11C       1
B005KECH48       1
B005KDU5XO       1
B005KDRZCS       1
0205616461       1
Name: UserId, Length: 249274, dtype: int64


In [12]:
# Bucket the products by how many ratings they received and chart the bucket sizes
rated_products_df = pd.DataFrame(rated_products)
ratings_per_product = rated_products_df["Rating"]

# Boolean masks pick out each bucket in one vectorized pass instead of a
# Python-level loop over every product; the buckets are disjoint and cover all counts
less_than_ten = ratings_per_product[ratings_per_product < 10].tolist()
ten_to_fifty = ratings_per_product[(ratings_per_product >= 10) & (ratings_per_product < 50)].tolist()
fifty_to_hundred = ratings_per_product[(ratings_per_product >= 50) & (ratings_per_product < 100)].tolist()
hundred_or_more = ratings_per_product[ratings_per_product >= 100].tolist()
total_rating = ratings_per_product.tolist()

print("Number of products with < 10 ratings: ", len(less_than_ten))
print("Number of products with >= 10 and < 50 ratings: ", len(ten_to_fifty))
print("Number of products with >= 50 and < 100 ratings: ", len(fifty_to_hundred))
print("Number of products with >= 100 ratings: ", len(hundred_or_more))
# total_rating holds one rating count per product, so its mean is the average
# number of ratings a product received (not the number of products a user rated)
print("Average number of ratings received per product: ", np.mean(total_rating))

x_axis = ["Number of products with < 10 ratings","Number of products with >= 10 and < 50 ratings",
           "Number of products with >= 50 and < 100 ratings","Number of products with >= 100 ratings"]
y_axis = [len(less_than_ten),len(ten_to_fifty),len(fifty_to_hundred),
            len(hundred_or_more)]


plot = graph.Figure([graph.Bar(x = x_axis, y = y_axis, textposition = "auto")])

plot.update_layout(title_text = "Number of ratings and the number of products that received that amount of ratings",
                    xaxis_title = "Number of ratings",
                    yaxis_title = "Number of products")
plot.show()

Number of products with < 10 ratings:  212336
Number of products with >= 10 and < 50 ratings:  29967
Number of products with >= 50 and < 100 ratings:  4252
Number of products with >= 100 ratings:  2719
Average number of products rated by users:  8.115848423822781


In [13]:
# Most popular products: the five ProductIds that appear in the most rows
most_rated_products = data["ProductId"].value_counts().nlargest(5)

print("The top 5 products that occurred the most:\n")
print(most_rated_products)

# Both axes come from the same five-row Series so every bar label is paired
# with its own count (the y values must not be the full value_counts list)
plot = graph.Figure([graph.Bar(x = most_rated_products.index, y = list(most_rated_products), textposition = "auto")])

plot.update_layout(title_text = "The top 5 products that occurred the most",
                   xaxis_title = "ProductId",
                   yaxis_title = "Number of occurence")

plot.show()

The top 5 products that occurred the most:

B001MA0QY2    7533
B0009V1YR8    2869
B0043OYFKU    2477
B0000YUXI0    2143
B003V265QW    2088
Name: ProductId, dtype: int64


## **Encoding the data**

In [14]:
# Encode alphanumerical data as numerical data
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

# Work on a copy: a plain assignment would only alias `data`, so adding the
# encoded columns would silently modify the raw frame inspected by earlier cells
encoded_data = data.copy()

# The same encoder object is fitted twice; after this cell it only holds the
# ProductId classes, so it can no longer map "User" codes back to UserIds
encoded_data["User"] = label_encoder.fit_transform(encoded_data["UserId"])
encoded_data["Product"] = label_encoder.fit_transform(encoded_data["ProductId"])

print(encoded_data.head())

           UserId   ProductId  Rating   Timestamp    User  Product
0  A39HTATAQ9V7YF  0205616461     5.0  1369699200  725046        0
1  A3JM6GV9MNOF9X  0558925278     3.0  1355443200  814606        1
2  A1Z513UWSAAO0F  0558925278     5.0  1404691200  313101        1
3  A1WMRR494NWEWV  0733001998     4.0  1382572800  291075        2
4  A3IAAVS479H7M7  0737104473     1.0  1274227200  802842        3


In [15]:
# Mean rating per encoded user; the result is a Series named "Rating" indexed by "User"
average_rating = encoded_data.groupby("User")["Rating"].mean()
print("Average rating given by each user:\n")
print(average_rating.head())

# Attach each user's mean to all of that user's rows. Both sides carry a "Rating"
# column, so the merge suffixes them as "Rating_x" (original) and "Rating_y" (mean).
encoded_data = encoded_data.merge(average_rating, on = "User")
print("\n\nDataset:\n")
print(encoded_data.head())

# Replace the merge suffixes with descriptive column names
renamed_columns = {"Rating_x": "Original_rating", "Rating_y": "Average_rating"}
encoded_data = encoded_data.rename(columns = renamed_columns)
print("\n\nDataset:\n")
print(encoded_data.head())

Average rating given by each user:

User
0    5.0
1    5.0
2    3.0
3    5.0
4    5.0
Name: Rating, dtype: float64


Dataset:

           UserId   ProductId  Rating_x   Timestamp    User  Product  Rating_y
0  A39HTATAQ9V7YF  0205616461       5.0  1369699200  725046        0      4.25
1  A39HTATAQ9V7YF  B002OVV7F0       3.0  1369699200  725046    81854      4.25
2  A39HTATAQ9V7YF  B0031IH5FQ       5.0  1369699200  725046    89013      4.25
3  A39HTATAQ9V7YF  B006GQPZ8E       4.0  1369699200  725046   154092      4.25
4  A3JM6GV9MNOF9X  0558925278       3.0  1355443200  814606        1      3.50


Dataset:

           UserId   ProductId  Original_rating  ...    User  Product  Average_rating
0  A39HTATAQ9V7YF  0205616461              5.0  ...  725046        0            4.25
1  A39HTATAQ9V7YF  B002OVV7F0              3.0  ...  725046    81854            4.25
2  A39HTATAQ9V7YF  B0031IH5FQ              5.0  ...  725046    89013            4.25
3  A39HTATAQ9V7YF  B006GQPZ8E              4.0 

In [16]:
# Center every rating on the mean rating of the user who gave it
rating_offset = encoded_data["Original_rating"].sub(encoded_data["Average_rating"])
encoded_data["Normalized_rating"] = rating_offset

print("Data with normalized ratings:\n")
print(encoded_data.head())

Data with normalized ratings:

           UserId   ProductId  ...  Average_rating  Normalized_rating
0  A39HTATAQ9V7YF  0205616461  ...            4.25               0.75
1  A39HTATAQ9V7YF  B002OVV7F0  ...            4.25              -1.25
2  A39HTATAQ9V7YF  B0031IH5FQ  ...            4.25               0.75
3  A39HTATAQ9V7YF  B006GQPZ8E  ...            4.25              -0.25
4  A3JM6GV9MNOF9X  0558925278  ...            3.50              -0.50

[5 rows x 8 columns]


## **Filtering the data**

In [24]:
# Count the ratings received by each encoded product
rated_products_encoded = encoded_data.groupby("Product")["Original_rating"].count()
rated_products_encoded_df = rated_products_encoded.to_frame()

# Keep only the products that reach the 200-rating threshold
has_enough_ratings = rated_products_encoded_df["Original_rating"] >= 200
filtered_rated_products = rated_products_encoded_df[has_enough_ratings]
print("The unique products with >= 200 ratings:\n")
print(filtered_rated_products.head())
print("\nThe number of unique products with >= 200 ratings:", filtered_rated_products.shape[0])

# Restrict the dataset to the rows whose product survived the threshold
popular_products = filtered_rated_products.index.tolist()
is_popular_row = encoded_data["Product"].isin(popular_products)
remaining_data = encoded_data[is_popular_row]

print("\nThe number of rows in the dataset has changed from ", encoded_data.shape[0], " to ", remaining_data.shape[0])

The unique products with >= 200 ratings:

         Original_rating
Product                 
704                  558
719                  377
754                  288
834                  412
843                  313

The number of unique products with >= 200 ratings: 934

The number of rows in the dataset has changed from  2023070  to  370511


## **Creating the user-item matrix** 

In [25]:
# Build a table with one row per UserId and one column per encoded product, holding
# the normalized rating; user/product pairs without a rating are filled with 0
user_item_matrix = remaining_data.pivot_table(
    values = "Normalized_rating",
    index = "UserId",
    columns = "Product",
).fillna(0)

print("The first 5 rows of the user_item_matrix look like:\n")
print(user_item_matrix.head(5))

The first 5 rows of the user_item_matrix look like:

Product                704     719     754     ...  247603  249109  249211
UserId                                         ...                        
A0010876CNE3ILIM9HV0      0.0     0.0     0.0  ...     0.0     0.0     0.0
A0011102257KBXODKL24I     0.0     0.0     0.0  ...     0.0     0.0     0.0
A00120381FL204MYH7G3B     0.0     0.0     0.0  ...     0.0     0.0     0.0
A00126503SUWI86KZBMIN     0.0     0.0     0.0  ...     0.0     0.0     0.0
A001573229XK5T8PI0OKA     0.0     0.0     0.0  ...     0.0     0.0     0.0

[5 rows x 934 columns]


## **Finding the top k users who are most similar to a chosen user**

In [43]:
from sklearn.metrics.pairwise import cosine_similarity
import operator

# Randomly select a user from the user_item_matrix to be our customer of concern
users = user_item_matrix.index.tolist()

# np.random.randint excludes its upper bound, so passing the number of users
# yields a position in [0, len(users) - 1]; adding 1 here could produce an
# index one past the end of the list and raise IndexError
random_row_num = np.random.randint(0, len(users))
user_id = users[random_row_num]

print("The chosen user is: ", user_id)

The chosen user is:  A21RYR788TRJZR


In [46]:
# Define a function that finds the top k users similar to the chosen user
def top_k_similar (user_id, user_item_matrix, k):
  """Return the ids of the k users whose rating rows are closest to the chosen user's.

  Parameters:
    user_id: index label of the chosen user in user_item_matrix
    user_item_matrix: DataFrame with one row per user and one column per product
    k: how many similar users to return

  Returns:
    similar_users: list of at most k index labels, ordered from the highest
    cosine similarity to the lowest
  """

  # Separate the chosen user's row from everybody else's rows
  is_chosen = user_item_matrix.index == user_id
  target_row = user_item_matrix[is_chosen]
  candidate_rows = user_item_matrix[~is_chosen]

  # cosine_similarity returns one row per chosen-user row; row 0 holds the
  # similarity of the chosen user to each candidate
  scores = cosine_similarity(target_row, candidate_rows)[0].tolist()

  # Map every candidate id to its similarity score
  score_by_user = dict(zip(candidate_rows.index.tolist(), scores))

  # Sort ascending and then reverse, so the best match comes first and tied
  # scores end up in the reverse of their original order
  ranked = sorted(score_by_user.items(), key = operator.itemgetter(1))
  ranked.reverse()

  # Keep only the ids of the k best matches
  similar_users = [candidate_id for candidate_id, _ in ranked[:k]]

  return similar_users

In [48]:
# Look up the 5 users whose ratings are most similar to the chosen user's
k = 5

similar_users = top_k_similar(user_id = user_id, user_item_matrix = user_item_matrix, k = k)

similar_users_message = "The top {} users that are most similar to the chosen user are {}"
print(similar_users_message.format(k, similar_users))

The top 5 users that are most similar to the chosen user are:  ['AZZZRS1YZ8HVP', 'AZZZLM1E5JJ8C', 'AZZZKHVV482YT', 'AZZYW4YOE1B6E', 'AZZWMH759YWOO']


## **Recommending top m products to the chosen user**

In [58]:
# Define a function that recommends the top m products to the chosen user 
def top_m_products(user_id, similar_users, user_item_matrix, m):
  """Recommend up to m products that the chosen user has no recorded rating for.

  Products are ranked by the mean value that the similar users have in
  user_item_matrix, highest first.

  Parameters:
    user_id: index label of the chosen user in user_item_matrix
    similar_users: index labels of the users most similar to the chosen user
    user_item_matrix: DataFrame with one row per user and one column per product
    m: the number of products to recommend

  Returns:
    top_products_indices: list of at most m column labels of user_item_matrix

  Raises:
    ValueError: if user_id does not match exactly one row of user_item_matrix
  """

  # The function relies only on its arguments; it no longer reads the
  # notebook-level encoded_data frame, whose lookup result was never used

  # Locate the chosen user's row and fail with a clear message when it is
  # missing or ambiguous
  is_chosen = user_item_matrix.index == user_id
  if is_chosen.sum() != 1:
    raise ValueError(
      "Expected exactly one row for user_id {!r} in user_item_matrix, found {}".format(
        user_id, int(is_chosen.sum())))
  chosen_user_ratings = user_item_matrix[is_chosen].iloc[0]

  # A value of 0 is treated as "not rated yet".
  # NOTE: a product the user rated exactly at their own average also has a
  # normalized value of 0 and is therefore treated as unrated as well.
  products_not_rated = chosen_user_ratings[chosen_user_ratings == 0].index

  # Mean value per product across the similar users' rows
  similar_user_ratings = user_item_matrix[user_item_matrix.index.isin(similar_users)]
  similar_user_mean = similar_user_ratings.mean(axis = 0)

  # Keep only the products the chosen user has not rated, best mean first
  candidate_means = similar_user_mean[similar_user_mean.index.isin(products_not_rated)]
  ranked_means = candidate_means.sort_values(ascending = False)

  top_products_indices = ranked_means.head(m).index.tolist()

  return top_products_indices

In [59]:
# Ask for the 5 best product recommendations for the chosen user
m = 5

top_products = top_m_products(
    user_id = user_id,
    similar_users = similar_users,
    user_item_matrix = user_item_matrix,
    m = m,
)

recommendation_message = "The top {} recommended products are {} "
print(recommendation_message.format(m, top_products))

The top 5 recommended products are:  [30773, 27327, 149282, 122707, 122630]
