In [237]:
import pandas as pd
import pickle
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import MinMaxScaler

# dataset is accessible at https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt (https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_02.tsv.gz)
df = pd.read_csv('amazon_reviews_us_Digital_Software_v1_00.tsv', sep='\t', dtype={'star_rating': float})

df.columns = ['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', 'star_rating',
              'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']

df.drop(['marketplace', 'product_parent', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline'],
        axis='columns', inplace=True)

# Using just a subset of the data (the one with more common id in order for there to be more correlation)

review_counts = df['customer_id'].value_counts()
customers_with_multiple_reviews = review_counts[review_counts > 2].index
df = df[df['customer_id'].isin(customers_with_multiple_reviews)]

#df = df.sort_values(by=['product_id', 'customer_id'])

# df = df[:10000]

print(df.shape)

(4104, 8)


In [238]:
def apply_pivot(df, fillby=None):
    pivot_table = df.pivot_table(index='customer_id', columns='product_id', values='star_rating')
    if fillby is not None:
        pivot_table = pivot_table.fillna(fillby)
    return pivot_table

# Train-test split (training on older data to predict more recent data)
df = df.sort_values(by='review_date')

split_index = int(0.7 * len(df))

train = df[:split_index]
test = df[split_index:]

test = test[test.customer_id.isin(train.customer_id)] # to guarantee known customer ids in test

df_train_pivot = apply_pivot(df = train, fillby = 0)
df_test_pivot = apply_pivot(df = test, fillby = 0)

print(df_test_pivot.loc[1732109])

print(df_test_pivot)

product_id
B001O5CHVU    0.0
B001VH7HRA    0.0
B002N5M57G    0.0
B002VPE3FK    0.0
B003ZK51W4    0.0
             ... 
B00XKDUGBW    0.0
B00YGC5R3S    0.0
B01019BM7O    0.0
B01019BOEA    0.0
B012Y7R126    0.0
Name: 1732109, Length: 231, dtype: float64
product_id   B001O5CHVU  B001VH7HRA  B002N5M57G  B002VPE3FK  B003ZK51W4  \
customer_id                                                               
1413196             0.0         0.0         0.0         0.0         0.0   
1419419             0.0         0.0         0.0         0.0         0.0   
1501224             0.0         0.0         0.0         0.0         0.0   
1514980             0.0         0.0         0.0         0.0         0.0   
1732109             0.0         0.0         0.0         0.0         0.0   
...                 ...         ...         ...         ...         ...   
53049963            0.0         0.0         0.0         0.0         0.0   
53051282            0.0         0.0         0.0         0.0         0.0  

In [239]:
# Creating dummies to help to know wether a product has been rated or not
# Train
dummy_train = train.copy()

# Exclude products already rated by the user
# Obtain a table with 0 when products have been rated and 1 when they haven't
dummy_train['star_rating'] = dummy_train['star_rating'].apply(lambda x: 0 if x >= 1 else 1) 
dummy_train = apply_pivot(df = dummy_train, fillby = 1)

# Exclude products not rated by the user
# Obtain a table with 1 when products have been rated and 0 when they haven't
dummy_test = test.copy()
dummy_test['star_rating'] = dummy_test['star_rating'].apply(lambda x: 1 if x >= 1 else 0)
dummy_test = apply_pivot(df = dummy_test, fillby = 0)

In [240]:
# Calculate mean and subtract it from ratings for ajusted cosine similarity
train_pivot = apply_pivot(df=train)
mean = train_pivot.mean(axis=1, skipna=True)

df_train_subtracted = train_pivot.sub(mean, axis=0)

# Set ratings to 0 where a user hasn't given any rating
df_train_subtracted.fillna(0, inplace=True)

In [241]:
# Create the User Similarity Matrix
user_correlation = 1 - pairwise_distances(df_train_subtracted, metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0

user_correlation_df = pd.DataFrame(user_correlation, index=df_train_subtracted.index, columns=df_train_subtracted.index)

user_correlation_df.to_csv('user_correlation.txt', sep='\t')

In [242]:
# Rating predicted by the user is weighted sum of correlation with the product rating 
user_predicted_ratings = np.dot(user_correlation, df_train_pivot)

# Products not rated by user 
user_final_rating = np.multiply(user_predicted_ratings,dummy_train)

In [243]:
#Filter user correlation only for customer_id which is in test
user_correlation_test_df = user_correlation_df[user_correlation_df.index.isin(test.customer_id)]
user_correlation_test_df = user_correlation_test_df[list(set(test.customer_id))]

#Get test user predicted rating
test_user_predicted_ratings = np.dot(user_correlation_test_df, df_test_pivot)
test_user_predicted_ratings = np.multiply(test_user_predicted_ratings,dummy_test)
test_user_predicted_ratings = test_user_predicted_ratings[test_user_predicted_ratings > 0]

scaler = MinMaxScaler(feature_range=(1, 5))
scaler.fit(test_user_predicted_ratings)
test_user_predicted_ratings = scaler.transform(test_user_predicted_ratings)

predicted_ratings_df = pd.DataFrame(test_user_predicted_ratings, index=df_test_pivot.index, columns=df_test_pivot.columns)

predicted_ratings_df = predicted_ratings_df.fillna(0)

predicted_ratings_df.to_csv('predicted_ratings_df.txt', sep='\t')

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


In [244]:
# Calculate RMSE
total_non_nan = np.count_nonzero(~np.isnan(test_user_predicted_ratings))
rmse = (np.sum(np.sum((df_test_pivot - test_user_predicted_ratings)**2))/total_non_nan)**0.5
print("RMSE:" + str(rmse))

RMSE:2.154120278392164


In [248]:
# Calculate precision at k

def precision_at_k(actual, predicted, k):
    precision_values = []
    for user in actual.index:
        actual_items = actual.loc[user]  # Actual user preferences
        rated_items = actual_items[actual_items > 0].index  # Items rated by the user
        predicted_items = predicted.loc[user].nlargest(k).index # Top K recommended items
        num_relevant = len(set(rated_items) & set(predicted_items))  # Number of relevant items
        precision = num_relevant / k  # Precision at K
        precision_values.append(precision)
    average_precision = np.mean(precision_values)
    return average_precision

k = 5

precision = precision_at_k(df_test_pivot, predicted_ratings_df, k)
print("Precision at "+ str(k) + ": " + str(precision))

Precision at 5: 0.014285714285714287


In [246]:
pickle.dump(user_final_rating,open('./user_final_rating.pkl','wb'))