In [70]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import MinMaxScaler

# dataset is accessible at https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt (https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_02.tsv.gz)
df = pd.read_csv('amazon_reviews_us_Digital_Software_v1_00.tsv', sep='\t', dtype={'star_rating': float})

df.columns = ['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', 'star_rating',
              'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']

# Using just a subset of the data

df = df.sort_values(by=['product_id', 'customer_id'])

df = df[:30000]

print(df)

       marketplace  customer_id       review_id  product_id  product_parent  \
2793            US     14297784   R1RDB9C91XPF0  B000JLNHO6       165855688   
15096           US     19802240  R2MTDG51GV59MS  B000YMNI2Q       847631772   
99226           US     39503814  R2LRAB7GM83D56  B000YMNI2Q       847631772   
101835          US     46098046  R1GGJJA2R68033  B000YMNI2Q       847631772   
16484           US     47994949  R1L550H2C42K7P  B000YMNI2Q       847631772   
...            ...          ...             ...         ...             ...   
43177           US     11174432   RHB0DLGQL1BLJ  B009IRP3U8       998215621   
92131           US     11914566   R6HKFAKYW6KDC  B009IRP3U8       998215621   
38525           US     14573819  R3F2YW0EXPNDKI  B009IRP3U8       998215621   
86699           US     16334332  R3QF6O7V28Z3GT  B009IRP3U8       998215621   
42442           US     19908116  R2ENG1MVKWTUOS  B009IRP3U8       998215621   

                                product_title  prod

In [71]:
def apply_pivot(df,fillby = None):
    if fillby is not None:
        return df.pivot_table(index='customer_id', columns='product_id',values='star_rating').fillna(fillby)
    return df.pivot_table(index='customer_id', columns='product_id', values='star_rating')

# Train-test split (possivelmente deviamos por no training os mais antigos para prever o futuro)
train, test = train_test_split(df, test_size=0.30, random_state=42)
test = test[test.customer_id.isin(train.customer_id)]

df_train_pivot = apply_pivot(df = train, fillby = 0)
df_test_pivot = apply_pivot(df = test, fillby = 0)

# Para ver se existem valores não NaN
#print(df_train_pivot.isna().sum())


In [72]:
# Creating dummies
# Train
dummy_train = train.copy()

# Replace non-numeric values in 'star_rating' column with NaN (possivelmente podia-se substituir pela média ou assim e não por 0)
dummy_train['star_rating'] = pd.to_numeric(dummy_train['star_rating'], errors='coerce')

# Exclude products already rated by the user
dummy_train['star_rating'] = dummy_train['star_rating'].apply(lambda x: 0 if x>=1 else 1)
dummy_train = apply_pivot(df = dummy_train, fillby = 1)

# Test
dummy_test = test.copy()
dummy_test['star_rating'] = dummy_test['star_rating'].apply(lambda x: 1 if x>=1 else 0)
dummy_test = apply_pivot(df = dummy_test, fillby = 0)

In [73]:
# To calculate mean, use only ratings given by user instead of fillna by 0 as it increases denominator in mean
mean = np.nanmean(apply_pivot(df = train), axis = 1)
df_train_subtracted = (apply_pivot(df = train).T-mean).T

# Make rating=0 where user hasn't given any rating
df_train_subtracted.fillna(0, inplace = True)

print(df_train_subtracted.head())

product_id   B000JLNHO6  B000YMNI2Q  B000YMNI76  B000YMR5X4  B000YMR61A  \
customer_id                                                               
10796               0.0         0.0         0.0         0.0         0.0   
16301               0.0         0.0         0.0         0.0         0.0   
16575               0.0         0.0         0.0         0.0         0.0   
18621               0.0         0.0         0.0         0.0         0.0   
25564               0.0         0.0         0.0         0.0         0.0   

product_id   B000YMR6AG  B000YMRM8W  B00194DRWY  B00194DS1Y  B00194GSSY  ...  \
customer_id                                                              ...   
10796               0.0         0.0         0.0         0.0         0.0  ...   
16301               0.0         0.0         0.0         0.0         0.0  ...   
16575               0.0         0.0         0.0         0.0         0.0  ...   
18621               0.0         0.0         0.0         0.0         0.0  .

In [74]:
# Creating the User Similarity Matrix using pairwise_distance function. shape of user_correlation is userXuser i.e. 67154x67154
user_correlation = 1 - pairwise_distances(df_train_subtracted, metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0

# Convert the user_correlation matrix into dataframe
user_correlation_df = pd.DataFrame(user_correlation)
user_correlation_df['userId'] = df_train_subtracted.index
user_correlation_df.set_index('userId',inplace=True)
user_correlation_df.columns = df_train_subtracted.index.tolist()

print(user_correlation_df.head())

        10796     16301     16575     18621     25564     32028     36499     \
userId                                                                         
10796        0.0       0.0       0.0       0.0       0.0       0.0       0.0   
16301        0.0       0.0       0.0       0.0       0.0       0.0       0.0   
16575        0.0       0.0       0.0       0.0       0.0       0.0       0.0   
18621        0.0       0.0       0.0       0.0       0.0       0.0       0.0   
25564        0.0       0.0       0.0       0.0       0.0       0.0       0.0   

        43222     44864     54366     ...  53094263  53094662  53095002  \
userId                                ...                                 
10796        0.0       0.0       0.0  ...       0.0       0.0       0.0   
16301        0.0       0.0       0.0  ...       0.0       0.0       0.0   
16575        0.0       0.0       0.0  ...       0.0       0.0       0.0   
18621        0.0       0.0       0.0  ...       0.0       0.0   

In [75]:
# Rating predicted by the user (for rated & non rated product both) is the weighted sum of correlation with the product rating (as present in the rating dataset). 
user_predicted_ratings = np.dot(user_correlation, df_train_pivot)

# To find only product not rated by the user, ignore the product rated by the user by making it zero. 
user_final_rating = np.multiply(user_predicted_ratings,dummy_train)

# scaler = MinMaxScaler(feature_range=(1, 5))
# scaler.fit(user_final_rating)
# user_final_rating = scaler.transform(user_final_rating)

In [78]:
def find_top_recommendations(pred_rating_df, userid, topn):
    recommendation = pred_rating_df.loc[userid].sort_values(ascending=False)[0:topn]
    recommendation = pd.DataFrame(recommendation).reset_index().rename(columns={userid:'predicted_ratings'})
    return recommendation

user_input = int(input("Enter your user id"))
recommendation_user_user = find_top_recommendations(user_final_rating, user_input, 5)
recommendation_user_user['customer_id'] = user_input

print("Recommended products for user id:{} as below".format(user_input))
display(recommendation_user_user)
print("Earlier rated products by user id:{} as below".format(user_input))
display(train[train['customer_id']==user_input].sort_values(['star_rating'],ascending=False))

KeyError: 46098046

In [79]:
#Filter user correlation only for user which is in test, test is subset/equal of train in terms of userId

user_correlation_test_df = user_correlation_df[user_correlation_df.index.isin(test.customer_id)]
user_correlation_test_df = user_correlation_test_df[list(set(test.customer_id))]
# user_correlation_test_df[user_correlation_test_df<0]=0

#Get test user predicted rating
test_user_predicted_ratings = np.dot(user_correlation_test_df, df_test_pivot)
test_user_predicted_ratings = np.multiply(test_user_predicted_ratings,dummy_test)
#Get NaN where user never rated as it shouldn't contribute in calculating RMSE
test_user_predicted_ratings = test_user_predicted_ratings[test_user_predicted_ratings > 0]
scaler = MinMaxScaler(feature_range=(1, 5))
scaler.fit(test_user_predicted_ratings)
test_user_predicted_ratings = scaler.transform(test_user_predicted_ratings)

total_non_nan = np.count_nonzero(~np.isnan(test_user_predicted_ratings))
rmse = (np.sum(np.sum((apply_pivot(df = test) - test_user_predicted_ratings)**2))/total_non_nan)**0.5
print(rmse)

nan


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  rmse = (np.sum(np.sum((apply_pivot(df = test) - test_user_predicted_ratings)**2))/total_non_nan)**0.5


In [80]:
pickle.dump(user_final_rating,open('./user_final_rating.pkl','wb'))