In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

# dataset is accessible at https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt (https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_02.tsv.gz)
df = pd.read_csv('amazon_reviews_us_Digital_Software_v1_00.tsv', sep='\t', dtype={'star_rating': float})

df.columns = ['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', 'star_rating',
              'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']

df = df[:30000]

print(df.dtypes)

marketplace           object
customer_id            int64
review_id             object
product_id            object
product_parent         int64
product_title         object
product_category      object
star_rating          float64
helpful_votes          int64
total_votes            int64
vine                  object
verified_purchase     object
review_headline       object
review_body           object
review_date           object
dtype: object


In [17]:
def apply_pivot(df,fillby = None):
    if fillby is not None:
        return df.pivot_table(index='customer_id', columns='product_id',values='star_rating').fillna(fillby)
    return df.pivot_table(index='customer_id', columns='product_id', values='star_rating')

# Train-test split (possivelmente deviamos por no training os mais antigos para prever o futuro)
train, test = train_test_split(df, test_size=0.30, random_state=42)
test = test[test.customer_id.isin(train.customer_id)]

# Check for duplicate customer_id - product_id pairs
#print(train.duplicated(['customer_id', 'product_id']).sum())

# Eliminate duplicates
train = train.drop_duplicates(['customer_id', 'product_id'])

df_train_pivot = apply_pivot(df = train, fillby = 0)
df_test_pivot = apply_pivot(df = test, fillby = 0)

# Para ver se existem valores não NaN
#print(df_train_pivot.isna().sum())


In [18]:
# Creating dummies
# Train
dummy_train = train.copy()

# Replace non-numeric values in 'star_rating' column with NaN (possivelmente podia-se substituir pela média ou assim e não por 0)
dummy_train['star_rating'] = pd.to_numeric(dummy_train['star_rating'], errors='coerce')

# Exclude products already rated by the user
dummy_train['star_rating'] = dummy_train['star_rating'].apply(lambda x: 0 if x>=1 else 1)
dummy_train = apply_pivot(df = dummy_train, fillby = 1)

# Test
dummy_test = test.copy()
dummy_test['star_rating'] = dummy_test['star_rating'].apply(lambda x: 1 if x>=1 else 0)
dummy_test = apply_pivot(df = dummy_test, fillby = 0)

In [24]:
# To calculate mean, use only ratings given by user instead of fillna by 0 as it increases denominator in mean
mean = np.nanmean(apply_pivot(df = train), axis = 1)
df_train_subtracted = (apply_pivot(df = train).T-mean).T

# Make rating=0 where user hasn't given any rating
df_train_subtracted.fillna(0, inplace = True)

print(df_train_subtracted.head())

product_id   B000JLNHO6  B000YMNI2Q  B001M4JFN6  B001M4JFOA  B001M4JFT0  \
customer_id                                                               
11720               0.0         0.0         0.0         0.0         0.0   
27364               0.0         0.0         0.0         0.0         0.0   
32421               0.0         0.0         0.0         0.0         0.0   
41939               0.0         0.0         0.0         0.0         0.0   
49745               0.0         0.0         0.0         0.0         0.0   

product_id   B001M4JFTA  B001M4JFTU  B001O5CHVU  B001UHMTP6  B001V9K91Y  ...  \
customer_id                                                              ...   
11720               0.0         0.0         0.0         0.0         0.0  ...   
27364               0.0         0.0         0.0         0.0         0.0  ...   
32421               0.0         0.0         0.0         0.0         0.0  ...   
41939               0.0         0.0         0.0         0.0         0.0  .

In [22]:
# Creating the User Similarity Matrix using pairwise_distance function. shape of user_correlation is userXuser i.e. 67154x67154
user_correlation = 1 - pairwise_distances(df_train_subtracted, metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0

# Convert the user_correlation matrix into dataframe
user_correlation_df = pd.DataFrame(user_correlation)
user_correlation_df['userId'] = df_train_subtracted.index
user_correlation_df.set_index('userId',inplace=True)
user_correlation_df.columns = df_train_subtracted.index.tolist()

print(user_correlation_df.head())

        11720     27364     32421     41939     49745     50861     51023     \
userId                                                                         
11720        0.0       0.0       0.0       0.0       0.0       0.0       0.0   
27364        0.0       0.0       0.0       0.0       0.0       0.0       0.0   
32421        0.0       0.0       0.0       0.0       0.0       0.0       0.0   
41939        0.0       0.0       0.0       0.0       0.0       0.0       0.0   
49745        0.0       0.0       0.0       0.0       0.0       0.0       0.0   

        53264     54347     63486     ...  53091882  53092120  53092986  \
userId                                ...                                 
11720        0.0       0.0       0.0  ...       0.0       0.0       0.0   
27364        0.0       0.0       0.0  ...       0.0       0.0       0.0   
32421        0.0       0.0       0.0  ...       0.0       0.0       0.0   
41939        0.0       0.0       0.0  ...       0.0       0.0   

In [31]:
# Rating predicted by the user (for rated & non rated product both) is the weighted sum of correlation with the product rating (as present in the rating dataset). 
user_predicted_ratings = np.dot(user_correlation, df_train_pivot)

# To find only product not rated by the user, ignore the product rated by the user by making it zero. 
user_final_rating = np.multiply(user_predicted_ratings,dummy_train)

# scaler = MinMaxScaler(feature_range=(1, 5))
# scaler.fit(user_final_rating)
# user_final_rating = scaler.transform(user_final_rating)

product_id
B000JLNHO6    float64
B000YMNI2Q    float64
B001M4JFN6    float64
B001M4JFOA    float64
B001M4JFT0    float64
               ...   
B012X77XPM    float64
B012Y7R126    float64
B01349UVFW    float64
B013VG6UUS    float64
B013YHE73G    float64
Length: 1312, dtype: object


In [40]:
def find_top_recommendations(pred_rating_df, userid, topn):
    recommendation = pred_rating_df.loc[userid].sort_values(ascending=False)[0:topn]
    recommendation = pd.DataFrame(recommendation).reset_index().rename(columns={userid:'predicted_ratings'})
    return recommendation

user_input = int(input("Enter your user id"))
recommendation_user_user = find_top_recommendations(user_final_rating, user_input, 5)
recommendation_user_user['customer_id'] = user_input

print("Earlier rated products by user id:{} as below".format(user_input))
display(train[train['customer_id']==user_input].sort_values(['star_rating'],ascending=False))
print("Earlier rated products by user id:{} as below".format(user_input))
display(train[train['customer_id']==user_input].sort_values(['star_rating'],ascending=False))

Earlier rated products by user id:49745 as below


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
2535,US,49745,R2XXVMORZ4VLSG,B00H9A60O4,608720080,Avast Free Antivirus 2015 [Download],Digital_Software,5.0,0,0,N,Y,best anti virus you can ever use,I tried several anti virus applications . But ...,2015-07-31


Earlier rated products by user id:49745 as below


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
2535,US,49745,R2XXVMORZ4VLSG,B00H9A60O4,608720080,Avast Free Antivirus 2015 [Download],Digital_Software,5.0,0,0,N,Y,best anti virus you can ever use,I tried several anti virus applications . But ...,2015-07-31
