In [75]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import cross_validate
from surprise import SVD

In [76]:
phone = pd.read_csv('merged.csv')
phone.head()                

Unnamed: 0,Image_Url,Item_Name,Username,Rating_Score,Review_Description,Review_Date
0,https://m.media-amazon.com/images/I/41UuyU7HsP...,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,Leonardo,5,Tengo prácticamente una semana con el equipo y...,Reviewed in Mexico on 2 December 2020
1,https://m.media-amazon.com/images/I/41UuyU7HsP...,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,Jacob E,3,Coming from a pixel 2 I expected more.Front ca...,Reviewed in the United States on 5 October 2020
2,https://m.media-amazon.com/images/I/41UuyU7HsP...,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,Angel Chavez,5,Realmente llegó el que pedí' aunque aún no hay...,Reviewed in Mexico on 25 November 2020
3,https://m.media-amazon.com/images/I/41UuyU7HsP...,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,Shawn,3,I decided on the Samsung Galaxy S20 FE for my ...,Reviewed in the United States on 18 November 2020
4,https://m.media-amazon.com/images/I/41UuyU7HsP...,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,Morgan J.,4,El teléfono llegó super rápido y lo compré por...,Reviewed in Mexico on 9 January 2021


In [77]:
len(phone) 

1035

In [78]:
print(phone.nunique())

Image_Url                2
Item_Name                2
Username               951
Rating_Score             5
Review_Description    1025
Review_Date            553
dtype: int64


In [79]:
phone['Rating_Score'].value_counts()

5    608
4    151
1    149
3     71
2     56
Name: Rating_Score, dtype: int64

In [80]:
print(phone.isnull().values.any())

True


In [81]:
print(phone.isnull().sum())

Image_Url             0
Item_Name             0
Username              0
Rating_Score          0
Review_Description    1
Review_Date           0
dtype: int64


In [82]:
df = phone.drop_duplicates('Username', keep='first')
len(df)

951

In [83]:
#Assign X as the original ratings dataframe and y as the Username column of ratings.

X = df.copy()
y = df['Username']

#Split into training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

In [84]:
df_ratings = X_train.pivot(index='Username', columns='Item_Name', values='Rating_Score')
df_ratings

Item_Name,Samsung Galaxy S20 FE 5G | Factory Unlocked Android Cell Phone | 128 GB | US Version Smartphone | Pro-Grade Camera| 30X Space Zoom| Night Mode | Cloud Navy,"UMIDIGI A5 Pro Unlocked Mobile Phones SIM Free Dual 4G Smartphone 16MP+8MP+5MP Camera Smartphones 4150mAh Battery 6.3"" FHD+ 32GB ROM 4GB RAM Android 9 Pie (Grey)"
Username,Unnamed: 1_level_1,Unnamed: 2_level_1
....,4.0,
2kold,3.0,
4u1solo,5.0,
A G Trappe,,1.0
A S M SARWAR ZAHAN,5.0,
...,...,...
warang,5.0,
willie hines,5.0,
wilsonmd,5.0,
winton,5.0,


In [85]:
# replace the NULL values by 0s since the cosine_similarity doesn’t work with NA values
df_ratings_dummy = df_ratings.copy().fillna(0)
df_ratings_dummy

Item_Name,Samsung Galaxy S20 FE 5G | Factory Unlocked Android Cell Phone | 128 GB | US Version Smartphone | Pro-Grade Camera| 30X Space Zoom| Night Mode | Cloud Navy,"UMIDIGI A5 Pro Unlocked Mobile Phones SIM Free Dual 4G Smartphone 16MP+8MP+5MP Camera Smartphones 4150mAh Battery 6.3"" FHD+ 32GB ROM 4GB RAM Android 9 Pie (Grey)"
Username,Unnamed: 1_level_1,Unnamed: 2_level_1
....,4.0,0.0
2kold,3.0,0.0
4u1solo,5.0,0.0
A G Trappe,0.0,1.0
A S M SARWAR ZAHAN,5.0,0.0
...,...,...
warang,5.0,0.0
willie hines,5.0,0.0
wilsonmd,5.0,0.0
winton,5.0,0.0


In [86]:
#cosine similarity of the ratings
similarity_matrix = cosine_similarity(df_ratings_dummy, df_ratings_dummy)
similarity_matrix_df = pd.DataFrame(similarity_matrix, index=df_ratings.index, columns=df_ratings.index)

#calculate ratings using weighted sum of cosine similarity
#function to calculate ratings
def calculate_ratings(item, username):
    if item in df_ratings:
        cosine_scores = similarity_matrix_df[username] #similarity of user with every other users
        ratings_scores = df_ratings[item]      #ratings of every other users for the item
        
        #won't consider users who havent rated item so drop similarity scores and ratings corresponsing to np.nan
        index_not_rated = ratings_scores[ratings_scores.isnull()].index
        ratings_scores = ratings_scores.dropna()
        cosine_scores = cosine_scores.drop(index_not_rated)
        
        #calculating rating by weighted mean of ratings and cosine scores of the users who have rated the item
        ratings_item = np.dot(ratings_scores, cosine_scores)/cosine_scores.sum()
        
    else:
        return 2.5
    return ratings_item

In [87]:
calculate_ratings('Samsung Galaxy S20 FE 5G | Factory Unlocked Android Cell Phone | 128 GB | US Version Smartphone | Pro-Grade Camera| 30X Space Zoom| Night Mode | Cloud Navy', 'Leonardo')

4.028315946348733

In [88]:
#evaluates on test set

def score_on_test_set():
    user_item_pairs = zip(X_test['Item_Name'], X_test['Username'])
    
    predicted_ratings = np.array([calculate_ratings(x,x) for (x,x) in user_item_pairs])
    true_ratings = np.array(X_test['Rating_Score'])
    score = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
    return score
test_set_score = score_on_test_set()
print(test_set_score)

2.0717183222442883


In [89]:
df

Unnamed: 0,Image_Url,Item_Name,Username,Rating_Score,Review_Description,Review_Date
0,https://m.media-amazon.com/images/I/41UuyU7HsP...,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,Leonardo,5,Tengo prácticamente una semana con el equipo y...,Reviewed in Mexico on 2 December 2020
1,https://m.media-amazon.com/images/I/41UuyU7HsP...,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,Jacob E,3,Coming from a pixel 2 I expected more.Front ca...,Reviewed in the United States on 5 October 2020
2,https://m.media-amazon.com/images/I/41UuyU7HsP...,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,Angel Chavez,5,Realmente llegó el que pedí' aunque aún no hay...,Reviewed in Mexico on 25 November 2020
3,https://m.media-amazon.com/images/I/41UuyU7HsP...,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,Shawn,3,I decided on the Samsung Galaxy S20 FE for my ...,Reviewed in the United States on 18 November 2020
4,https://m.media-amazon.com/images/I/41UuyU7HsP...,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,Morgan J.,4,El teléfono llegó super rápido y lo compré por...,Reviewed in Mexico on 9 January 2021
...,...,...,...,...,...,...
1029,https://m.media-amazon.com/images/I/419sppTfYE...,UMIDIGI A5 Pro Unlocked Mobile Phones SIM Free...,Anderson Nunez,5,Excelent phone for the price,Reviewed in the United States on 8 January 2020
1030,https://m.media-amazon.com/images/I/419sppTfYE...,UMIDIGI A5 Pro Unlocked Mobile Phones SIM Free...,Danny Jimenez,5,Love it!!!,Reviewed in the United States on 30 November 2019
1031,https://m.media-amazon.com/images/I/419sppTfYE...,UMIDIGI A5 Pro Unlocked Mobile Phones SIM Free...,Svetlan Blagojevic,5,Great,Reviewed in the United States on 7 January 2020
1032,https://m.media-amazon.com/images/I/419sppTfYE...,UMIDIGI A5 Pro Unlocked Mobile Phones SIM Free...,Derrick byrd,5,Excellent phone recommended,Reviewed in the United States on 8 November 2019


In [90]:
df = df.drop(columns=['Image_Url','Review_Description', 'Review_Date '])
df

Unnamed: 0,Item_Name,Username,Rating_Score
0,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,Leonardo,5
1,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,Jacob E,3
2,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,Angel Chavez,5
3,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,Shawn,3
4,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,Morgan J.,4
...,...,...,...
1029,UMIDIGI A5 Pro Unlocked Mobile Phones SIM Free...,Anderson Nunez,5
1030,UMIDIGI A5 Pro Unlocked Mobile Phones SIM Free...,Danny Jimenez,5
1031,UMIDIGI A5 Pro Unlocked Mobile Phones SIM Free...,Svetlan Blagojevic,5
1032,UMIDIGI A5 Pro Unlocked Mobile Phones SIM Free...,Derrick byrd,5


In [91]:
#Define a Reader object
#The Reader object helps in parsing the file or dataframe containing ratings

reader = Reader()
#dataset creation
data = Dataset.load_from_df(df, reader)
#model
knn = KNNBasic()
#Evaluating the performance in terms of RMSE
cross_validate(knn, data, measures=['RMSE'], cv = 3)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([1.44078763, 1.44874397, 1.4978689 ]),
 'fit_time': (0.0, 0.0, 0.0010001659393310547),
 'test_time': (0.0020003318786621094,
  0.0009999275207519531,
  0.0010006427764892578)}

In [92]:
#Define the SVD algorithm object
svd = SVD()
#Evaluate the performance in terms of RMSE
cross_validate(svd, data, measures=['RMSE'], cv = 3)

{'test_rmse': array([1.48312742, 1.4310648 , 1.46021932]),
 'fit_time': (0.027510404586791992, 0.027006149291992188, 0.03122258186340332),
 'test_time': (0.0010001659393310547,
  0.0010001659393310547,
  0.0010006427764892578)}

In [93]:
trainset = data.build_full_trainset()
svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x23980b5ac70>

In [115]:
#run this to display random a sample record
df2 = df.sample(1)
df2

Unnamed: 0,Item_Name,Username,Rating_Score
680,Samsung Galaxy S20 FE 5G | Factory Unlocked An...,FDabroski,5


In [116]:
#Listing the lists
item_list = df2.iloc[:, 0].unique().tolist()
user_list = df2.iloc[:, 1].unique().tolist()
rating_list = df2.iloc[:, 2].tolist()

In [117]:
#Checking the prediction score result against the original score
for x in item_list:
    for y in user_list:
        result = svd.predict(x,y).est
        for z in rating_list:
            print('Username:',y,'\nPredicted Rating Score:',result,'\nOriginal Rating Score:',z,
                  '\nPercentage:',((result/z)*100).round(2),'%')

        

Username: FDabroski 
Predicted Rating Score: 4.842184733668431 
Original Rating Score: 5 
Percentage: 96.84 %
