# Import the necessary libraries and read the provided CSVs as a data frame

In [2]:
# Import libraries
import pandas as pd
import numpy as np

# Splitting the data for training and testing out model
from sklearn.model_selection import train_test_split

#For SVD
from surprise import SVD
from surprise.model_selection import train_test_split

from collections import defaultdict

In [3]:
# merging all csv files
phone_user = pd.concat(map(pd.read_csv, ['phone_user_review_file_1.csv', 'phone_user_review_file_2.csv', 'phone_user_review_file_3.csv', 'phone_user_review_file_4.csv', 'phone_user_review_file_5.csv', 'phone_user_review_file_6.csv']), ignore_index=True)
print(phone_user)

                                 phone_url       date lang country  \
0           /cellphones/samsung-galaxy-s8/   5/2/2017   en      us   
1           /cellphones/samsung-galaxy-s8/  4/28/2017   en      us   
2           /cellphones/samsung-galaxy-s8/   5/4/2017   en      us   
3           /cellphones/samsung-galaxy-s8/   5/2/2017   en      us   
4           /cellphones/samsung-galaxy-s8/  5/11/2017   en      us   
...                                    ...        ...  ...     ...   
1415128  /cellphones/alcatel-ot-club_1187/  5/12/2000   de      de   
1415129  /cellphones/alcatel-ot-club_1187/  5/11/2000   de      de   
1415130  /cellphones/alcatel-ot-club_1187/   5/4/2000   de      de   
1415131  /cellphones/alcatel-ot-club_1187/   5/1/2000   de      de   
1415132  /cellphones/alcatel-ot-club_1187/  4/25/2000   de      de   

                   source               domain  score  score_max  \
0        Verizon Wireless  verizonwireless.com   10.0       10.0   
1             Phone Are

In [4]:
# Explore and understand data
# Shape of data
phone_user.shape

(1415133, 11)

In [5]:
phone_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1415133 entries, 0 to 1415132
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   phone_url  1415133 non-null  object 
 1   date       1415133 non-null  object 
 2   lang       1415133 non-null  object 
 3   country    1415133 non-null  object 
 4   source     1415133 non-null  object 
 5   domain     1415133 non-null  object 
 6   score      1351644 non-null  float64
 7   score_max  1351644 non-null  float64
 8   extract    1395772 non-null  object 
 9   author     1351931 non-null  object 
 10  product    1415132 non-null  object 
dtypes: float64(2), object(9)
memory usage: 118.8+ MB


In [6]:
phone_user.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
score,1351644.0,8.00706,2.616121,0.2,7.2,9.2,10.0,10.0
score_max,1351644.0,10.0,0.0,10.0,10.0,10.0,10.0,10.0


#### Observations
Only 2 features are numerical and others are object.
We can convert the relevant one to string type for better visualization
Also score_max is 10. Therefore we can remove that feature.

In [7]:
#Round of score feature
round(phone_user.score)

0          10.0
1          10.0
2           6.0
3           9.0
4           4.0
           ... 
1415128     2.0
1415129    10.0
1415130     2.0
1415131     8.0
1415132     2.0
Name: score, Length: 1415133, dtype: float64

In [8]:
type(phone_user.score)

pandas.core.series.Series

In [9]:
# Missing valuee
phone_user.isnull().sum()

phone_url        0
date             0
lang             0
country          0
source           0
domain           0
score        63489
score_max    63489
extract      19361
author       63202
product          1
dtype: int64

In [10]:
# Drop score_max, domain and extract as their contribution won't be helping much 
phone_user = phone_user.drop(['score_max'], axis=1)
phone_user = phone_user.drop(['extract'], axis=1)
phone_user = phone_user.drop(['domain'], axis=1)

In [11]:
# Make another column with actual product
phone_user['actual_product'] =phone_user.phone_url.str[12:]
phone_user['actual_product'] =phone_user.actual_product.str[:-1]

In [12]:
# Make a different dataframe for null or missing values
phone_user_test=phone_user[phone_user.isnull().any(axis=1)]

In [13]:
phone_user_test

Unnamed: 0,phone_url,date,lang,country,source,score,author,product,actual_product
270,/cellphones/samsung-galaxy-s8/,5/4/2017,cs,cz,CZC,10.0,,"Samsung Galaxy S8, 64GB, černá",samsung-galaxy-s8
456,/cellphones/samsung-galaxy-s8/,4/25/2017,cs,cz,CZC,10.0,,"Samsung Galaxy S8, 64GB, černá",samsung-galaxy-s8
489,/cellphones/samsung-galaxy-s8/,4/15/2017,ru,ru,Связной,,Дмитрий Сергеев,Samsung Galaxy S8 (желтый топаз),samsung-galaxy-s8
490,/cellphones/samsung-galaxy-s8/,4/15/2017,ru,ru,Связной,,Надежда,Samsung Galaxy S8 (желтый топаз),samsung-galaxy-s8
491,/cellphones/samsung-galaxy-s8/,4/15/2017,ru,ru,Связной,,Матвей Д.,Samsung Galaxy S8 (желтый топаз),samsung-galaxy-s8
...,...,...,...,...,...,...,...,...,...
1415002,/cellphones/ericsson-pf-768/,1/28/2000,de,de,Dooyoo,6.0,,Ericsson PF768,ericsson-pf-768
1415004,/cellphones/ericsson-pf-768/,1/13/2000,de,de,Dooyoo,8.0,,Ericsson PF768,ericsson-pf-768
1415008,/cellphones/motorola-m3288/,3/28/2012,tr,tr,Cepworld,,burak,Motorola (391) M3288,motorola-m3288
1415009,/cellphones/motorola-m3288/,7/30/2001,it,it,Ciao,6.0,,Motorola M3288,motorola-m3288


In [14]:
# Drop all rows with NaN values
phone_user_train = phone_user.dropna()

In [15]:
phone_user_train

Unnamed: 0,phone_url,date,lang,country,source,score,author,product,actual_product
0,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Verizon Wireless,10.0,CarolAnn35,Samsung Galaxy S8,samsung-galaxy-s8
1,/cellphones/samsung-galaxy-s8/,4/28/2017,en,us,Phone Arena,10.0,james0923,Samsung Galaxy S8,samsung-galaxy-s8
2,/cellphones/samsung-galaxy-s8/,5/4/2017,en,us,Amazon,6.0,R. Craig,"Samsung Galaxy S8 (64GB) G950U 5.8"" 4G LTE Unl...",samsung-galaxy-s8
3,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Samsung,9.2,Buster2020,Samsung Galaxy S8 64GB (AT&T),samsung-galaxy-s8
4,/cellphones/samsung-galaxy-s8/,5/11/2017,en,us,Verizon Wireless,4.0,S Ate Mine,Samsung Galaxy S8,samsung-galaxy-s8
...,...,...,...,...,...,...,...,...,...
1415128,/cellphones/alcatel-ot-club_1187/,5/12/2000,de,de,Ciao,2.0,david.paul,Alcatel Club Plus Handy,alcatel-ot-club_1187
1415129,/cellphones/alcatel-ot-club_1187/,5/11/2000,de,de,Ciao,10.0,Christiane14,Alcatel Club Plus Handy,alcatel-ot-club_1187
1415130,/cellphones/alcatel-ot-club_1187/,5/4/2000,de,de,Ciao,2.0,michaelawr,Alcatel Club Plus Handy,alcatel-ot-club_1187
1415131,/cellphones/alcatel-ot-club_1187/,5/1/2000,de,de,Ciao,8.0,claudia0815,Alcatel Club Plus Handy,alcatel-ot-club_1187


In [16]:
phone_user_train.isnull().sum()

phone_url         0
date              0
lang              0
country           0
source            0
score             0
author            0
product           0
actual_product    0
dtype: int64

In [17]:
phone_user_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1291038 entries, 0 to 1415132
Data columns (total 9 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   phone_url       1291038 non-null  object 
 1   date            1291038 non-null  object 
 2   lang            1291038 non-null  object 
 3   country         1291038 non-null  object 
 4   source          1291038 non-null  object 
 5   score           1291038 non-null  float64
 6   author          1291038 non-null  object 
 7   product         1291038 non-null  object 
 8   actual_product  1291038 non-null  object 
dtypes: float64(1), object(8)
memory usage: 98.5+ MB


In [18]:
#Convert score into integer
phone_user_train['score'] = phone_user_train['score'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phone_user_train['score'] = phone_user_train['score'].astype(int)


In [19]:
#Convert other's to string
phone_user_train['phone_url'] = phone_user_train['phone_url'].astype("string")
phone_user_train['date'] = phone_user_train['date'].astype("string")
phone_user_train['lang'] = phone_user_train['lang'].astype("string")
phone_user_train['country'] = phone_user_train['country'].astype("string")
phone_user_train['source'] = phone_user_train['source'].astype("string")
phone_user_train['author'] = phone_user_train['author'].astype("string")
phone_user_train['product'] = phone_user_train['product'].astype("string")
phone_user_train['actual_product'] = phone_user_train['actual_product'].astype("string")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phone_user_train['phone_url'] = phone_user_train['phone_url'].astype("string")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phone_user_train['date'] = phone_user_train['date'].astype("string")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phone_user_train['lang'] = phone_user_train['lang'].astyp

In [20]:
phone_user_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1291038 entries, 0 to 1415132
Data columns (total 9 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   phone_url       1291038 non-null  string
 1   date            1291038 non-null  string
 2   lang            1291038 non-null  string
 3   country         1291038 non-null  string
 4   source          1291038 non-null  string
 5   score           1291038 non-null  int32 
 6   author          1291038 non-null  string
 7   product         1291038 non-null  string
 8   actual_product  1291038 non-null  string
dtypes: int32(1), string(8)
memory usage: 93.6 MB


In [21]:
# Change the dtype of NaN data as well - Used for testing
phone_user_test['score'] = phone_user_test['score'].fillna(0)
phone_user_test['score'] = phone_user_test['score'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phone_user_test['score'] = phone_user_test['score'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phone_user_test['score'] = phone_user_test['score'].astype(int)


In [22]:
#Convert other's to string
phone_user_test['phone_url'] = phone_user_test['phone_url'].astype("string")
phone_user_test['date'] = phone_user_test['date'].astype("string")
phone_user_test['lang'] = phone_user_test['lang'].astype("string")
phone_user_test['country'] = phone_user_test['country'].astype("string")
phone_user_test['source'] = phone_user_test['source'].astype("string")
phone_user_test['author'] = phone_user_test['author'].astype("string")
phone_user_test['product'] = phone_user_test['product'].astype("string")
phone_user_test['actual_product'] = phone_user_test['actual_product'].astype("string")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phone_user_test['phone_url'] = phone_user_test['phone_url'].astype("string")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phone_user_test['date'] = phone_user_test['date'].astype("string")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phone_user_test['lang'] = phone_user_test['lang'].astype("str

In [23]:
phone_user_test

Unnamed: 0,phone_url,date,lang,country,source,score,author,product,actual_product
270,/cellphones/samsung-galaxy-s8/,5/4/2017,cs,cz,CZC,10,,"Samsung Galaxy S8, 64GB, černá",samsung-galaxy-s8
456,/cellphones/samsung-galaxy-s8/,4/25/2017,cs,cz,CZC,10,,"Samsung Galaxy S8, 64GB, černá",samsung-galaxy-s8
489,/cellphones/samsung-galaxy-s8/,4/15/2017,ru,ru,Связной,0,Дмитрий Сергеев,Samsung Galaxy S8 (желтый топаз),samsung-galaxy-s8
490,/cellphones/samsung-galaxy-s8/,4/15/2017,ru,ru,Связной,0,Надежда,Samsung Galaxy S8 (желтый топаз),samsung-galaxy-s8
491,/cellphones/samsung-galaxy-s8/,4/15/2017,ru,ru,Связной,0,Матвей Д.,Samsung Galaxy S8 (желтый топаз),samsung-galaxy-s8
...,...,...,...,...,...,...,...,...,...
1415002,/cellphones/ericsson-pf-768/,1/28/2000,de,de,Dooyoo,6,,Ericsson PF768,ericsson-pf-768
1415004,/cellphones/ericsson-pf-768/,1/13/2000,de,de,Dooyoo,8,,Ericsson PF768,ericsson-pf-768
1415008,/cellphones/motorola-m3288/,3/28/2012,tr,tr,Cepworld,0,burak,Motorola (391) M3288,motorola-m3288
1415009,/cellphones/motorola-m3288/,7/30/2001,it,it,Ciao,6,,Motorola M3288,motorola-m3288


In [24]:
phone_user_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124095 entries, 270 to 1415096
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   phone_url       124095 non-null  string
 1   date            124095 non-null  string
 2   lang            124095 non-null  string
 3   country         124095 non-null  string
 4   source          124095 non-null  string
 5   score           124095 non-null  int32 
 6   author          60893 non-null   string
 7   product         124094 non-null  string
 8   actual_product  124095 non-null  string
dtypes: int32(1), string(8)
memory usage: 9.0 MB


In [25]:
#Remove duplicates
phone_user_train.drop_duplicates()

Unnamed: 0,phone_url,date,lang,country,source,score,author,product,actual_product
0,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Verizon Wireless,10,CarolAnn35,Samsung Galaxy S8,samsung-galaxy-s8
1,/cellphones/samsung-galaxy-s8/,4/28/2017,en,us,Phone Arena,10,james0923,Samsung Galaxy S8,samsung-galaxy-s8
2,/cellphones/samsung-galaxy-s8/,5/4/2017,en,us,Amazon,6,R. Craig,"Samsung Galaxy S8 (64GB) G950U 5.8"" 4G LTE Unl...",samsung-galaxy-s8
3,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Samsung,9,Buster2020,Samsung Galaxy S8 64GB (AT&T),samsung-galaxy-s8
4,/cellphones/samsung-galaxy-s8/,5/11/2017,en,us,Verizon Wireless,4,S Ate Mine,Samsung Galaxy S8,samsung-galaxy-s8
...,...,...,...,...,...,...,...,...,...
1415128,/cellphones/alcatel-ot-club_1187/,5/12/2000,de,de,Ciao,2,david.paul,Alcatel Club Plus Handy,alcatel-ot-club_1187
1415129,/cellphones/alcatel-ot-club_1187/,5/11/2000,de,de,Ciao,10,Christiane14,Alcatel Club Plus Handy,alcatel-ot-club_1187
1415130,/cellphones/alcatel-ot-club_1187/,5/4/2000,de,de,Ciao,2,michaelawr,Alcatel Club Plus Handy,alcatel-ot-club_1187
1415131,/cellphones/alcatel-ot-club_1187/,5/1/2000,de,de,Ciao,8,claudia0815,Alcatel Club Plus Handy,alcatel-ot-club_1187


In [26]:
phone_user_test.drop_duplicates()

Unnamed: 0,phone_url,date,lang,country,source,score,author,product,actual_product
270,/cellphones/samsung-galaxy-s8/,5/4/2017,cs,cz,CZC,10,,"Samsung Galaxy S8, 64GB, černá",samsung-galaxy-s8
456,/cellphones/samsung-galaxy-s8/,4/25/2017,cs,cz,CZC,10,,"Samsung Galaxy S8, 64GB, černá",samsung-galaxy-s8
489,/cellphones/samsung-galaxy-s8/,4/15/2017,ru,ru,Связной,0,Дмитрий Сергеев,Samsung Galaxy S8 (желтый топаз),samsung-galaxy-s8
490,/cellphones/samsung-galaxy-s8/,4/15/2017,ru,ru,Связной,0,Надежда,Samsung Galaxy S8 (желтый топаз),samsung-galaxy-s8
491,/cellphones/samsung-galaxy-s8/,4/15/2017,ru,ru,Связной,0,Матвей Д.,Samsung Galaxy S8 (желтый топаз),samsung-galaxy-s8
...,...,...,...,...,...,...,...,...,...
1415002,/cellphones/ericsson-pf-768/,1/28/2000,de,de,Dooyoo,6,,Ericsson PF768,ericsson-pf-768
1415004,/cellphones/ericsson-pf-768/,1/13/2000,de,de,Dooyoo,8,,Ericsson PF768,ericsson-pf-768
1415008,/cellphones/motorola-m3288/,3/28/2012,tr,tr,Cepworld,0,burak,Motorola (391) M3288,motorola-m3288
1415009,/cellphones/motorola-m3288/,7/30/2001,it,it,Ciao,6,,Motorola M3288,motorola-m3288


Now we have 2 datasets. One is for training purpose with non-null values for all the columns of all the rows. Other dataset i.e test dataset is having null values for one or more columns for one row.

Training dataset :1228734
Test dataset :112969

In [27]:
# Drop Irrelevant features
phone_user_train=phone_user_train.drop(['product','phone_url','lang'],axis=1)

In [28]:
X=phone_user_train.drop(['actual_product'],axis=1)
Y=phone_user_train['actual_product']

In [29]:
phone_user_train

Unnamed: 0,date,country,source,score,author,actual_product
0,5/2/2017,us,Verizon Wireless,10,CarolAnn35,samsung-galaxy-s8
1,4/28/2017,us,Phone Arena,10,james0923,samsung-galaxy-s8
2,5/4/2017,us,Amazon,6,R. Craig,samsung-galaxy-s8
3,5/2/2017,us,Samsung,9,Buster2020,samsung-galaxy-s8
4,5/11/2017,us,Verizon Wireless,4,S Ate Mine,samsung-galaxy-s8
...,...,...,...,...,...,...
1415128,5/12/2000,de,Ciao,2,david.paul,alcatel-ot-club_1187
1415129,5/11/2000,de,Ciao,10,Christiane14,alcatel-ot-club_1187
1415130,5/4/2000,de,Ciao,2,michaelawr,alcatel-ot-club_1187
1415131,5/1/2000,de,Ciao,8,claudia0815,alcatel-ot-club_1187


In [30]:
phone_user_train=phone_user_train.sample(n = 1000000)

In [31]:
phone_user_train.shape

(1000000, 6)

Now the training dataset have approximate 1 million records

# Answer the following questions
A. Identify the most rated products 
B. Identify the users with most number of reviews
C. c. Report the shape of the final
dataset

In [32]:
# Identify most rated products
most_rated_products=pd.DataFrame(phone_user_train.groupby('actual_product')['score'].count().sort_values(ascending=False))
print(most_rated_products)

                            score
actual_product                   
samsung-galaxy-s6           12351
apple-iphone-5s             12098
samsung-galaxy-s7-edge      12053
samsung-galaxy-s5           11950
samsung-galaxy-s-iii        11111
...                           ...
samsung-sph-a660                1
swissvoice-mp40                 1
kyocera-1135                    1
samsung-sch-3500                1
archos-3-5-internet-tablet      1

[5472 rows x 1 columns]


In [33]:
# Identify users with most number of reviews
users_with_most_reviews=pd.DataFrame(phone_user_train.groupby('author')['score'].count().sort_values(ascending=False))
print(users_with_most_reviews)

                                                    score
author                                                   
Amazon Customer                                     59642
Cliente Amazon                                      14923
e-bit                                                6723
Client d'Amazon                                      5956
Amazon Kunde                                         3709
...                                                   ...
Lucinda_Burris_M                                        1
LucindaR                                                1
Lucinda Martins                                         1
Lucinda Linde "Lucrezia"                                1
��������� X�� �������_���������_������'m ������...      1

[625161 rows x 1 columns]


In [34]:
# Select the data with products having more than 50 ratings and users who have given more than 50 ratings
users_with_most_reviews=users_with_most_reviews.loc[users_with_most_reviews['score']>50]
print(users_with_most_reviews)

                 score
author                
Amazon Customer  59642
Cliente Amazon   14923
e-bit             6723
Client d'Amazon   5956
Amazon Kunde      3709
...                ...
federico            51
nicola              51
Monique             51
streghe_1965        51
Veronica            51

[635 rows x 1 columns]


In [35]:
most_rated_products=most_rated_products.loc[most_rated_products['score']>50]
print(most_rated_products)

                        score
actual_product               
samsung-galaxy-s6       12351
apple-iphone-5s         12098
samsung-galaxy-s7-edge  12053
samsung-galaxy-s5       11950
samsung-galaxy-s-iii    11111
...                       ...
sony-ericsson-z320i        51
motorola-w370              51
sony-xperia-neo-l          51
sonim-xp3300-force         51
lg-c1300-g4015             51

[2191 rows x 1 columns]



# Build popularity based model

In [36]:
score_mean_count=pd.DataFrame(phone_user_train.groupby('actual_product')['score'].mean())

In [37]:
score_mean_count['score_counts'] = pd.DataFrame(phone_user_train.groupby('actual_product')['score'].count())  

In [38]:
score_mean_count=score_mean_count.sort_values(['score_counts'], ascending=[False])

In [39]:
#Recommemnded below top 5 mobile phones
score_mean_count.head()

Unnamed: 0_level_0,score,score_counts
actual_product,Unnamed: 1_level_1,Unnamed: 2_level_1
samsung-galaxy-s6,8.566594,12351
apple-iphone-5s,8.304183,12098
samsung-galaxy-s7-edge,8.811748,12053
samsung-galaxy-s5,8.245523,11950
samsung-galaxy-s-iii,8.165602,11111


The top 5 recommendations using popularity based recommended systems are 1) samsung-galaxy-s6 with average score of 8.57 given by 15901 people , 2) apple-iphone-5s with average score of 8.29 given by 15614 people , 3) samsung-galaxy-s7-edge with average score of 8.81 given by 15595 people , 4) samsung-galaxy-s5 with average score of 8.27 given by 15426 people , 5) motorola-moto-g with average score of 8.91 given by 14433 people 

# Build collaborative filtering model using SVD

In [40]:
phone_user_train_col_svd=phone_user_train.drop(['date','country','source'],axis=1)

In [41]:
phone_user_train_col_svd

Unnamed: 0,score,author,actual_product
1236615,8,tfvukm,nokia-5310
605546,4,Sondrol,blu-dash-jr-w
1037227,10,everything s perfect but camer,nokia-x2
274236,10,Gianluca P.,microsoft-lumia-950
352108,10,Christophe,samsung-galaxy-s5
...,...,...,...
1184089,8,RUBENS.NARCISO,lg-kc910-196322
925419,8,beauvoir,htc-windows-phone-8
436790,8,Rocky212,lg-g2-594708
1047886,2,Nick,caterpillar-b10


In [42]:
from surprise import Dataset,Reader
from surprise import SVD
from surprise import accuracy

reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(phone_user_train_col_svd[['author', 'actual_product', 'score']], reader)

In [43]:
# Split data to train and test
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25,random_state=123)

In [44]:
# Print first author
print(trainset.to_raw_uid(0))

Rodrigo dos Santos11


In [45]:
# Print first author
print(trainset.to_raw_iid(0))

samsung-galaxy-s-iii


In [46]:
# Total number of authors
trainset.n_users

485580

In [47]:
# Total number of actual product
trainset.n_items

5397

In [48]:
trainset.ur

defaultdict(list,
            {0: [(0, 10.0), (0, 10.0), (0, 10.0)],
             1: [(1, 10.0), (1317, 10.0)],
             2: [(2, 8.0), (350, 8.0), (582, 10.0)],
             3: [(3, 10.0),
              (39, 10.0),
              (76, 10.0),
              (417, 10.0),
              (5, 10.0),
              (149, 10.0),
              (5, 10.0),
              (371, 8.0),
              (467, 8.0),
              (116, 10.0),
              (378, 10.0),
              (188, 10.0),
              (116, 10.0),
              (722, 10.0),
              (285, 10.0),
              (355, 10.0),
              (467, 10.0),
              (119, 10.0),
              (3, 10.0),
              (371, 10.0),
              (611, 10.0),
              (116, 2.0),
              (1400, 10.0),
              (39, 8.0),
              (985, 8.0),
              (413, 8.0),
              (285, 10.0),
              (941, 10.0),
              (91, 10.0),
              (95, 10.0),
              (39, 6.0),
              (

In [49]:
trainset.all_ratings()

<generator object Trainset.all_ratings at 0x00000230E2F7D120>

In [50]:
# collaborative filtering using SVD
svd_model = SVD(n_factors=5,biased=False)
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x230e2f79850>

In [51]:
testset[0]

('Dirk Beyler', 'doro-phone-easy-615', 2.0)

In [52]:
test_pred = svd_model.test(testset)

In [53]:
# Compute RSME of collaborative filtering using SVD
accuracy.rmse(test_pred)

RMSE: 2.7522


2.752193164674596

In [54]:
test_pred

[Prediction(uid='Dirk Beyler', iid='doro-phone-easy-615', r_ui=2.0, est=7.99298, details={'was_impossible': True, 'reason': 'User and item are unknown.'}),
 Prediction(uid='Dominic Lindner', iid='sony-ericsson-t303', r_ui=10.0, est=7.99298, details={'was_impossible': True, 'reason': 'User and item are unknown.'}),
 Prediction(uid='Unclevis', iid='nokia-x3', r_ui=10.0, est=7.99298, details={'was_impossible': True, 'reason': 'User and item are unknown.'}),
 Prediction(uid='Promise Ufomadu', iid='apple-iphone-6', r_ui=10.0, est=9.875155506867689, details={'was_impossible': False}),
 Prediction(uid='Robert Schrammel', iid='huawei-honor-holly', r_ui=8.0, est=7.99298, details={'was_impossible': True, 'reason': 'User and item are unknown.'}),
 Prediction(uid='TYentz ', iid='samsung-galaxy-s5', r_ui=4.0, est=7.99298, details={'was_impossible': True, 'reason': 'User and item are unknown.'}),
 Prediction(uid='Nij', iid='motorola-xt1225', r_ui=10.0, est=7.99298, details={'was_impossible': True, '

In [55]:
def get_top_n(test_pred, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in test_pred:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [56]:
top_n = get_top_n(test_pred, n=10)

In [57]:
# Recommending top  products for test users
top_n

defaultdict(list,
            {'Dirk Beyler': [('doro-phone-easy-615', 7.99298)],
             'Dominic Lindner': [('sony-ericsson-t303', 7.99298)],
             'Unclevis': [('nokia-x3', 7.99298)],
             'Promise Ufomadu': [('apple-iphone-6', 9.875155506867689)],
             'Robert Schrammel': [('huawei-honor-holly', 7.99298)],
             'TYentz ': [('samsung-galaxy-s5', 7.99298)],
             'Nij': [('motorola-xt1225', 7.99298)],
             'drone': [('motorola-atrix-2', 7.99298)],
             'desiree': [('samsung-galaxy-s-iii', 10)],
             'Wim Bierman': [('sony-xperia-s', 7.99298)],
             'Amazon Customer': [('oneplus-3', 10),
              ('oneplus-3', 10),
              ('oneplus-3', 10),
              ('oneplus-3', 10),
              ('oneplus-3', 10),
              ('oneplus-3', 10),
              ('oneplus-3', 10),
              ('oneplus-3', 10),
              ('oneplus-3', 10),
              ('oneplus-3', 10)],
             'andrenipkow': [('

In [58]:
# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

Dirk Beyler ['doro-phone-easy-615']
Dominic Lindner ['sony-ericsson-t303']
Unclevis ['nokia-x3']
Promise Ufomadu ['apple-iphone-6']
Robert Schrammel ['huawei-honor-holly']
TYentz  ['samsung-galaxy-s5']
Nij ['motorola-xt1225']
drone ['motorola-atrix-2']
desiree ['samsung-galaxy-s-iii']
Wim Bierman ['sony-xperia-s']
Amazon Customer ['oneplus-3', 'oneplus-3', 'oneplus-3', 'oneplus-3', 'oneplus-3', 'oneplus-3', 'oneplus-3', 'oneplus-3', 'oneplus-3', 'oneplus-3']
andrenipkow ['motorola-v3688']
ACHU17 ['blackberry-curve-8520']
Franceler ['sony-xperia-c']
anyelo_vper ['samsung-galaxy-s-iii-neo-i9300i']
MR L ['samsung-galaxy-s-iii']
JayCom ['siemens-s25']
juan29 ['samsung-galaxy-note-4']
kovdog  ['samsung-galaxy-s6']
HENRI ['samsung-omnialite-b7300']
Sam Brilla ['huawei-impulse-4g']
mahesh kumar ['motorola-moto-z-play']
Anonymous  ['samsung-galaxy-s7-789999', 'samsung-galaxy-s5', 'samsung-galaxy-s7-789999', 'samsung-galaxy-s5', 'samsung-galaxy-s5', 'samsung-galaxy-s7-active', 'samsung-galaxy-s

# Build a collaborative filtering model using kNNWithMeans

In [59]:
from surprise import KNNWithMeans
from surprise import accuracy

In [60]:
# Item Based
knn_model = KNNWithMeans(k=10, sim_options={ 'user_based': False})

In [61]:
knn_model.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x230bef2a220>

In [62]:
test_pred=knn_model.test(testset)

In [63]:
# Compute RSME of collaborative filtering using KNNWithMeans 
print(accuracy.rmse(test_pred))

RMSE: 2.6565
2.6565432043203683


In [64]:
# Top n recommendations for user = Amazon Customer
uid = "Amazon Customer"  # raw user id (as in the ratings file). They are **strings**!
iid = "lenovo-vibe-s1"  # raw item id (as in the ratings file). They are **strings**!

# get a prediction for specific users and items.
pred = knn_model.predict(uid, iid, r_ui=0.0, verbose=True)

user: Amazon Customer item: lenovo-vibe-s1 r_ui = 0.00   est = 7.20   {'actual_k': 10, 'was_impossible': False}


In [65]:
pred = pd.DataFrame(test_pred)
pred[pred['uid'] == "Amazon Customer"][['iid', 'r_ui','est']].sort_values(by = 'r_ui',ascending = False).head(5)

Unnamed: 0,iid,r_ui,est
10,apple-iphone-5s,10.0,7.8
134168,blu-vivo-air,10.0,7.4
134657,htc-desire-820,10.0,4.4
134500,apple-iphone-se,10.0,5.6
134498,samsung-galaxy-s6,10.0,6.8


10. Popularity based recommended systems must be used :
1) when we are at the initial stage of any online website launching. When we do not have any data about the user and try to capture the data based on the clicks he/she makes on the website. 
2) When our website is not having enough data to build on the characterstics of user or item.
3) When we are not building any personlized recommended system.
Instead of zero recommendations , it is always better to keep the popular recommendations on the first page, so that it attracts the crowd

11. Collaborative based recommended systems has been used:
    1) When we have characteristics about users and items.Such recommended systems will recommend items based on the simlilar properties of neighbours.The similarity among users and items has been calculated based on the distances.
    2) When we are building personlaized recommended systems.
    3) Amazon, Myntra , Netflix are the good example of personlaized recommended systems.

12. We can used hybrid recommended systems. When the website has been just launched then we should use popularity based recommended system. Once people start login then we can capture the characteristics of user and then we can use collaborative filtering recommendation systems. Hybrid recommended system will perform well in major scenarios, even when we do not have personalized information about the user we can make him/her to click on popular items and capture the properties.