In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# The CSV has no header row (header=None), so the column names are supplied
# at load time instead of leaving the frame with integer column labels.
data = pd.read_csv(
    "ratings_Electronics.csv",
    header=None,
    names=['userID', 'itemID', 'ratings', 'timestamp'],
)

In [8]:
data.columns = ['userID','itemID','ratings','timestamp']

In [9]:
data.head()

Unnamed: 0,userID,itemID,ratings,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [10]:
# Work on a copy without the timestamp column; `data` itself is left untouched.
rating_data = data.drop('timestamp', axis=1)

In [11]:
rating_data.shape

(7824482, 3)

In [12]:
# Distinct rating values, in order of first appearance.
rating_data['ratings'].unique()

array([5., 1., 3., 2., 4.])

In [13]:
rating_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7824482 entries, 0 to 7824481
Data columns (total 3 columns):
userID     object
itemID     object
ratings    float64
dtypes: float64(1), object(2)
memory usage: 179.1+ MB


In [14]:
rating_data.isnull().sum()

userID     0
itemID     0
ratings    0
dtype: int64

In [15]:
# Same check as the previous cell: DataFrame.isna() and DataFrame.isnull()
# are aliases, so this repeats the per-column missing-value count.
rating_data.isna().sum()

userID     0
itemID     0
ratings    0
dtype: int64

In [16]:
rating_data.describe(include = 'all')

Unnamed: 0,userID,itemID,ratings
count,7824482,7824482,7824482.0
unique,4201696,476002,
top,A5JLAU2ARJ0BO,B0074BW614,
freq,520,18244,
mean,,,4.012337
std,,,1.38091
min,,,1.0
25%,,,3.0
50%,,,5.0
75%,,,5.0


### Importing the JSON data stored in .gz format

import pandas as pd
import gzip

def parse(path):
  """Yield one parsed record per non-blank line of a gzip-compressed file.

  Each line must hold a single Python literal (dict, list, string, number,
  ...). Lines are parsed with ``ast.literal_eval`` rather than ``eval`` so
  that the content of the data file can never execute code.

  Parameters
  ----------
  path : str
      Path of the gzip-compressed, line-oriented text file (UTF-8).

  Yields
  ------
  object
      The value described by each line, in file order.

  Raises
  ------
  ValueError, SyntaxError
      If a line is not a valid Python literal.
  """
  import ast  # local import: only this helper needs it

  # Text mode yields str lines (literal_eval does not accept bytes); the
  # context manager closes the handle when the generator finishes or is closed.
  with gzip.open(path, 'rt', encoding='utf-8') as g:
    for l in g:
      l = l.strip()
      if not l:
        # Blank lines carry no record; skip them instead of failing.
        continue
      yield ast.literal_eval(l)

def getDF(path):
  """Load every record yielded by ``parse(path)`` into a DataFrame.

  Records are keyed by their 0-based position in the file; that position
  becomes the row index and each record's keys become the columns.
  """
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

# Load the gzip-compressed reviews file into a DataFrame (one row per line).
full_data = getDF('reviews_Electronics_5.json.gz')

full_data.shape

full_data.head(20)

full_data.columns

full_data.info()

# Missing values per column.
full_data.isnull().sum()

# Same check again: isna() is an alias of isnull().
full_data.isna().sum()

## Popularity recommendation
#### Recommended to the users irrespective of the user preferences based on popularity/rating

* Calculating the mean ratings of items which have at least 50 ratings given by the users.
    1. Keep only the items that have at least 50 ratings given by the users.
    2. Group the dataset by itemID and introduce a column containing mean rating of each item.
    3. Sort the data based on the mean/average rating.
    4. These are the items that can be recommended to the users

In [17]:
rating_data.head(10)

Unnamed: 0,userID,itemID,ratings
0,AKM1MP6P0OYPR,132793040,5.0
1,A2CX7LUOHB2NDG,321732944,5.0
2,A2NWSAGRHCP8N5,439886341,1.0
3,A2WNBOD3WNDNKT,439886341,3.0
4,A1GI0U4ZRJA8WN,439886341,1.0
5,A1QGNMC6O1VW39,511189877,5.0
6,A3J3BRHTDRFJ2G,511189877,2.0
7,A2TY0BTJOTENPG,511189877,5.0
8,A34ATBPOK6HCHY,511189877,5.0
9,A89DO69P0XZ27,511189877,5.0


In [18]:
# Number of distinct items that received at least one rating.
rating_data['itemID'].unique().size

476002

In [19]:
# Number of ratings per item, most-rated first.
# rename()/rename_axis() pin the layout the following cells rely on: counts
# in a column called 'itemID' and an unnamed index of item IDs. pandas >= 2.0
# would otherwise name the column 'count' and the index 'itemID', which makes
# the later reset_index() / drop(columns='index') cells raise.
itemID = pd.DataFrame(
    rating_data['itemID'].value_counts().rename('itemID').rename_axis(None)
)

In [20]:
top_10_popularity_purchases = itemID.head(10)

In [21]:
# Copy the index (the item IDs) into an ordinary column.
itemID['itemID_'] = itemID.index 

In [22]:
# First column = number of ratings per item, second column = the item ID.
itemID.columns = ['count','itemID']

In [23]:
# Replace the item-ID index with a default integer index; the old index is
# moved into a new column (the next cell drops it under the name 'index').
itemID.reset_index(inplace = True)

In [24]:
# The item IDs already live in the 'itemID' column, so the 'index' copy
# created by reset_index() is redundant.
itemID.drop(columns = 'index', inplace = True)

In [25]:
# Abandoned experiment, kept commented out: a row-by-row drop of entries
# whose count is below 50. The vectorised filter in the next cell replaces it.
#for i in range(0,userID['count'].size):
#    if userID.iloc[i,0] < 50:
#        #print(i)
#        userID.drop(index = i, inplace = True)
#        
#userID.iloc[26226,:]
#itemID.drop(index = 26226, inplace = True)

In [26]:
# Keep items with at least 50 ratings; the count column is selected by name
# rather than by position.
itemID_filtered = itemID.loc[itemID['count'] >= 50]

In [27]:
itemID_filtered.shape

(26226, 2)

In [28]:
#rating_data['itemID']. 
#rating_data.equals(itemID_filtered

In [29]:
# Restrict the ratings to the items listed in itemID_filtered.
rating_data_filtered = rating_data.loc[
    rating_data['itemID'].isin(itemID_filtered['itemID'])
]

In [30]:
rating_data.shape

(7824482, 3)

In [31]:
rating_data_filtered.shape

(5374313, 3)

In [32]:
# Mean rating per item (Series indexed by itemID).
popular_items = rating_data_filtered.groupby('itemID')['ratings'].mean()

In [33]:
# Promote the Series of mean ratings to a one-column DataFrame.
popular_items = popular_items.to_frame()

In [34]:
# Turn the itemID index into a regular column.
popular_items = popular_items.reset_index()

In [35]:
# Best-rated items first.
popular_items = popular_items.sort_values(by='ratings', ascending=False)

In [36]:
popular_items.head(10)

Unnamed: 0,itemID,ratings
9273,B002E6R7NG,4.980392
14750,B004I763AW,4.966667
12152,B003J9QQWU,4.964286
13648,B0043ZLFXE,4.955556
4980,B000TMFYBO,4.953125
25892,B00GMRCAC6,4.951872
21219,B008I6RVZU,4.951456
24405,B00CG70K78,4.949367
1228,B0000DYV9H,4.947368
291,B000053HC5,4.945783


In [37]:
# Renumber the rows 0..n-1 in rank order, discarding the old row labels.
popular_items = popular_items.reset_index(drop=True)

In [38]:
top_10_popularity_rating = popular_items.head(10)

In [39]:
top_10_popularity_rating

Unnamed: 0,itemID,ratings
0,B002E6R7NG,4.980392
1,B004I763AW,4.966667
2,B003J9QQWU,4.964286
3,B0043ZLFXE,4.955556
4,B000TMFYBO,4.953125
5,B00GMRCAC6,4.951872
6,B008I6RVZU,4.951456
7,B00CG70K78,4.949367
8,B0000DYV9H,4.947368
9,B000053HC5,4.945783


In [40]:
top_10_popularity_purchases

Unnamed: 0,itemID
B0074BW614,18244
B00DR0PDNE,16454
B007WTAJTO,14172
B0019EHU8G,12285
B006GWO5WK,12226
B003ELYQGG,11617
B003ES5ZUU,10276
B007R5YDYA,9907
B00622AG6S,9823
B0002L5R78,9487


### Building a Baseline Recommendation Model (SlopeOne)

In [2]:
from surprise import SlopeOne 
from surprise import Dataset,Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy
from surprise import Prediction

In [3]:
reader = Reader(rating_scale=(1, 5))

In [41]:
# NOTE: this rebinds `data` (the raw ratings DataFrame loaded at the top of
# the notebook) to a Surprise Dataset built from the filtered ratings.
data = Dataset.load_from_df(rating_data_filtered[['userID', 'itemID', 'ratings']], reader)

In [42]:
trainset, testset = train_test_split(data, test_size=.30,random_state=123)

In [43]:
# NOTE(review): despite its name, `svd_model` holds a SlopeOne estimator
# here, not an SVD. The name is kept because the evaluation cell that
# follows calls `svd_model.test(...)`.
svd_model = SlopeOne()
svd_model.fit(trainset)

<surprise.prediction_algorithms.slope_one.SlopeOne at 0x18b1ef525c0>

In [46]:
# Evaluate on the test set
test_pred = svd_model.test(testset)


In [47]:
# compute RMSE
accuracy.rmse(test_pred)

RMSE: 1.4487


1.448727089867087

In [48]:
# One row per test-set prediction: (user id, item id, estimated rating).
test_pred_df = pd.DataFrame([(p.uid, p.iid, p.est) for p in test_pred])
test_pred_df.head()

Unnamed: 0,0,1,2
0,A3N69X2NUXGUY8,B00IVPU786,5.0
1,A3I1BJIFFM4S21,B00003G1RG,5.0
2,A232PNJSDU7295,B004T9RR6I,4.063422
3,A182ZXQELW2BSM,B004TERRRM,4.063422
4,AS9HM4I2QGLDD,B009YQ8BTI,4.063422


In [52]:
test_pred_df.columns = ["userID", "itemID", "est_rating"]
# Highest estimated ratings first.
test_pred_df = test_pred_df.sort_values(by="est_rating", ascending=False)

In [53]:
# test_pred_df is already sorted by est_rating (descending), so the first
# 10 rows of each user group are that user's highest estimates.
top_10_recos = (
    test_pred_df
    .groupby("userID")
    .head(10)
    .reset_index(drop=True)
)

In [55]:
top_10_recos.head(10)

Unnamed: 0,userID,itemID,est_rating
0,A3S3NGWOXWKXKH,B002PO15GC,5.0
1,A3RVR2AOBWPDNI,B001PIBE8I,5.0
2,A3RVR4JS4Y5244,B004ZMICYA,5.0
3,A3RVRBDMD0F66X,B001AAVA08,5.0
4,A3RVRX6717UTI,B008THTWIW,5.0
5,A3RVS7J3LGUZM,B00BCGRQ5S,5.0
6,A3RVS7R9EVTDHV,B00BFDHVAS,5.0
7,A3RVS7R9EVTDHV,B00D3F7FRU,5.0
8,A3RVS7R9EVTDHV,B008COJXHC,5.0
9,A3RVS7R9EVTDHV,B006LP0FXA,5.0


In [56]:
test_set = pd.DataFrame(testset)

In [57]:
test_set.columns = ['userID', 'itemID', 'ratings']

In [58]:
# Highest true ratings first.
test_set = test_set.sort_values(by="ratings", ascending=False)

In [59]:
# Put the true rating next to the estimate for each recommended pair; the
# join keys are the two columns the frames share.
test_set.merge(top_10_recos, how='inner', on=['userID', 'itemID']).head(10)

Unnamed: 0,userID,itemID,ratings,est_rating
0,A3N69X2NUXGUY8,B00IVPU786,5.0,5.0
1,A3KQRQEYKGN0PR,B001342KM8,5.0,5.0
2,AGHBE9L2JFCMW,B00CMEN95U,5.0,1.0
3,A154JJUW0RIVX3,B006OBGEHW,5.0,4.0
4,A29YK4FZS9WSOT,B005W3DGKQ,5.0,4.063422
5,A3AQ7CODNC9L93,B00B8P8IVO,5.0,4.063422
6,A3PIQXF6EU1VK7,B00BB0ZTJA,5.0,4.063422
7,A37CIHKVU2JUBC,B005F6O2FI,5.0,3.5
8,A16NIG1JUE0TIR,B003C2XSO8,5.0,4.063422
9,A3FYCLG2QNEGJM,B001IA3SZ0,5.0,4.063422


## Collaborative Filtering

In [12]:
reader = Reader(rating_scale=(1, 5))

In [26]:
data = Dataset.load_from_df(rating_data_filtered[['userID', 'itemID', 'ratings']], reader)

In [27]:
trainset, testset = train_test_split(data, test_size=.30,random_state=123)

In [31]:
# SVD initialises its factors randomly; a fixed seed (the same one used for
# the train/test split) makes the fit and the RMSE below reproducible.
# NOTE(review): the RMSE recorded for this model (2.40) is far worse than
# the SlopeOne run (1.45); biased=False may be the cause - TODO confirm.
svd_model = SVD(n_factors=50, biased=False, random_state=123)
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1f51dccd518>

In [33]:
# Evaluate on the test set
test_pred = svd_model.test(testset)

# compute RMSE
accuracy.rmse(test_pred)

RMSE: 2.4013


2.4012517667037474

In [34]:
# One row per test-set prediction: (user id, item id, estimated rating).
test_pred_df = pd.DataFrame([(p.uid, p.iid, p.est) for p in test_pred])
test_pred_df.head()

Unnamed: 0,0,1,2
0,A3N69X2NUXGUY8,B00IVPU786,1.0
1,A3I1BJIFFM4S21,B00003G1RG,2.649229
2,A232PNJSDU7295,B004T9RR6I,4.063422
3,A182ZXQELW2BSM,B004TERRRM,4.063422
4,AS9HM4I2QGLDD,B009YQ8BTI,4.063422


In [36]:
test_pred_df.columns = ["userID", "itemID", "est_rating"]
# Group each user's rows together, highest estimate first within a user
# (both keys are sorted in descending order).
test_pred_df = test_pred_df.sort_values(
    by=["userID", "est_rating"],
    ascending=False,
)

In [37]:
test_pred_df.head()

Unnamed: 0,userID,itemID,est_rating
501952,AZZZSIK7NFFVP,B009FU8BR0,4.063422
260689,AZZZJXM9GW3C5,B000FGEC94,4.063422
806578,AZZZDSAJ757Z4,B007KXI2R8,4.063422
236427,AZZZBHHLU1CMM,B0015ASM2W,4.063422
72495,AZZZBEVIZYROM,B007IL4ROE,4.063422


In [39]:
# test_pred_df is sorted by est_rating (descending) within each user, so the
# first 10 rows of each user group are that user's highest estimates.
top_10_recos = (
    test_pred_df
    .groupby("userID")
    .head(10)
    .reset_index(drop=True)
)

In [48]:
top_10_recos.head(10)

Unnamed: 0,userID,itemID,est_rating
0,AZZZSIK7NFFVP,B009FU8BR0,4.063422
1,AZZZJXM9GW3C5,B000FGEC94,4.063422
2,AZZZDSAJ757Z4,B007KXI2R8,4.063422
3,AZZZBHHLU1CMM,B0015ASM2W,4.063422
4,AZZZBEVIZYROM,B007IL4ROE,4.063422
5,AZZZ3LGTCGUZF,B004M8SBCK,4.063422
6,AZZYYPNQ2ZYH6,B00GHGMCN4,4.063422
7,AZZYYC8OX8ELM,B001O9EUSO,1.0
8,AZZYXVW7BAJVD,B000F763NG,4.063422
9,AZZYWDOD33FFE,B000067RC4,4.063422


In [52]:
test_set = pd.DataFrame(testset)

In [55]:
test_set.columns = ['userID', 'itemID', 'ratings']

In [58]:
# Same ordering as the predictions: by user, then by true rating, both descending.
test_set = test_set.sort_values(by=["userID", "ratings"], ascending=False)

In [62]:
# Put the true rating next to the estimate for each recommended pair; the
# join keys are the two columns the frames share.
test_set.merge(top_10_recos, how='inner', on=['userID', 'itemID']).head(10)

Unnamed: 0,userID,itemID,ratings,est_rating
0,AZZZSIK7NFFVP,B009FU8BR0,5.0,4.063422
1,AZZZJXM9GW3C5,B000FGEC94,5.0,4.063422
2,AZZZDSAJ757Z4,B007KXI2R8,4.0,4.063422
3,AZZZBHHLU1CMM,B0015ASM2W,4.0,4.063422
4,AZZZBEVIZYROM,B007IL4ROE,5.0,4.063422
5,AZZZ3LGTCGUZF,B004M8SBCK,5.0,4.063422
6,AZZYYPNQ2ZYH6,B00GHGMCN4,5.0,4.063422
7,AZZYYC8OX8ELM,B001O9EUSO,5.0,1.0
8,AZZYXVW7BAJVD,B000F763NG,5.0,4.063422
9,AZZYWDOD33FFE,B000067RC4,1.0,4.063422
