In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from surprise import Dataset, Reader, SVD, KNNBasic, accuracy
from surprise.model_selection import train_test_split, cross_validate

In [5]:
# reading the raw data
df = pd.read_csv('../Data/ratings_Electronics (1).csv', names=['userId', 'productId','Rating','timestamp'])
df.head()

Unnamed: 0,userId,productId,Rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7824482 entries, 0 to 7824481
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     object 
 1   productId  object 
 2   Rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 238.8+ MB


In [7]:
df.shape

(7824482, 4)

In [8]:
# converting productId and userid to string
df['productId'] = df['productId'].astype('str')
df['userId'] = df['userId'].astype('str')

In [9]:
# checking for missing values
df.isnull().sum()

userId       0
productId    0
Rating       0
timestamp    0
dtype: int64

In [10]:
# Processing the data to remove inactive users and unpopular products
# Removing users with less than 50 ratings  
user_counts = df['userId'].value_counts()
active_users = user_counts[user_counts >= 50].index
df = df[df['userId'].isin(active_users)]

# Removing products with less than 5 ratings
product_counts = df['productId'].value_counts()
popular_products = product_counts[product_counts >= 5].index
df = df[df['productId'].isin(popular_products)]

# checking the shape of the processed data
df.shape

Unnamed: 0,userId,productId,Rating,timestamp
1310,A3LDPF5FMB782Z,1400501466,5.0,1336003200
1322,A1A5KUIIIHFF4U,1400501466,1.0,1332547200
1335,A2XIOXRRYX0KZY,1400501466,3.0,1371686400
1451,AW3LX47IHPFRL,1400501466,5.0,1339804800
1456,A1E3OB6QMBKRYZ,1400501466,1.0,1350086400
...,...,...,...,...
7824423,A34BZM6S9L7QI4,B00LGQ6HL8,5.0,1405555200
7824424,A1G650TTTHEAL5,B00LGQ6HL8,5.0,1405382400
7824425,A25C2M3QF9G7OQ,B00LGQ6HL8,5.0,1405555200
7824426,A1E1LEVQ9VQNK,B00LGQ6HL8,5.0,1405641600


We managed to remove irrelevant data and reduced total data from 7900k to 66k records. 

In [11]:
# Saving the processed data as a new csv
df.to_csv('../Data/ratings_processed.csv', index=False)