<b> using surprise package create a collaborative filtering recommendation system</b>

In [272]:
# 0. Imports
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import surprise

from surprise import SVD, CoClustering # algorithm for RS
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV

In [253]:
# 1. Init data

# columns names
users_columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']  
movies_columns = ['MovieID', 'Title', 'Genres'] 
ratings_columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']

# load files
users = pd.read_csv("dataset/movie_lens_1M/users.dat", delimiter='::', header=None)
movies = pd.read_csv("dataset/movie_lens_1M/movies.dat", delimiter='::', header=None, encoding = "ISO-8859-1")
ratings = pd.read_csv("dataset/movie_lens_1M/ratings.dat", delimiter='::', header=None)

# set column names
users.columns = users_columns
movies.columns = movies_columns
ratings.columns = ratings_columns

In [254]:
# 2. Prepare data

# merge users table
interactions = ratings.merge(users, on='UserID')

# split Zip-code
interactions['National_area'] = interactions['Zip-code'].apply(lambda x: x[:1])
interactions['Sectional_center'] = interactions['Zip-code'].apply(lambda x: x[1:3])
interactions['Delivery_area'] = interactions['Zip-code'].apply(lambda x: x[3:])

# drop Zip-code
interactions.drop(columns=['Zip-code'], inplace=True)

# merge movies table
interactions = interactions.merge(movies, on='MovieID')

In [255]:
interactions.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,National_area,Sectional_center,Delivery_area,Title,Genres
0,1,1193,5,978300760,F,1,10,4,80,67,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,4,80,67,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,10,4,80,67,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,10,4,80,67,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,10,4,80,67,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [257]:
# 3. Init surprise objects; look for algorithm params

# special format for surprise
df = pd.DataFrame({
    'user_id': interactions.UserID,
    'item_id': interactions.Title,
    'interaction': interactions.Rating
})

# create objects Reader and Dataset
reader = Reader(rating_scale=(df.interaction.min(), df.interaction.max()))
dataset = Dataset.load_from_df(df, reader)

# init params grid
parameters = {
    'n_cltr_u': [i for i in range(10, 22, 3)], 'n_epochs': [i for i in range(25, 46, 5)]
}

# Init algorithm
algo = CoClustering()

# run grid search
gscv = GridSearchCV(CoClustering, parameters, cv=5, measures=['RMSE'])
gscv.fit(dataset)


# best RMSE score
print(gscv.best_score["rmse"])
# combination of parameters that gave the best RMSE score
print(gscv.best_params["rmse"])

0.9043804611682811
{'n_cltr_u': 19, 'n_epochs': 30}


In [294]:
# 4. Cross validate 

# init the best gotten model
algo = CoClustering(n_cltr_u = gscv.best_params["rmse"].get('n_cltr_u'), n_epochs = gscv.best_params["rmse"].get('n_epochs'))

cross_validate(algo, dataset, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9028  0.9100  0.9067  0.9082  0.9037  0.9063  0.0027  
Fit time          14.12   13.84   15.22   14.41   13.97   14.31   0.49    
Test time         0.32    1.55    0.89    0.37    0.89    0.80    0.45    


{'test_rmse': array([0.90283702, 0.910017  , 0.90668181, 0.90819844, 0.90371206]),
 'fit_time': (14.121799945831299,
  13.837179899215698,
  15.21561598777771,
  14.414459943771362,
  13.968783140182495),
 'test_time': (0.31636881828308105,
  1.5470731258392334,
  0.889941930770874,
  0.36760997772216797,
  0.8940680027008057)}

In [296]:
# 5. Demonstrate recommendation example

# split data
trainset, testset = train_test_split(dataset, test_size=0.3, random_state=1)

# fit 
algo.fit(trainset)
#get pred
pred = algo.test(testset)

user = 4404
title = 'Very Bad Things (1998)'

# gotten pred
print(f"- gotten prediction: {[p for p in pred if p.uid==user and p.iid==title]}")
print()

# the same result again
new_pred = algo.predict(uid=user, iid=title)
print(f"- the reproduced result: {new_pred}\n")

# real data
interactions[['UserID', 'Title', 'Rating']].loc[(interactions.UserID == user) & (interactions.Title==title)]

- gotten prediction: [Prediction(uid=4404, iid='Very Bad Things (1998)', r_ui=3.0, est=2.3218292833163057, details={'was_impossible': False})]

- the reproduced result: user: 4404       item: Very Bad Things (1998) r_ui = None   est = 2.32   {'was_impossible': False}



Unnamed: 0,UserID,Title,Rating
736644,4404,Very Bad Things (1998),3
