In [2]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import datetime
import math

from recommenders.utils.timer import Timer
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions


from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.model_selection import cross_validate, train_test_split
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV
from surprise import accuracy


In [6]:
movies = pd.read_csv(r'C:\Users\mirza\Desktop\dataset\MovieLens-Credits\movies_metadata.csv', 
        encoding = "ISO-8859-1"
    )

movies = movies[movies["id"].apply(lambda x: x.isnumeric())]
movies["id"]= movies["id"].astype(int)
movies.rename(columns = {'id':"itemId"}, inplace = True)

import ast

def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i["name"])
    
    return L
movies["genres"]= movies["genres"].apply(convert)

  movies = pd.read_csv(r'C:\Users\mirza\Desktop\dataset\MovieLens-Credits\movies_metadata.csv',


In [7]:
ratings_data= pd.read_csv(
        r'C:\Users\mirza\Desktop\dataset\MovieLens-Credits\ratings.csv', nrows=2500000,
        encoding = "ISO-8859-1",
        header = 0,
        names=["userId", "itemId", "rating", 'timestamp']
    )
ratings_data = pd.merge(ratings_data[["userId","itemId","rating"]], movies[["itemId","genres", "title"]], how="inner", on="itemId")

ratings = ratings_data[["userId","itemId", "rating"]]

In [8]:
ratings_data 

Unnamed: 0,userId,itemId,rating,genres,title
0,1,110,1.0,"[Drama, Mystery, Romance]",Three Colors: Red
1,11,110,3.5,"[Drama, Mystery, Romance]",Three Colors: Red
2,22,110,5.0,"[Drama, Mystery, Romance]",Three Colors: Red
3,24,110,5.0,"[Drama, Mystery, Romance]",Three Colors: Red
4,29,110,3.0,"[Drama, Mystery, Romance]",Three Colors: Red
...,...,...,...,...,...
1099674,25806,167858,5.0,"[Family, Fantasy, Adventure]",The Story of the Voyages
1099675,25808,8452,2.0,[Science Fiction],The 6th Day
1099676,25808,43828,4.0,"[Action, Comedy, Western]",Destry Rides Again
1099677,25808,116973,3.5,"[Adventure, Mystery]",Four Men and a Prayer


## Find best model results with sample of data

## Train model

In [14]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings, reader=reader)

train_set, test_set = train_test_split(data, test_size=.10)

In [15]:
from surprise.model_selection import KFold
from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):

    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        user_ratings.sort(key=lambda x: x[0], reverse=True)

        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls



kf = KFold(n_splits=10)
algo = SVD(random_state=0, n_factors=300,reg_all=0.3, n_epochs=30, lr_all= 0.006)

for trainset, testset in kf.split(data):
    algo.fit(train_set)
    predictions = algo.test(test_set)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)


    print("RMSE: {}".format(accuracy.rmse(predictions)))
    print("MAE: {}".format(accuracy.mae(predictions)))
    print("Precision value : {}".format(sum(prec for prec in precisions.values()) / len(precisions)))
    print("Recalls value : {}".format(sum(rec for rec in recalls.values()) / len(recalls)))
    print(50*"-")

RMSE: 0.8912
RMSE: 0.891225080639343
MAE:  0.6920
MAE: 0.6920092082437453
Precision value : 0.3180782285437212
Recalls value : 0.22114511465750794
--------------------------------------------------
RMSE: 0.8912
RMSE: 0.891225080639343
MAE:  0.6920
MAE: 0.6920092082437453
Precision value : 0.3180782285437212
Recalls value : 0.22114511465750794
--------------------------------------------------
RMSE: 0.8912
RMSE: 0.891225080639343
MAE:  0.6920
MAE: 0.6920092082437453
Precision value : 0.3180782285437212
Recalls value : 0.22114511465750794
--------------------------------------------------
RMSE: 0.8912
RMSE: 0.891225080639343
MAE:  0.6920
MAE: 0.6920092082437453
Precision value : 0.3180782285437212
Recalls value : 0.22114511465750794
--------------------------------------------------
RMSE: 0.8912
RMSE: 0.891225080639343
MAE:  0.6920
MAE: 0.6920092082437453
Precision value : 0.3180782285437212
Recalls value : 0.22114511465750794
--------------------------------------------------
RMSE: 0.89

In [16]:
preds = pd.DataFrame(predictions)
preds.rename(columns={"uid": "userId", "iid": "itemId", "r_ui": "ratings", "est":"predicts"}, inplace=True)
preds.drop("details", axis=1, inplace=True)
preds

Unnamed: 0,userId,itemId,ratings,predicts
0,6659,2026,2.0,2.287446
1,17442,1249,4.0,3.820520
2,871,6552,4.0,3.843175
3,6893,2791,5.0,4.059408
4,8915,914,4.5,3.542818
...,...,...,...,...
109963,20461,4990,3.5,2.906697
109964,10050,1682,1.0,3.168629
109965,7606,216,2.0,3.583067
109966,13518,1682,3.0,3.908550


In [17]:
train_set

<surprise.trainset.Trainset at 0x1d94aec1940>

In [18]:
train = train_set.all_ratings()
train = pd.DataFrame(train, columns=['uid', 'iid', 'rating'])
train.rename(columns={"uid": "userId", "iid":"itemId", "rating":"ratings"}, inplace=True)
train

Unnamed: 0,userId,itemId,ratings
0,0,0,4.0
1,0,98,4.0
2,0,856,4.0
3,0,1240,2.0
4,0,1274,3.0
...,...,...,...
989706,25361,891,5.0
989707,25362,224,5.0
989708,25363,451,4.0
989709,25364,194,2.0


In [19]:
all_predictions = compute_ranking_predictions(algo, train, usercol='userId', itemcol='itemId', remove_seen=True)

In [10]:
pickle.dump(ratings, open("svd_ratings_data.pkl","wb" ))
pickle.dump(all_predictions, open("svd_all_predictions.pkl","wb" ))
pickle.dump(algo, open("model_svd.pkl","wb" ))
pickle.dump(ratings_data, open("ratings_data.pkl","wb" ))