In [3]:
%pip install scipy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import numpy as np

from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

import surprise
from surprise import SVD
from surprise import Reader, Dataset

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

## Loading the dataset and label encoding categorical features

In [5]:
data = pd.read_csv("../data/internim/user_movie_ratings.csv")
data.head(5)

Unnamed: 0,user_id,movie_id,rating,title,release_date,age,sex,occupation,zip_code
0,196,242,3,Kolya (1996),b'24-Jan-1997',49,M,writer,55105
1,196,257,2,Men in Black (1997),b'04-Jul-1997',49,M,writer,55105
2,196,111,4,"Truth About Cats & Dogs, The (1996)",b'26-Apr-1996',49,M,writer,55105
3,196,25,4,"Birdcage, The (1996)",b'08-Mar-1996',49,M,writer,55105
4,196,382,4,"Adventures of Priscilla, Queen of the Desert, ...",b'01-Jan-1994',49,M,writer,55105


In [6]:
data.drop(['title', 'release_date'], axis=1, inplace=True)

In [7]:
data.sample(5)

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
85103,591,523,4,57,F,librarian,92093
87142,829,318,5,48,M,writer,80209
53001,711,941,3,22,F,student,15203
34610,802,288,3,35,M,administrator,34105
26045,450,629,4,35,F,educator,11758


In [8]:
label_encoder = LabelEncoder()


data['sex'] = label_encoder.fit_transform(data['sex'])
data['occupation'] = label_encoder.fit_transform(data['occupation'])
data['zip_code'] = label_encoder.fit_transform(data['zip_code'])

In [9]:
data.sample(5)

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
20800,405,949,5,22,0,7,93
80757,815,479,4,32,1,13,239
360,296,13,3,43,0,0,144
71315,407,705,4,29,1,4,41
4428,202,318,1,41,0,3,467


## Now let's perform matrix formalization using Surprise library.

Used to incorporate collaborative filtering into our solution

In [10]:
# Train/test split 80/20

train_data=data.iloc[:int(data.shape[0]*0.80)]
test_data=data.iloc[int(data.shape[0]*0.80):]

In [11]:
train_data.shape ,test_data.shape

((80000, 7), (20000, 7))

In [12]:
# Standart Reader from surprise lib
reader = Reader(rating_scale=(1,5))

# Creating train and test data sets using surprise library

train_data_mf = Dataset.load_from_df(train_data[['user_id', 'movie_id', 'rating']], reader)
trainset = train_data_mf.build_full_trainset() 

test_data_mf = Dataset.load_from_df(test_data[['user_id', 'movie_id', 'rating']], reader)
testset = test_data_mf.build_full_trainset() 

In [13]:
# Let's set the baseline for our model using SVD from surprise Lib. We can use this result to enhance our main model 

svd = SVD(n_factors=100, biased=True, random_state=42, verbose=True)
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x238b286ddb0>

In [14]:
# Getting the predictions and storing them to pass a later

train_preds = svd.test(trainset.build_testset())
train_pred_mf = np.array([pred.est for pred in train_preds])

test_preds = svd.test(testset.build_testset())
test_pred_mf = np.array([pred.est for pred in test_preds])

In [15]:
# Calculating RMSE and MAE, for baseline
print("For train data:")
train_rmse = surprise.accuracy.rmse(train_preds, verbose=True) 
train_mae = surprise.accuracy.mae(train_preds, verbose=True)
print("For test data:")
test_rmse = surprise.accuracy.rmse(test_preds, verbose=True)
test_mae = surprise.accuracy.mae(test_preds, verbose=True)

For train data:
RMSE: 0.6674
MAE:  0.5281
For test data:
RMSE: 1.0558
MAE:  0.8423


## Let's add some features
Mainly: 
* Similar users(top 5)
* Similar movies(top 5, by ratings)

Global average for:
* Avg rating for all movies by users
* Avg rating for movie by all the users
* Avg rating for all movies by user

In [16]:
train_sparse_matrix = sparse.csr_matrix((train_data.rating.values, (train_data.user_id.values,
                                               train_data.movie_id.values))) # Introduce a sparse matrix for better performance
test_sparse_matrix = sparse.csr_matrix((test_data.rating.values, (test_data.user_id.values,
                                               test_data.movie_id.values)))

# Function to return user averages

def average_ratings(matrix, users):
    axis = 1 if users else 0
    sum_of_ratings = matrix.sum(axis=axis).A1
    is_rated = matrix != 0
    no_of_ratings = is_rated.sum(axis=axis).A1
    user, movie = matrix.shape
    avg_ratings = { i : sum_of_ratings[i]/no_of_ratings[i]
                                for i in range(user if users else movie) 
                                if no_of_ratings[i] != 0}
    return avg_ratings


In [17]:
# Calculating average rating for all movies
train_averages = dict()
train_global_average = train_sparse_matrix.sum()/train_sparse_matrix.count_nonzero()
train_averages['global'] = train_global_average
print(train_averages)

test_averages = dict()
test_global_average = test_sparse_matrix.sum()/test_sparse_matrix.count_nonzero()
test_averages['global'] = test_global_average

{'global': 3.5157375}


In [18]:
import itertools

train_averages['user'] = average_ratings(train_sparse_matrix, users=True)
out = dict(itertools.islice(train_averages['user'].items(), 5))
print("Avg rating for 5 users from data: \n" +  str(out))

train_averages['movie'] =  average_ratings(train_sparse_matrix, users=False)
out = dict(itertools.islice(train_averages['movie'].items(), 5))
print("Avg rating for 5 movies from data: \n" +  str(out))

test_averages['user'] = average_ratings(test_sparse_matrix, users=True)
test_averages['movie'] =  average_ratings(test_sparse_matrix, users=False)

Avg rating for 5 users from data: 
{1: 3.610294117647059, 2: 3.7096774193548385, 3: 2.7962962962962963, 5: 2.874285714285714, 6: 3.6350710900473935}
Avg rating for 5 movies from data: 
{1: 3.8435374149659864, 2: 3.2241379310344827, 3: 3.0, 4: 3.543956043956044, 5: 3.26027397260274}


In [19]:
train_users, train_movies, train_ratings = sparse.find(train_sparse_matrix)
test_users, test_movies, test_ratings = sparse.find(test_sparse_matrix)

In [20]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

train_df = pd.DataFrame()
test_df = pd.DataFrame()

def collect_dataframe(dataframe, users, movies, ratings, averages):
    for (usr, mov, rat) in zip(users, movies, ratings):
        if dataframe is train_df:
            # Get the similar movies to the current movie rated by part. user
            movie_sim = cosine_similarity(train_sparse_matrix[:, mov].T, train_sparse_matrix.T).ravel()
            top_sim_movies = movie_sim.argsort()[::-1][1:]
            top_ratings_mov = train_sparse_matrix[usr, top_sim_movies].toarray().ravel()
            top_sim_movies_ratings = list(top_ratings_mov[top_ratings_mov != 0][:5])
            top_sim_movies_ratings.extend([train_averages['user'][usr]] * (5 - len(top_sim_movies_ratings)))

            # Get the similar users to the current user
            user_sim = cosine_similarity(train_sparse_matrix[usr], train_sparse_matrix).ravel()
            top_sim_users = user_sim.argsort()[::-1][1:]
            top_ratings_usr = train_sparse_matrix[top_sim_users, mov].toarray().ravel()
            top_sim_users_ratings = list(top_ratings_usr[top_ratings_usr != 0][:5])
            top_sim_users_ratings.extend([train_averages['movie'][mov]] * (5 - len(top_sim_users_ratings)))
        
        elif dataframe is test_df:
            user_sim = cosine_similarity(test_sparse_matrix[usr], test_sparse_matrix).ravel()
            top_sim_users = user_sim.argsort()[::-1][1:]
            top_ratings_usr = test_sparse_matrix[top_sim_users, mov].toarray().ravel()
            top_sim_users_ratings = list(top_ratings_usr[top_ratings_usr != 0][:5])
            top_sim_users_ratings.extend([test_averages['movie'][mov]]*(5 - len(top_sim_users_ratings)))

            movie_sim = cosine_similarity(test_sparse_matrix[:,mov].T, test_sparse_matrix.T).ravel()
            top_sim_movies = movie_sim.argsort()[::-1][1:]
            top_ratings = test_sparse_matrix[usr, top_sim_movies].toarray().ravel()
            top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
            top_sim_movies_ratings.extend([test_averages['user'][usr]]*(5-len(top_sim_movies_ratings)))


        # Collect everything to the dataset
        data_obtained = list()
        data_obtained.append(usr)
        data_obtained.append(mov)
        data_obtained.append(averages['global'])
        data_obtained.extend(top_sim_users_ratings)
        data_obtained.extend(top_sim_movies_ratings)
        data_obtained.append(averages['user'][usr])
        data_obtained.append(averages['movie'][mov])
        data_obtained.append(rat)

        dataframe = dataframe.append([data_obtained])

    return dataframe

train_df = collect_dataframe(train_df,  train_users, train_movies, train_ratings, train_averages)
test_df = collect_dataframe(test_df,  test_users, test_movies, test_ratings, test_averages)

KeyboardInterrupt: 

In [None]:
train_df.shape

(80000, 16)

In [None]:
train_df.columns=['user', 'movie', 'AvgR', 'sim_rat_u1', 'sim_rat_u2', 'sim_rat_u3', 'sim_rat_u4', 'sim_rat_u5',
            'sim_rat_m1', 'sim_rat_m2', 'sim_rat_m3', 'sim_rat_m4', 'sim_rat_m5', 'Usr_Avg', 'Mov_Avg', 'rating']

train_df['svd_mf'] = train_pred_mf

test_df.columns=['user', 'movie', 'AvgR', 'sim_rat_u1', 'sim_rat_u2', 'sim_rat_u3', 'sim_rat_u4', 'sim_rat_u5',
            'sim_rat_m1', 'sim_rat_m2', 'sim_rat_m3', 'sim_rat_m4', 'sim_rat_m5', 'Usr_Avg', 'Mov_Avg', 'rating']

test_df['svd_mf'] = test_pred_mf

In [None]:
# Save resulting df's as it takes time to collect the dataset

train_df.to_csv('../data/external/final_train_svd.csv', index=False)
test_df.to_csv('../data/external/final_test_svd.csv', index=False)

## Implementing XGBoost model:

There I have implemented the XGBoost model and got some results

In [None]:
def get_error_metrics(y_true, y_pred):
    rmse = np.sqrt(np.mean([ (y_true[i] - y_pred[i])**2 for i in range(len(y_pred)) ]))
    mape = np.mean(np.abs( (y_true - y_pred)/y_true )) * 100
    return rmse, mape

In [None]:
x_train = train_df.drop(['user', 'movie','rating'], axis=1)
y_train = train_df['rating']

x_test = test_df.drop(['user','movie','rating'], axis=1)
y_test = test_df['rating']

In [None]:
xgb_model = xgb.XGBRegressor(silent=False, n_jobs=13, random_state=15, n_estimators=100)


train_results = dict()
test_results = dict()

print('Training the model..')
xgb_model.fit(x_train, y_train, eval_metric = 'rmse')
print('Done')

Training the model..
Done


In [None]:
y_test_pred = xgb_model.predict(x_test) 
rmse_test, mape_test = get_error_metrics(y_true=y_test.values, y_pred=y_test_pred)
test_results = {'rmse': rmse_test,
                    'mape' : mape_test,
                    'predictions':y_test_pred}

Evaluating Test data


In [None]:
test_results # Evaluation of the model

{'rmse': 0.9601231849841207,
 'mape': 30.362102796668804,
 'predictions': array([3.4052298, 3.2814798, 3.9577622, ..., 2.0483723, 1.7854112,
        3.6811721], dtype=float32)}

## Lets look at some predictions!

Considering the fact, that model never seen this data, we can suggest user some movies, based on predicted rating that might be interesting to him/her

In [None]:
test_df['predicted_rating'] = xgb_model.predict(x_test)

In [66]:
recommendations = pd.read_csv('../references/user_recs.csv')

def print_user_ratings(recommendations_df, user_id, top_n):
    user_ratings = recommendations_df[recommendations_df['user'] == user_id]
    user_ratings = user_ratings.sort_values(by='predicted_rating', ascending=False).head(top_n)
    
    print(user_ratings)

user_id = 17
top_n = 5 
print_user_ratings(recommendations, user_id, top_n)


     user  movie  predicted_rating
98     17    471          3.589669
104    17    269          3.548952
102    17    100          3.513165
86     17    475          3.463399
85     17    744          3.454559


In [65]:
recommendations.to_csv('../references/user_recs.csv', index=False)

## Saving model

In [None]:
xgb_model.save_model('../models/model.json')

In [26]:
xgb_model = xgb.Booster()
xgb_model.load_model('../models/model.json')