In [1]:
!pip install rectools



In [2]:
!git clone https://github.com/Evgeneugene/Movie-Rec-System.git

fatal: destination path 'Movie-Rec-System' already exists and is not an empty directory.


In [6]:
%cd /content/Movie-Rec-System/notebooks

/content/Movie-Rec-System/notebooks


# Checking score for PopularModel, PureSVDModel and RandomModel

In [9]:
import pandas as pd
import numpy as np
from rectools import Columns
from rectools.dataset import Dataset

data_interim_dir = '../data/interim/'
user_groups = ['u1', 'u2', 'u3', 'u4', 'u5']
data_splits = ['base', 'test']

datasets = {}

for user_group in user_groups:
    for split in data_splits:
        # Construct file paths
        interactions_path = f"{data_interim_dir}{user_group}.{split}.csv"
        user_features_path = f"{data_interim_dir}{user_group}.{split}_user_features.csv"
        item_features_path = f"{data_interim_dir}{user_group}.{split}_item_features.csv"

        # Read the data from CSV files
        interactions_df = pd.read_csv(interactions_path)
        user_features_df = pd.read_csv(user_features_path)
        item_features_df = pd.read_csv(item_features_path)

        dataset = Dataset.construct(
            interactions_df,
            user_features_df=user_features_df,
            cat_user_features=['gender', 'occupation'],  # If these were the categorical features
            item_features_df=item_features_df,
            make_dense_item_features=True  # If this is still applicable
        )

        # Store in the data dictionary
        if user_group not in datasets:
            datasets[user_group] = {}

        datasets[user_group][split] = (dataset, interactions_df)

In [75]:
from rectools.models import ImplicitALSWrapperModel, ImplicitItemKNNWrapperModel, LightFMWrapperModel, RandomModel, PureSVDModel, PopularModel
from rectools.metrics import NDCG, Accuracy, MAP, Recall
from rectools import Columns
from sklearn.metrics import mean_squared_error

k = 10
ndcg = NDCG(k=k, log_base=3)
recall = Recall(k=k)
mmap = MAP(k=k)

# Define a list of models to test
models = [
    ('PureSVD', PureSVDModel()),
    ('Popular', PopularModel()),
    ('Random', RandomModel()),
]

# List to store the results
results = []
models_recs = []
for ug in datasets.keys():
    base_ds = datasets[ug]['base'][0]
    base_df = datasets[ug]['base'][1]
    test_ds = datasets[ug]['test'][0]
    test_df = datasets[ug]['test'][1]

    for model_name, model in models:
        # Fit the model
        model.fit(base_ds)

        # Generate recommendations
        recs = model.recommend(
            users=test_df[Columns.User].unique(),
            dataset=base_ds,
            k=10,
            filter_viewed=True,
        )

        # Evaluate the model
        map_score = mmap.calc(reco=recs, interactions=test_df)
        recall_score = recall.calc(reco=recs, interactions=test_df)
        ndcg_score = ndcg.calc(reco=recs, interactions=test_df)

        # Calculate RMSE
        recs.rename(columns={Columns.Score: Columns.Weight}, inplace=True)
        merged_data = pd.merge(recs, test_df, on=[Columns.User, Columns.Item], suffixes=('_predicted', '_test'))
        rmse = np.sqrt(mean_squared_error(merged_data[Columns.Weight + '_test'], merged_data[Columns.Weight + '_predicted']))

        # Append results to the list
        results.append({
            'User Group': ug,
            'Model': model_name,
            'MAP': map_score,
            'Recall': recall_score,
            'NDCG': ndcg_score,
            'RMSE' : rmse,
        })
        models_recs.append(recs)
# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Save to CSV
results_df.to_csv('model_evaluation_results.csv', index=False)
print(results_df)

   User Group    Model       MAP    Recall      NDCG        RMSE
0          u1  PureSVD  0.124901  0.179516  0.528659    2.465805
1          u1  Popular  0.051588  0.097598  0.320244  366.877314
2          u1   Random  0.001633  0.005260  0.028859    3.709900
3          u2  PureSVD  0.135176  0.208064  0.449230    2.150981
4          u2  Popular  0.056542  0.117283  0.264403  357.369710
5          u2   Random  0.001951  0.006674  0.021253    3.249868
6          u3  PureSVD  0.129553  0.217731  0.379099    2.066222
7          u3  Popular  0.056688  0.114969  0.213131  352.939858
8          u3   Random  0.001572  0.006048  0.013966    3.537231
9          u4  PureSVD  0.137221  0.223266  0.371427    2.108168
10         u4  Popular  0.052612  0.118290  0.197150  351.359170
11         u4   Random  0.002368  0.006675  0.016631    3.903346
12         u5  PureSVD  0.135310  0.224212  0.350231    2.152631
13         u5  Popular  0.054322  0.119300  0.189351  352.519415
14         u5   Random  0

In [12]:
average_metrics = results_df.groupby('Model').mean().reset_index()

std_dev_metrics = results_df.groupby('Model').std().reset_index()
# Save the detailed results to CSV
results_df.to_csv('model_evaluation_results.csv', index=False)

# Save the average metrics to a new CSV
average_metrics.to_csv('model_average_metrics.csv', index=False)

# Print the average metrics DataFrame
average_metrics

  average_metrics = results_df.groupby('Model').mean().reset_index()
  std_dev_metrics = results_df.groupby('Model').std().reset_index()


Unnamed: 0,Model,MAP,Recall,NDCG,RMSE
0,Popular,0.054351,0.113488,0.236856,356.213094
1,PureSVD,0.132432,0.210558,0.415729,2.188762
2,Random,0.001843,0.006507,0.017979,3.672863


In [74]:
print(models_recs[0].head(20))

    user_id  item_id    weight  rank
0         1      100  3.421995     1
1         1      174  3.207084     2
2         1       56  2.726775     3
3         1       12  2.680475     4
4         1      475  2.531188     5
5         1      210  2.457114     6
6         1      318  2.230511     7
7         1      151  2.190664     8
8         1       98  2.150161     9
9         1      408  2.141149    10
10        2      313  1.928529     1
11        2       50  1.912231     2
12        2        9  1.836004     3
13        2       15  1.604099     4
14        2      124  1.532208     5
15        2      117  1.525952     6
16        2      471  1.463370     7
17        2      137  1.406917     8
18        2      328  1.311521     9
19        2      333  1.303981    10


# LightFMWrapperModel best factor search

In [None]:
!pip install rectools[lightfm]

In [15]:
from rectools.models import LightFMWrapperModel
from lightfm import LightFM

In [None]:
from rectools.metrics import NDCG, Recall, MAP
from rectools import Columns
import pandas as pd

# Define metrics
k = 10
ndcg_metric = NDCG(k=k)
recall_metric = Recall(k=k)
map_metric = MAP(k=k)

# List to store the results
results = []
for ug in datasets.keys():
    base_ds = datasets[ug]['base'][0]
    base_df = datasets[ug]['base'][1]
    test_ds = datasets[ug]['test'][0]
    test_df = datasets[ug]['test'][1]

    def make_base_model(factors: int):
        return LightFMWrapperModel(LightFM(no_components=factors, loss="bpr"))

    fitted_models = {}

    factors = [10, 20] + list(range(64, 385, 64))
    for n_factors in factors:
        model = make_base_model(n_factors)
        model.fit(base_ds)
        fitted_models[n_factors] = model

    for n_factors, model in fitted_models.items():
        # Generate recommendations
        recs = model.recommend(
            users=test_df[Columns.User].unique(),
            dataset=base_ds,  # Use the same dataset as used for training
            k=k,
            filter_viewed=True
        )

        # Evaluate the model
        ndcg_score = ndcg_metric.calc(reco=recs, interactions=test_df)
        recall_score = recall_metric.calc(reco=recs, interactions=test_df)
        map_score = map_metric.calc(reco=recs, interactions=test_df)

        # RMSE
        recs.rename(columns={Columns.Score: Columns.Weight}, inplace=True)
        merged_data = pd.merge(recs, test_df, on=[Columns.User, Columns.Item], suffixes=('_predicted', '_test'))
        rmse = np.sqrt(mean_squared_error(merged_data[Columns.Weight + '_test'], merged_data[Columns.Weight + '_predicted']))

        # Store results
        results.append({
            'Fold' : ug,
            'Factors': n_factors,
            'NDCG': ndcg_score,
            'Recall': recall_score,
            'MAP': map_score,
            'RMSE' : rmse
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

In [87]:
# Group by number of factors and calculate the average for each metric
average_results = results_df.groupby('Factors').mean().reset_index()

  average_results = results_df.groupby('Factors').mean().reset_index()


In [91]:
average_results

Unnamed: 0,Factors,NDCG,Recall,MAP,RMSE
0,10,0.221648,0.078979,0.04376,42.636597
1,20,0.231141,0.083301,0.046841,41.882701
2,64,0.259504,0.105807,0.058514,40.757604
3,128,0.284097,0.128887,0.070554,40.064808
4,192,0.277933,0.124825,0.069523,39.892247
5,256,0.272259,0.129972,0.070233,39.128541
6,320,0.280287,0.133455,0.072139,39.415287
7,384,0.274126,0.130782,0.070095,39.718312


Seems like factor = 256 is the best value for LightFMWrapperModel. However, PureSVDModel anyway performs better

# Best model saving

In [94]:
import pickle

best_model = PureSVDModel()

for ug in datasets.keys():
    base_ds = datasets[ug]['base'][0]
    base_df = datasets[ug]['base'][1]
    test_ds = datasets[ug]['test'][0]
    test_df = datasets[ug]['test'][1]

    # Fit the model
    best_model.fit(base_ds)

model_path = '../models/best_model.pickle'

with open(model_path, 'wb') as pickle_file:
    pickle.dump(best_model, pickle_file)