In [119]:
import pandas as pd
import joblib
from pathlib import Path
from typing import Literal

from data_processor import process_datasets

## Define recommender

In [146]:
class Recommender:
    data_idx_mapping = {
        "2080": 1, 
        "disjoint": 2
    }
    def __init__(self, 
                 estimator: Literal["2080", "disjoint"] = "disjoint",
                 models_dir: Path | str = Path("../models"),
                 data_dir: Path | str = Path("../data/interim"),
                 raw_data_path: Path | str = Path("../data/raw/ml-100k")
                 ):
        self.estimator = estimator
        self.models_dir = models_dir
        self.data_dir = data_dir
        self.raw_data_path = raw_data_path
        self.u_data_path = raw_data_path / "u.data"

        data_idx = self.data_idx_mapping[estimator]
        self.train_x = pd.read_parquet(data_dir / f"train_x{data_idx}.parquet")
        self.test_x = pd.read_parquet(data_dir / f"test_x{data_idx}.parquet")
        self.train_y = pd.read_parquet(data_dir / f"train_y{data_idx}.parquet")
        self.test_y = pd.read_parquet(data_dir / f"test_y{data_idx}.parquet")

        self.model = joblib.load(models_dir / f"model_{estimator}.pkl")

    def predict(self, data):
        return self.model.predict(data)
    
    def recommend(self, user_ids, number_of_recs: int = 5):
        item_df = pd.read_csv(self.raw_data_path / "u.item", sep= "|", encoding='latin-1', header=None)
        item_df = item_df.iloc[:,:2]
        item_df.columns = ["item_id", "title"]

        u_data = pd.read_csv(self.u_data_path, sep="\t", header=None)
        u_data.columns = ["user_id", "item_id", "rating", "timestamp"]
        
        u_data = u_data[u_data["user_id"].isin(user_ids)]
        
        recommend_x, recommend_y = process_datasets(u_data)
        ratings = self.predict(recommend_x)

        recommend_x[["user_id", "item_id"]] = u_data[["user_id", "item_id"]]
        recommendation = pd.concat([recommend_x.reset_index(), pd.Series(ratings, name="rating")], axis=1)
        recommendation.rating *= 5

        recommendation = recommendation.join(item_df, on="item_id", lsuffix="_drop")
        recommendation = recommendation[["user_id", "title", "rating", "item_id"]]
        recommendation = recommendation.groupby(by=["user_id"]).apply(lambda x: x.sort_values(by="rating", ascending=False)[:number_of_recs])

        return recommendation

In [147]:
recommender = Recommender()

These user ids weren't used for the training of disjoint model. We can pick any of them to get the movies recommendations using the model. The 2080 model is there but is not representative enough as some user_id-item_id combinations could be in the training set.
## Inference

In [148]:
user_ids = pd.read_parquet(recommender.data_dir / "test_user_ids.parquet")
recommendation = recommender.recommend(user_ids["user_id"][:5])
recommendation

  0%|          | 0/1681 [00:00<?, ?it/s]

  0%|          | 0/1681 [00:00<?, ?it/s]

  0%|          | 0/1680 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=210), Label(value='0 / 210'))), HB…

  0%|          | 0/1680 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=210), Label(value='0 / 210'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))), HBox(c…

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,title,rating,item_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
160,259,160,"Maltese Falcon, The (1941)",4.609278,484
160,412,160,Jack (1996),4.568492,409
160,102,160,It Happened One Night (1934),4.540579,604
160,125,160,Cinema Paradiso (1988),4.516464,170
160,104,160,Legends of the Fall (1994),4.497709,51
264,213,264,"Manchurian Candidate, The (1962)",4.757523,657
264,99,264,Stand by Me (1986),4.478101,655
264,80,264,Forbidden Planet (1956),4.394106,434
264,82,264,Disclosure (1994),4.257335,43
264,232,264,Vertigo (1958),4.214149,479
