In [1]:
import pickle
import pandas as pd

from abc import ABC, abstractmethod
from src.config import Config
from src.dataset import Interactions
from src.metrics import PrecisionAtK, RecallAtK

In [2]:
# Fail fast, with a readable message, when the preprocessed interactions file is missing.
assert Config.PREPROCESSED_INTERACTIONS_PATH.exists(), (
    f"Preprocessed interactions not found at {Config.PREPROCESSED_INTERACTIONS_PATH}"
)

In [3]:
# Read the pickled payload first, then wrap it in the project's Interactions container.
# NOTE(review): pickle.load can execute arbitrary code - only load files produced by this project.
with Config.PREPROCESSED_INTERACTIONS_PATH.open("rb") as file_object:
    raw_interactions = pickle.load(file_object)
interactions = Interactions(raw_interactions)

In [4]:
# Inspect the training split; as the cell's last expression it is rendered by the notebook.
interactions.train

Unnamed: 0,user_id,item_id,progress,rating,start_date,interest_score
0,126706,14433,80,,2018-01-01,0.400
1,127290,140952,58,,2018-01-01,0.290
2,66991,198453,89,,2018-01-01,0.445
3,46791,83486,23,5.0,2018-01-01,0.615
4,79313,188770,88,5.0,2018-01-01,0.940
...,...,...,...,...,...,...
1532993,153908,98585,44,,2019-02-11,0.220
1532994,154008,251969,4,,2018-04-08,0.020
1532995,154892,298192,68,5.0,2019-02-16,0.840
1532996,156948,38118,78,5.0,2018-08-19,0.890


In [5]:
# Inspect the test split; as the cell's last expression it is rendered by the notebook.
interactions.test

Unnamed: 0,user_id,item_id,progress,rating,start_date,interest_score
1517914,38753,135245,0,,2019-12-25,0.000
1517915,101642,319500,67,5.0,2019-12-25,0.835
1517916,13548,251184,0,,2019-12-25,0.000
1517917,130425,193445,98,,2019-12-25,0.490
1517918,93986,80733,47,,2019-12-25,0.235
...,...,...,...,...,...,...
1530838,141930,219928,90,,2019-12-30,0.450
1530839,53358,42887,58,,2019-12-30,0.290
1530840,151170,284652,27,,2019-12-30,0.135
1530841,141293,273421,48,,2019-12-30,0.240


In [6]:
class BaseLine(ABC):
    """Base class for non-personalised baselines that serve one global ranking.

    Subclasses implement :meth:`fit`, which is expected to store at most
    ``max_K`` item ids, best first, in ``self.recommendations``.
    :meth:`recommend` returns a prefix of that ranking; it takes no user
    argument, so every user gets the same list.
    """

    def __init__(self, max_K: int):
        """Remember the size limit; the ranking stays empty until ``fit`` runs.

        Args:
            max_K: Largest number of recommendations the model keeps.
        """
        self.max_K = max_K
        # Placeholder until fit() replaces it with the ranked item ids.
        self.recommendations = []

    def __validate_max_K(self, K):
        """Raise ``ValueError`` unless ``0 <= K <= max_K``."""
        if K < 0:
            # A negative K would be interpreted as a slice bound and silently
            # return "everything except the last |K| items" instead of a top-K.
            raise ValueError(f"K must be non-negative, got {K}.")
        if K > self.max_K:
            raise ValueError(f"There are only {self.max_K} recommendations in the model.")

    @abstractmethod
    def fit(self, train: pd.DataFrame) -> None:
        """Build ``self.recommendations`` from the training interactions."""

    def recommend(self, K: int) -> pd.Series:
        """Return the first ``K`` stored recommendations.

        Args:
            K: Number of items requested; must satisfy ``0 <= K <= max_K``.

        Returns:
            The leading ``K`` entries of ``self.recommendations`` (fewer if
            ``fit`` stored fewer than ``K`` items, empty if ``fit`` has not run).

        Raises:
            ValueError: If ``K`` is negative or exceeds ``max_K``.
        """
        self.__validate_max_K(K)
        return self.recommendations[:K]
        
    
class FreqBaseLine(BaseLine):
    """Baseline that ranks items by how many interaction rows they appear in."""

    def fit(self, train):
        """Keep the ``max_K`` most frequent ``item_id`` values, most frequent first.

        Args:
            train: Interactions frame with an ``item_id`` column.
        """
        # value_counts() already orders items from most to least frequent.
        interaction_counts = train["item_id"].value_counts()
        top_items = interaction_counts.head(self.max_K).index
        self.recommendations = pd.Series(top_items)

    
class RatingBaseLine(BaseLine):
    """Baseline that ranks items by the total of the explicit ratings they received."""

    def fit(self, train):
        """Keep the ``max_K`` items with the largest rating sum, largest first.

        Rows without a rating are dropped before grouping so that items which
        were never rated do not enter the ranking at all.

        Args:
            train: Interactions frame with ``item_id`` and ``rating`` columns.
        """
        rated = train[train["rating"].notna()]
        # Plain .sum(): the first positional parameter of GroupBy.sum is
        # `numeric_only`, so passing a column list there was never meaningful.
        rating_totals = rated.groupby("item_id")["rating"].sum()
        ranked = rating_totals.sort_values(ascending=False)
        self.recommendations = pd.Series(ranked.head(self.max_K).index)


class InterestScoreBaseline(BaseLine):
    """Baseline that ranks items by their summed ``interest_score``."""

    def fit(self, train):
        """Keep the ``max_K`` items with the largest total interest score, largest first.

        Args:
            train: Interactions frame with ``item_id`` and ``interest_score`` columns.
        """
        # Plain .sum(): the first positional parameter of GroupBy.sum is
        # `numeric_only`, so passing a column list there was never meaningful.
        score_totals = train.groupby("item_id")["interest_score"].sum()
        ranked = score_totals.sort_values(ascending=False)
        self.recommendations = pd.Series(ranked.head(self.max_K).index)

In [7]:
# Frequency baseline: keeps up to BASE_RECS_COUNT of the most-interacted items from the training split.
freq_model = FreqBaseLine(max_K=Config.BASE_RECS_COUNT)
freq_model.fit(train=interactions.train)

In [8]:
# Rating baseline: keeps up to BASE_RECS_COUNT items with the largest summed rating in the training split.
rating_model = RatingBaseLine(max_K=Config.BASE_RECS_COUNT)
rating_model.fit(train=interactions.train)

In [9]:
# Interest-score baseline: keeps up to BASE_RECS_COUNT items with the largest summed interest_score.
score_model = InterestScoreBaseline(max_K=Config.BASE_RECS_COUNT)
score_model.fit(train=interactions.train)

In [10]:
# Display name -> fitted baseline; the evaluation loop iterates in this insertion order.
models = dict(freq=freq_model, rating=rating_model, score=score_model)

In [11]:
# Both metrics are configured with the same cut-off, Config.K.
precision, recall = PrecisionAtK(Config.K), RecallAtK(Config.K)
# Display name -> metric object; the evaluation loop iterates in this insertion order.
metrics = dict(precision=precision, recall=recall)

In [12]:
# Split the test interactions by user once, up front. Filtering the whole test
# frame with a boolean mask for every user (and again for every model/metric
# pair) is quadratic in the size of the test set.
# sort=False keeps users in order of first appearance, i.e. the same order that
# drop_duplicates() on the user_id column would produce, so the averages below
# are computed over the same values in the same order.
# Assumes user_id has no missing values (groupby drops NaN keys by default).
test_interactions = interactions.test
items_by_user = [
    user_items
    for _, user_items in test_interactions.groupby("user_id", sort=False)["item_id"]
]

for model_name, model in models.items():
    print(f"Model '{model_name}':")
    # The baselines are not personalised: one top-K list is scored against every user.
    recs = model.recommend(Config.K)
    for metric_name, metric in metrics.items():
        per_user_scores = pd.Series(
            [metric.calculate(recs, user_items) for user_items in items_by_user],
            dtype="float64",
        )
        # Average the per-user metric values over all users in the test split.
        metric_val = per_user_scores.mean()
        print(f"\tMetric {metric_name}@{Config.K}: {metric_val}")

Model 'freq':
	Metric precision@10: 0.000412916887241927
	Metric recall@10: 0.002994949876897997
Model 'rating':
	Metric precision@10: 0.00035997882477501326
	Metric recall@10: 0.0027739544732662785
Model 'score':
	Metric precision@10: 0.0004976177871889889
	Metric recall@10: 0.003659196517851891


In [13]:
# Persist the interest-score baseline, which has the highest precision@10 and recall@10 in the printed results.
# NOTE(review): InterestScoreBaseline is defined inside this notebook, so code that
# unpickles this file presumably needs the same class available under the same module name - confirm.
with Config.BASELINE_PATH.open("wb") as baseline_file:
    pickle.dump(obj=score_model, file=baseline_file)