In [1]:
import pickle
import pandas as pd

from abc import ABC, abstractmethod
from src.config import Config
from src.dataset import DatasetProcessor
from src.metrics import PrecisionAtK, RecallAtK

In [2]:
# Fail fast, with a readable message, if the preprocessed interactions file is missing.
assert Config.PREPROCESSED_INTERACTIONS_PATH.exists(), (
    f"Preprocessed interactions not found at {Config.PREPROCESSED_INTERACTIONS_PATH}"
)

In [3]:
# Deserialize the interactions table from its pickle file.
# NOTE: pickle.load can execute arbitrary code; only load files this project wrote itself.
with Config.PREPROCESSED_INTERACTIONS_PATH.open("rb") as interactions_file:
    interactions = pickle.load(interactions_file)

In [4]:
# Inspect the freshly loaded interactions table.
interactions

Unnamed: 0,user_id,item_id,progress,rating,start_date
0,126706,14433,80,,2018-01-01
1,127290,140952,58,,2018-01-01
2,66991,198453,89,,2018-01-01
3,46791,83486,23,5.0,2018-01-01
4,79313,188770,88,5.0,2018-01-01
...,...,...,...,...,...
1532993,153908,98585,44,,2019-02-11
1532994,154008,251969,4,,2018-04-08
1532995,154892,298192,68,5.0,2019-02-16
1532996,156948,38118,78,5.0,2018-08-19


In [5]:
# interest_score is the mean of two shares:
#   * progress / 100
#   * rating / 5
# A missing progress or rating contributes 0 to the mean.
interactions["interest_score"] = (
    interactions["progress"].fillna(0.0).div(100)
    .add(interactions["rating"].astype(float).fillna(0.0).div(5))
    .div(2)
)

In [6]:
# Display the table again to check the newly added interest_score column.
interactions

Unnamed: 0,user_id,item_id,progress,rating,start_date,interest_score
0,126706,14433,80,,2018-01-01,0.400
1,127290,140952,58,,2018-01-01,0.290
2,66991,198453,89,,2018-01-01,0.445
3,46791,83486,23,5.0,2018-01-01,0.615
4,79313,188770,88,5.0,2018-01-01,0.940
...,...,...,...,...,...,...
1532993,153908,98585,44,,2019-02-11,0.220
1532994,154008,251969,4,,2018-04-08,0.020
1532995,154892,298192,68,5.0,2019-02-16,0.840
1532996,156948,38118,78,5.0,2018-08-19,0.890


In [7]:
# Split the interactions into a training part and a held-out evaluation part.
# NOTE(review): presumably the most recent Config.TEST_DAYS days (by start_date) become
# `test` — confirm against DatasetProcessor.split_train_test in src.dataset.
train, test = DatasetProcessor.split_train_test(interactions, "start_date", Config.TEST_DAYS)

In [8]:
# The two parts together must contain exactly as many rows as the full table.
assert len(interactions) == len(train) + len(test)

In [9]:
class BaseLine(ABC):
    """Abstract recommender that serves one fixed, ranked list of item ids.

    Subclasses implement ``fit`` and store up to ``max_K`` item ids, best first,
    in ``self.recommendations``; ``recommend`` returns the first ``K`` of them.
    """

    def __init__(self, max_K: int):
        """Store the ranking size limit and start with an empty ranking.

        Args:
            max_K: Largest ``K`` that ``recommend`` will accept.
        """
        self.max_K = max_K
        # An empty Series (rather than a list) keeps the return type of
        # ``recommend`` consistent even before ``fit`` has been called.
        self.recommendations = pd.Series(dtype="object")

    def __validate_max_K(self, K):
        """Check that ``0 <= K <= self.max_K``.

        Raises:
            ValueError: If ``K`` is negative or exceeds ``self.max_K``.
        """
        if K < 0:
            # Without this guard a negative K would silently slice from the
            # end of the ranking instead of failing.
            raise ValueError(f"K must be non-negative, got {K}.")
        if K > self.max_K:
            raise ValueError(f"There are only {self.max_K} recommendations in the model.")

    @abstractmethod
    def fit(self, train: pd.DataFrame) -> None:
        """Build ``self.recommendations`` from the training interactions."""
        pass

    def recommend(self, K: int) -> pd.Series:
        """Return the first ``K`` stored recommendations.

        Args:
            K: Number of items to return; must satisfy ``0 <= K <= max_K``.

        Returns:
            The leading ``K`` entries of ``self.recommendations``.

        Raises:
            ValueError: If ``K`` is negative or greater than ``max_K``.
        """
        self.__validate_max_K(K)
        return self.recommendations[:K]
        
    
class FreqBaseLine(BaseLine):
    """Baseline that recommends the most frequently occurring items of ``train``."""

    def fit(self, train):
        """Keep the ``max_K`` item ids that appear in the most rows of ``train``."""
        # value_counts() already orders items from most to least frequent.
        interaction_counts = train["item_id"].value_counts()
        top_items = interaction_counts.index[: self.max_K]
        self.recommendations = pd.Series(top_items)

    
class RatingBaseLine(BaseLine):
    """Baseline that recommends the items with the largest total rating in ``train``."""

    def fit(self, train):
        """Keep the ``max_K`` item ids with the highest sum of ratings.

        Rows without a rating are dropped first, so items that were never
        rated do not enter the ranking at all.

        Args:
            train: Interactions with at least ``item_id`` and ``rating`` columns.
        """
        rated = train.dropna(subset=["rating"])
        # sum() takes no column selector: the column is already chosen by the
        # ["rating"] indexing, and a positional argument would be read as
        # ``numeric_only``.
        rating_totals = rated.groupby("item_id")["rating"].sum()
        top_items = rating_totals.sort_values(ascending=False).head(self.max_K).index
        self.recommendations = pd.Series(top_items)


class InterestScoreBaseline(BaseLine):
    """Baseline that recommends the items with the largest total interest score."""

    def fit(self, train):
        """Keep the ``max_K`` item ids with the highest sum of ``interest_score``.

        Args:
            train: Interactions with at least ``item_id`` and ``interest_score`` columns.
        """
        # sum() takes no column selector: the column is already chosen by the
        # ["interest_score"] indexing, and a positional argument would be read
        # as ``numeric_only``.
        score_totals = train.groupby("item_id")["interest_score"].sum()
        top_items = score_totals.sort_values(ascending=False).head(self.max_K).index
        self.recommendations = pd.Series(top_items)

In [10]:
# Popularity baseline: rank items by how often they occur in the training interactions.
freq_model = FreqBaseLine(max_K=Config.BASE_RECS_COUNT)
freq_model.fit(train=train)

In [11]:
# Rating baseline: rank items by the sum of the ratings they received in training.
rating_model = RatingBaseLine(max_K=Config.BASE_RECS_COUNT)
rating_model.fit(train=train)

In [12]:
# Interest-score baseline: rank items by the sum of interest_score in training.
score_model = InterestScoreBaseline(max_K=Config.BASE_RECS_COUNT)
score_model.fit(train=train)

In [13]:
# Fitted baselines keyed by the short name used when reporting metrics.
models = dict(freq=freq_model, rating=rating_model, score=score_model)

In [14]:
# Both metrics are built with the same cut-off, Config.K.
precision, recall = PrecisionAtK(Config.K), RecallAtK(Config.K)
metrics = dict(precision=precision, recall=recall)

In [15]:
# The items each test user interacted with depend on neither the model nor the
# metric, so group the test set once instead of re-filtering it for every row.
test_items_by_user = test.groupby("user_id")["item_id"]

for model_name, model in models.items():
    print(f"Model '{model_name}':")
    # Every baseline serves the same list to all users, so fetch it once per model.
    recs = model.recommend(Config.K)
    for metric_name, metric in metrics.items():
        # Score each user exactly once ...
        metric_by_user = {
            user: metric.calculate(recs, user_items)
            for user, user_items in test_items_by_user
        }
        # ... then average over test rows, so the reported value stays the same
        # as when the metric was recomputed for every row.
        # NOTE(review): this weights each user by their number of test
        # interactions; if a plain per-user average is intended, average the
        # values of `metric_by_user` instead.
        metric_val = test["user_id"].map(metric_by_user).mean()
        print(f"\tMetric {metric_name}@{Config.K}: {metric_val}")

Model 'freq':
	Metric precision@10: 0.0006264985691082054
	Metric recall@10: 0.0030164745920024747
Model 'rating':
	Metric precision@10: 0.0004331348132106115
	Metric recall@10: 0.002629747080207286
Model 'score':
	Metric precision@10: 0.0006961095212313391
	Metric recall@10: 0.0036352386108747786


In [16]:
# Persist the fitted interest-score baseline to Config.BASELINE_PATH.
# NOTE(review): InterestScoreBaseline is defined inside this notebook and pickle
# stores classes by reference, so any code that loads this file needs the same
# class definition available — consider moving the baseline classes into a module.
with Config.BASELINE_PATH.open("wb") as baseline_file:
    pickle.dump(score_model, baseline_file)