In [1]:
import implicit
import pickle
import pandas as pd

from scipy.sparse import csr_matrix
from src.config import Config
from src.dataset import DatasetProcessor
from src.metrics import PrecisionAtK, RecallAtK

In [2]:
# Load the preprocessed interactions frame from the path configured in Config.
# NOTE(review): pickle can execute arbitrary code while loading, so only read
# files produced by this project's own preprocessing step.
with Config.PREPROCESSED_INTERACTIONS_PATH.open("rb") as source:
    interactions = pickle.load(source)

In [3]:
# Blend the two feedback signals into one score: the mean of progress / 100
# and rating / 5. Missing values in either column are counted as 0, so an
# unrated interaction contributes only its progress half (e.g. progress 80
# with no rating gives 0.400).
# NOTE(review): treating "not rated" like the lowest possible rating is
# presumably intentional; confirm.
progress_share = interactions["progress"].fillna(0.0) / 100
rating_share = interactions["rating"].astype(float).fillna(0.0) / 5
interactions["interest_score"] = (progress_share + rating_share) / 2

In [4]:
# Preview the interactions frame, which now carries the derived interest_score column.
interactions

Unnamed: 0,user_id,item_id,progress,rating,start_date,interest_score
0,126706,14433,80,,2018-01-01,0.400
1,127290,140952,58,,2018-01-01,0.290
2,66991,198453,89,,2018-01-01,0.445
3,46791,83486,23,5.0,2018-01-01,0.615
4,79313,188770,88,5.0,2018-01-01,0.940
...,...,...,...,...,...,...
1532993,153908,98585,44,,2019-02-11,0.220
1532994,154008,251969,4,,2018-04-08,0.020
1532995,154892,298192,68,5.0,2019-02-16,0.840
1532996,156948,38118,78,5.0,2018-08-19,0.890


In [5]:
# Split the interactions on "start_date" using the project's helper.
# NOTE(review): presumably the last Config.TEST_DAYS days become the test
# split; confirm in src.dataset.DatasetProcessor.
train, test = DatasetProcessor.split_train_test(interactions, "start_date", Config.TEST_DAYS)

# Keep only the columns the recommenders and metrics consume downstream.
model_columns = ["user_id", "item_id", "interest_score"]
train, test = train[model_columns], test[model_columns]

In [6]:
# Preview the training split (user_id, item_id, interest_score).
train

Unnamed: 0,user_id,item_id,interest_score
0,126706,14433,0.400
1,127290,140952,0.290
2,66991,198453,0.445
3,46791,83486,0.615
4,79313,188770,0.940
...,...,...,...
1532993,153908,98585,0.220
1532994,154008,251969,0.020
1532995,154892,298192,0.840
1532996,156948,38118,0.890


In [7]:
# Preview the held-out test split (user_id, item_id, interest_score).
test

Unnamed: 0,user_id,item_id,interest_score
1517914,38753,135245,0.000
1517915,101642,319500,0.835
1517916,13548,251184,0.000
1517917,130425,193445,0.490
1517918,93986,80733,0.235
...,...,...,...
1530838,141930,219928,0.450
1530839,53358,42887,0.290
1530840,151170,284652,0.135
1530841,141293,273421,0.240


In [8]:
# Number of distinct items that appear in the test split.
test.item_id.nunique()

9513

In [9]:
# Build the sparse training matrix: rows are indexed by user_id, columns by
# item_id, and each stored value is the interest_score. Raw ids are used
# directly as indices, so the shape is inferred from the largest id seen.
train_rows, train_cols = train["user_id"], train["item_id"]
csr_train = csr_matrix((train["interest_score"], (train_rows, train_cols)))
csr_train

<159613x321752 sparse matrix of type '<class 'numpy.float64'>'
	with 1520069 stored elements in Compressed Sparse Row format>

In [10]:
# Build the sparse matrix for the test split with the same layout as the
# training one: rows indexed by user_id, columns by item_id, values are
# interest_score.
# NOTE(review): the shape is inferred from the largest ids in `test`, so it
# need not match csr_train's shape; pass an explicit shape= if the two
# matrices ever have to be compared element-wise.
test_rows, test_cols = test["user_id"], test["item_id"]
csr_test = csr_matrix((test["interest_score"], (test_rows, test_cols)))
csr_test

<159589x321708 sparse matrix of type '<class 'numpy.float64'>'
	with 12929 stored elements in Compressed Sparse Row format>

In [11]:
# Ranking metrics evaluated at the cut-off Config.K, keyed by the label that
# is printed in the evaluation report.
precision, recall = PrecisionAtK(Config.K), RecallAtK(Config.K)
metrics = dict(precision=precision, recall=recall)

In [12]:
# Fit the BM25-weighted nearest-neighbour recommender on the training matrix.
# NOTE(review): K is presumably the number of neighbours kept per item; see
# the implicit docs to confirm.
bm = implicit.nearest_neighbours.BM25Recommender(K=Config.BASE_RECS_COUNT)
bm.fit(csr_train)

  0%|          | 0/321752 [00:00<?, ?it/s]

In [13]:
# Fit the cosine-similarity nearest-neighbour recommender on the training matrix.
# NOTE(review): K is presumably the number of neighbours kept per item; see
# the implicit docs to confirm.
cosine = implicit.nearest_neighbours.CosineRecommender(K=Config.BASE_RECS_COUNT)
cosine.fit(csr_train)

  0%|          | 0/321752 [00:00<?, ?it/s]

In [14]:
# Fit the TF-IDF-weighted nearest-neighbour recommender on the training matrix.
# NOTE(review): K is presumably the number of neighbours kept per item; see
# the implicit docs to confirm.
tf = implicit.nearest_neighbours.TFIDFRecommender(K=Config.BASE_RECS_COUNT)
tf.fit(csr_train)

  0%|          | 0/321752 [00:00<?, ?it/s]

In [15]:
# Fitted recommenders to compare, keyed by the label used in the report.
# Insertion order (cosine, tf, bm) is the order in which they are evaluated.
models = dict(cosine=cosine, tf=tf, bm=bm)

In [16]:
# Evaluate every fitted recommender on the users that appear in the test split.
#
# For each model: recommend Config.K unseen items per test user, then report
# every metric in `metrics`, averaged over the rows of `test`.

# Unique users to score. This is kept separate from the per-model prediction
# frame: the exploded predictions of one model must never become the input of
# the next one, otherwise the row count grows by a factor of K per model and
# the metrics are computed on duplicated recommendations.
eval_users = test["user_id"].drop_duplicates()

# Ground-truth items per user, grouped once instead of re-filtering `test`
# with a full boolean scan for every single test row.
actual_by_user = {user: items for user, items in test.groupby("user_id")["item_id"]}

for model_name, model in models.items():
    print(f"Model '{model_name}':")

    # Fresh frame per model: one row per user, holding the array of recommended ids.
    pred = eval_users.to_frame()
    pred["item_id"] = pred["user_id"].apply(
        lambda user_id: model.recommend(
            user_id,
            csr_train[user_id],
            N=Config.K,
            filter_already_liked_items=True,
        )[0]  # recommend() returns (item ids, scores); only the ids are needed
    )
    # One row per (user, recommended item).
    pred = pred.explode("item_id")
    predicted_by_user = {user: items for user, items in pred.groupby("user_id")["item_id"]}

    for metric_name, metric in metrics.items():
        # Compute the metric once per user ...
        per_user = {
            user: metric.calculate(predicted_by_user[user], actual_items)
            for user, actual_items in actual_by_user.items()
        }
        # ... then map it back onto the test rows, so a user still counts once
        # per test interaction in the mean. Use pd.Series(per_user).mean()
        # instead for a plain per-user average.
        metric_val = test["user_id"].map(per_user).mean()
        print(f"\tMetric {metric_name}@{Config.K}: {metric_val}")

Model 'cosine':
	Metric precision@10: 0.008113543197463036
	Metric recall@10: 0.021888777167607735
Model 'tf':
	Metric precision@10: 0.009064892876479167
	Metric recall@10: 0.02374506922422463
Model 'bm':


KeyboardInterrupt: 

In [None]:
# Persist the fitted TF-IDF recommender (`tf`) to the path configured in
# Config so it can be reloaded without refitting.
with Config.IMPLICIT_NN_PATH.open("wb") as sink:
    pickle.dump(tf, sink)