In [2]:
import implicit
import pickle
import pandas as pd

from scipy.sparse import csr_matrix
from src.config import Config
from src.dataset import Interactions
from src.metrics import PrecisionAtK, RecallAtK

In [3]:
# Read the preprocessed interactions from disk and wrap them in the project's
# Interactions container.
# NOTE(review): pickle.load executes arbitrary code embedded in the file, so
# this path must only ever point at a trusted, locally produced artifact.
with Config.PREPROCESSED_INTERACTIONS_PATH.open("rb") as pickled_interactions:
    raw_interactions = pickle.load(pickled_interactions)
interactions = Interactions(raw_interactions)

In [4]:
# Keep only the three columns the nearest-neighbour models need:
# who interacted, with what, and how strongly.
MODEL_COLUMNS = ["user_id", "item_id", "interest_score"]
interactions.train = interactions.train.loc[:, MODEL_COLUMNS]
interactions.test = interactions.test.loc[:, MODEL_COLUMNS]

In [5]:
# Sanity check: display the training split after the column projection.
interactions.train

Unnamed: 0,user_id,item_id,interest_score
0,126706,14433,0.400
1,127290,140952,0.290
2,66991,198453,0.445
3,46791,83486,0.615
4,79313,188770,0.940
...,...,...,...
1532993,153908,98585,0.220
1532994,154008,251969,0.020
1532995,154892,298192,0.840
1532996,156948,38118,0.890


In [6]:
# Sanity check: display the held-out test split after the column projection.
interactions.test

Unnamed: 0,user_id,item_id,interest_score
1517914,38753,135245,0.000
1517915,101642,319500,0.835
1517916,13548,251184,0.000
1517917,130425,193445,0.490
1517918,93986,80733,0.235
...,...,...,...
1530838,141930,219928,0.450
1530839,53358,42887,0.290
1530840,151170,284652,0.135
1530841,141293,273421,0.240


In [7]:
# Build the sparse user x item matrix from the training split.
# Rows are indexed by user_id, columns by item_id, values are interest scores.
# No explicit shape is passed, so scipy derives it from the ids present in
# the training data.
train_frame = interactions.train
row_ids = train_frame["user_id"]
col_ids = train_frame["item_id"]
scores = train_frame["interest_score"]

csr_train = csr_matrix((scores, (row_ids, col_ids)))
csr_train

<159613x321752 sparse matrix of type '<class 'numpy.float32'>'
	with 1505202 stored elements in Compressed Sparse Row format>

In [8]:
# Ranking metrics, both evaluated at the same cut-off Config.K.
# The mapping keys are the labels printed in the evaluation report.
precision, recall = PrecisionAtK(Config.K), RecallAtK(Config.K)
metrics = dict(precision=precision, recall=recall)

In [9]:
# Nearest-neighbour recommender with BM25 weighting, fitted on the training
# user x item matrix.
# K presumably is the number of neighbours kept per item — confirm against
# the implicit documentation.
bm = implicit.nearest_neighbours.BM25Recommender(K=Config.BASE_RECS_COUNT)
bm.fit(csr_train)

  0%|          | 0/321752 [00:00<?, ?it/s]

In [10]:
# Nearest-neighbour recommender with cosine similarity, fitted on the same
# training matrix and with the same K as the other two models.
cosine = implicit.nearest_neighbours.CosineRecommender(K=Config.BASE_RECS_COUNT)
cosine.fit(csr_train)

  0%|          | 0/321752 [00:00<?, ?it/s]

In [11]:
# Nearest-neighbour recommender with TF-IDF weighting, fitted on the same
# training matrix and with the same K as the other two models.
# NOTE(review): `tf` here is the TF-IDF model, not TensorFlow.
tf = implicit.nearest_neighbours.TFIDFRecommender(K=Config.BASE_RECS_COUNT)
tf.fit(csr_train)

  0%|          | 0/321752 [00:00<?, ?it/s]

In [12]:
# Registry of the fitted models; keys are the labels printed in the
# evaluation report, and insertion order is the evaluation order.
models = dict(cosine=cosine, tf=tf, bm=bm)

In [13]:
# Evaluate every fitted model on the test users and print each metric
# averaged over those users.
#
# The per-user item lists are grouped once and looked up by user id instead
# of boolean-masking the whole prediction and test frames for every
# (model, metric, user) combination, which made the cost quadratic in the
# number of test users. The Series handed to `metric.calculate`, the user
# order and the printed text are the same as with the mask-based version.

# Distinct test users in first-appearance order; reused for every model and metric.
test_users = interactions.test["user_id"].drop_duplicates()

# Ground-truth items per user, built once because the test split never
# changes inside the loop.
true_items_by_user = dict(
    iter(interactions.test.groupby("user_id", sort=False)["item_id"])
)

for model_name, model in models.items():
    print(f"Model '{model_name}':")

    # One row per test user holding that user's recommended item ids.
    pred = pd.DataFrame(test_users)
    pred["item_id"] = pred["user_id"].apply(
        lambda user_id: model.recommend(
            user_id,
            csr_train[user_id],
            N=Config.K,
            filter_already_liked_items=True,
        )[0]  # only element 0 of the result is kept and used as the item ids
    )
    # Long format: one row per (user, recommended item).
    pred = pred.explode("item_id")

    # Recommended items per user for this model, grouped once and shared by
    # all metrics.
    pred_items_by_user = dict(iter(pred.groupby("user_id", sort=False)["item_id"]))

    for metric_name, metric in metrics.items():
        metric_val = test_users.apply(
            lambda user: metric.calculate(
                pred_items_by_user[user],
                true_items_by_user[user],
            )
        ).mean()
        print(f"\tMetric {metric_name}@{Config.K}: {metric_val}")

Model 'cosine':
	Metric precision@10: 0.0019375330862890418
	Metric recall@10: 0.009836015405099575
Model 'tf':
	Metric precision@10: 0.0021598729486500823
	Metric recall@10: 0.010758316210845371
Model 'bm':
	Metric precision@10: 0.0011116993118051863
	Metric recall@10: 0.0061159272807039654


In [14]:
# Persist the fitted TF-IDF recommender. Only this one of the three models
# is saved — presumably because it has the highest precision and recall in
# the recorded evaluation output.
with Config.IMPLICIT_NN_PATH.open("wb") as model_file:
    pickle.dump(tf, model_file)