In [1]:
import os
os.chdir("../")
import pickle
import pandas as pd
import numpy as np

from lightfm import LightFM
from src.config import Config
from src.dataset import DatasetProcessor
from scipy.sparse import csr_matrix
from src.metrics import PrecisionAtK, RecallAtK

In [3]:
with Config.PREPROCESSED_INTERACTIONS_PATH.open("rb") as file_object:
    interactions = pickle.load(file_object)

In [4]:
interactions

Unnamed: 0,user_id,item_id,progress,rating,start_date,interest_score
0,126706,14433,80,,2018-01-01,0.400
1,127290,140952,58,,2018-01-01,0.290
2,66991,198453,89,,2018-01-01,0.445
3,46791,83486,23,5.0,2018-01-01,0.615
4,79313,188770,88,5.0,2018-01-01,0.940
...,...,...,...,...,...,...
1532993,153908,98585,44,,2019-02-11,0.220
1532994,154008,251969,4,,2018-04-08,0.020
1532995,154892,298192,68,5.0,2019-02-16,0.840
1532996,156948,38118,78,5.0,2018-08-19,0.890


In [5]:
train, test = DatasetProcessor.split_train_test(interactions, "start_date", Config.TEST_DAYS)
train, valid = DatasetProcessor.split_train_test(train, "start_date", Config.TEST_DAYS * 2)
train = train[["user_id", "item_id", "interest_score"]]
test = test[["user_id", "item_id", "interest_score"]]
valid = valid[["user_id", "item_id", "interest_score"]]

In [6]:
train

Unnamed: 0,user_id,item_id,interest_score
0,126706,14433,0.400
1,127290,140952,0.290
2,66991,198453,0.445
3,46791,83486,0.615
4,79313,188770,0.940
...,...,...,...
1532993,153908,98585,0.220
1532994,154008,251969,0.020
1532995,154892,298192,0.840
1532996,156948,38118,0.890


In [21]:
test

Unnamed: 0,level_0,index,user_id,item_id,interest_score
0,0,1517914,38753,135245,0.000
1,1,1517915,101642,319500,0.835
2,2,1517916,13548,251184,0.000
3,3,1517917,130425,193445,0.490
4,4,1517918,93986,80733,0.235
...,...,...,...,...,...
12924,12924,1530838,141930,219928,0.450
12925,12925,1530839,53358,42887,0.290
12926,12926,1530840,151170,284652,0.135
12927,12927,1530841,141293,273421,0.240


In [8]:
valid

Unnamed: 0,user_id,item_id,interest_score
1503047,22032,287219,0.275
1503048,84214,121609,1.000
1503049,28992,11482,0.110
1503050,23345,281921,1.000
1503051,49466,2880,0.010
...,...,...,...
1517909,138587,291806,0.000
1517910,158991,99669,0.815
1517911,77232,142149,0.020
1517912,17843,174535,0.060


In [9]:
csr_train = csr_matrix(
    (train["interest_score"], (train["user_id"], train["item_id"]))
)
csr_train

<159613x321752 sparse matrix of type '<class 'numpy.float32'>'
	with 1505202 stored elements in Compressed Sparse Row format>

In [10]:
model = LightFM(no_components=30)
model.fit(csr_train, epochs=20)

<lightfm.lightfm.LightFM at 0x7f1efc433250>

In [11]:
predictions = pd.DataFrame(test["user_id"].drop_duplicates())
predictions["item_id"] = predictions["user_id"].apply(
    lambda user_id:
        np.argpartition(
            model.predict(
                user_id,
                train["item_id"].unique()
            ),
            -Config.K
        )[-Config.K:]
)

In [52]:
items = pd.Series(train["item_id"].unique(), name="item_id")
items

0         14433
1        140952
2        198453
3         83486
4        188770
          ...  
59594    278387
59595    270122
59596    147325
59597    305201
59598    156866
Name: item_id, Length: 59599, dtype: int64

In [12]:
predictions

Unnamed: 0,user_id,item_id
1517914,38753,"[2441, 17253, 1965, 651, 4681, 36240, 7116, 23..."
1517915,101642,"[205, 29651, 1230, 2605, 5432, 20530, 4525, 70..."
1517916,13548,"[33995, 26437, 34407, 27696, 9017, 32511, 1833..."
1517917,130425,"[26437, 34407, 9017, 4525, 22256, 22800, 2125,..."
1517918,93986,"[2367, 2692, 29194, 4681, 2642, 1075, 4525, 47..."
...,...,...
1530838,141930,"[26437, 33995, 2125, 27696, 32511, 22256, 2280..."
1530839,53358,"[33995, 32511, 26437, 2125, 34407, 22256, 2769..."
1530840,151170,"[26437, 34407, 32511, 27696, 2125, 22800, 2225..."
1530841,141293,"[2876, 34407, 1965, 2642, 27841, 27696, 9017, ..."


In [13]:
predictions = predictions.explode("item_id")

In [14]:
precision = PrecisionAtK(Config.K)
recall = RecallAtK(Config.K)
metrics = {
    "precision": precision,
    "recall": recall
}

In [56]:
for metric_name, metric in metrics.items():
    metric_val = predictions["user_id"].apply(
        lambda user: 
            metric.calculate(
                pd.Series(items.iloc[predictions[predictions["user_id"] == user]["item_id"]]),
                test[test["user_id"] == user]["item_id"]
        )
    ).mean()
    print(f"\tMetric {metric_name}@{Config.K}: {metric_val}")

	Metric precision@10: 0.000942297511911064
	Metric recall@10: 0.007194282689253573


# To Do
- Add embeddings
- Optimize hyperparameters