In [1]:
import pickle
import pandas as pd

from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
from typing import Callable
from src.config import Config
from src.dataset import DatasetProcessor
from src.metrics import PrecisionAtK, RecallAtK
from src.optimizer import OptunaMaximizer

In [2]:
with Config.PREPROCESSED_INTERACTIONS_PATH.open("rb") as file_object:
    interactions = pickle.load(file_object)

In [3]:
interactions

Unnamed: 0,user_id,item_id,progress,rating,start_date,interest_score
0,126706,14433,80,,2018-01-01,0.400
1,127290,140952,58,,2018-01-01,0.290
2,66991,198453,89,,2018-01-01,0.445
3,46791,83486,23,5.0,2018-01-01,0.615
4,79313,188770,88,5.0,2018-01-01,0.940
...,...,...,...,...,...,...
1532993,153908,98585,44,,2019-02-11,0.220
1532994,154008,251969,4,,2018-04-08,0.020
1532995,154892,298192,68,5.0,2019-02-16,0.840
1532996,156948,38118,78,5.0,2018-08-19,0.890


In [4]:
train, test = DatasetProcessor.split_train_test(interactions, "start_date", Config.TEST_DAYS)
train, valid = DatasetProcessor.split_train_test(train, "start_date", Config.TEST_DAYS * 2)
train = train[["user_id", "item_id", "interest_score"]]
test = test[["user_id", "item_id", "interest_score"]]
valid = valid[["user_id", "item_id", "interest_score"]]

In [5]:
train

Unnamed: 0,user_id,item_id,interest_score
0,126706,14433,0.400
1,127290,140952,0.290
2,66991,198453,0.445
3,46791,83486,0.615
4,79313,188770,0.940
...,...,...,...
1532993,153908,98585,0.220
1532994,154008,251969,0.020
1532995,154892,298192,0.840
1532996,156948,38118,0.890


In [6]:
test

Unnamed: 0,user_id,item_id,interest_score
1517914,38753,135245,0.000
1517915,101642,319500,0.835
1517916,13548,251184,0.000
1517917,130425,193445,0.490
1517918,93986,80733,0.235
...,...,...,...
1530838,141930,219928,0.450
1530839,53358,42887,0.290
1530840,151170,284652,0.135
1530841,141293,273421,0.240


In [7]:
valid

Unnamed: 0,user_id,item_id,interest_score
1503047,22032,287219,0.275
1503048,84214,121609,1.000
1503049,28992,11482,0.110
1503050,23345,281921,1.000
1503051,49466,2880,0.010
...,...,...,...
1517909,138587,291806,0.000
1517910,158991,99669,0.815
1517911,77232,142149,0.020
1517912,17843,174535,0.060


In [8]:
csr_train = csr_matrix(
    (train["interest_score"], (train["user_id"], train["item_id"]))
)
csr_train

<159613x321752 sparse matrix of type '<class 'numpy.float32'>'
	with 1505202 stored elements in Compressed Sparse Row format>

In [9]:
class Objective:
    def __init__(self, train: csr_matrix, valid: pd.DataFrame, metric: Callable):
        self.train = train
        self.valid = valid
        self.metric = metric

    def __call__(self, trial) -> float:
        search_space = {
            "factors": trial.suggest_int("factors", 4, 256),
            "regularization": trial.suggest_float("regularization", 1e-8, 0.1),
            "iterations": trial.suggest_int("iterations", 8, 64)
        }
        model = AlternatingLeastSquares(**search_space)
        model.fit(self.train)
        predictions = pd.DataFrame(self.valid["user_id"].drop_duplicates())
        predictions["item_id"] = predictions["user_id"].apply(
            lambda user_id: 
                model.recommend(
                    user_id, 
                    self.train[user_id],
                    N=Config.K,
                    filter_already_liked_items=True
                )[0]
        )
        predictions = predictions.explode("item_id")
        return self.valid["user_id"].apply(
            lambda user: 
                self.metric.calculate(
                    predictions[predictions["user_id"] == user]["item_id"],
                    test[test["user_id"] == user]["item_id"]
                )
        ).mean()

In [10]:
recall = RecallAtK(Config.K)
precision = PrecisionAtK(Config.K)

In [11]:
precision_objective = Objective(csr_train, valid, precision)
precision_optimizer = OptunaMaximizer(precision_objective, "ALS precision maximize")
precision_best_trial = precision_optimizer.optimize(Config.ITERATIONS, Config.STOP_ITERATION)

[32m[I 2023-02-21 21:14:18,412][0m A new study created in memory with name: ALS precision maximize[0m
  self._init_valid()


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:14:56,715][0m Trial 0 finished with value: 0.0024752808232999333 and parameters: {'factors': 59, 'regularization': 0.03583772580478882, 'iterations': 60}. Best is trial 0 with value: 0.0024752808232999333.[0m


  0%|          | 0/16 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:16:38,909][0m Trial 1 finished with value: 0.002717427860361891 and parameters: {'factors': 248, 'regularization': 0.014791039334287047, 'iterations': 16}. Best is trial 1 with value: 0.002717427860361891.[0m


  0%|          | 0/29 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:18:05,232][0m Trial 2 finished with value: 0.002999932736934167 and parameters: {'factors': 192, 'regularization': 0.06735920132829787, 'iterations': 29}. Best is trial 2 with value: 0.002999932736934167.[0m


  0%|          | 0/31 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:19:53,475][0m Trial 3 finished with value: 0.0027779646196273776 and parameters: {'factors': 239, 'regularization': 0.06819102975013004, 'iterations': 31}. Best is trial 2 with value: 0.002999932736934167.[0m


  0%|          | 0/53 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:21:25,506][0m Trial 4 finished with value: 0.002576175422075749 and parameters: {'factors': 78, 'regularization': 0.08056762803262783, 'iterations': 53}. Best is trial 2 with value: 0.002999932736934167.[0m


  0%|          | 0/53 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:24:15,305][0m Trial 5 finished with value: 0.0029326696710836224 and parameters: {'factors': 145, 'regularization': 0.02778078706857992, 'iterations': 53}. Best is trial 2 with value: 0.002999932736934167.[0m


  0%|          | 0/33 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:24:29,671][0m Trial 6 finished with value: 0.0025559965023205854 and parameters: {'factors': 117, 'regularization': 0.005396825793810861, 'iterations': 33}. Best is trial 2 with value: 0.002999932736934167.[0m


  0%|          | 0/34 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:24:38,391][0m Trial 7 finished with value: 0.0022734916257483063 and parameters: {'factors': 18, 'regularization': 0.08316781329657846, 'iterations': 34}. Best is trial 2 with value: 0.002999932736934167.[0m


  0%|          | 0/37 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:24:51,856][0m Trial 8 finished with value: 0.0025963543418309116 and parameters: {'factors': 99, 'regularization': 0.0980265463340844, 'iterations': 37}. Best is trial 2 with value: 0.002999932736934167.[0m


  0%|          | 0/39 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:25:05,866][0m Trial 9 finished with value: 0.002993206430349111 and parameters: {'factors': 104, 'regularization': 0.03783694371020796, 'iterations': 39}. Best is trial 2 with value: 0.002999932736934167.[0m


  0%|          | 0/8 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:25:21,115][0m Trial 10 finished with value: 0.0028452276854779193 and parameters: {'factors': 183, 'regularization': 0.05447786856628104, 'iterations': 8}. Best is trial 2 with value: 0.002999932736934167.[0m


  0%|          | 0/21 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:25:37,995][0m Trial 11 finished with value: 0.002912490751328462 and parameters: {'factors': 178, 'regularization': 0.0488676091827696, 'iterations': 21}. Best is trial 2 with value: 0.002999932736934167.[0m


  0%|          | 0/44 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:25:58,472][0m Trial 12 finished with value: 0.002650164794511349 and parameters: {'factors': 187, 'regularization': 0.05295882610394445, 'iterations': 44}. Best is trial 2 with value: 0.002999932736934167.[0m
EarlyStopping Exceeded: No new best scores on iters 10


In [12]:
recall_objective = Objective(csr_train, valid, recall)
recall_optimizer = OptunaMaximizer(recall_objective, "ALS recall maximize")
recall_best_trial = recall_optimizer.optimize(Config.ITERATIONS, Config.STOP_ITERATION)

[32m[I 2023-02-21 21:25:58,563][0m A new study created in memory with name: ALS recall maximize[0m
  self._init_valid()


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:26:10,250][0m Trial 0 finished with value: 0.007753072489322144 and parameters: {'factors': 65, 'regularization': 0.0948623418988534, 'iterations': 19}. Best is trial 0 with value: 0.007753072489322144.[0m


  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:26:27,768][0m Trial 1 finished with value: 0.010017261275080595 and parameters: {'factors': 197, 'regularization': 0.09219463578048416, 'iterations': 17}. Best is trial 1 with value: 0.010017261275080595.[0m


  0%|          | 0/62 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:26:41,931][0m Trial 2 finished with value: 0.007029499950062269 and parameters: {'factors': 86, 'regularization': 0.0817107907573585, 'iterations': 62}. Best is trial 1 with value: 0.010017261275080595.[0m


  0%|          | 0/20 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:26:52,247][0m Trial 3 finished with value: 0.007102178574824274 and parameters: {'factors': 52, 'regularization': 0.027884326559990107, 'iterations': 20}. Best is trial 1 with value: 0.010017261275080595.[0m


  0%|          | 0/15 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:27:09,774][0m Trial 4 finished with value: 0.009188920986862725 and parameters: {'factors': 205, 'regularization': 0.008972640602755941, 'iterations': 15}. Best is trial 1 with value: 0.010017261275080595.[0m


  0%|          | 0/16 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:27:22,864][0m Trial 5 finished with value: 0.008396075368821965 and parameters: {'factors': 110, 'regularization': 0.07978480706655185, 'iterations': 16}. Best is trial 1 with value: 0.010017261275080595.[0m


  0%|          | 0/36 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:27:41,416][0m Trial 6 finished with value: 0.01060341084892103 and parameters: {'factors': 172, 'regularization': 0.033765587757901026, 'iterations': 36}. Best is trial 6 with value: 0.01060341084892103.[0m


  0%|          | 0/22 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:27:53,840][0m Trial 7 finished with value: 0.007706468793697123 and parameters: {'factors': 95, 'regularization': 0.06229437908321385, 'iterations': 22}. Best is trial 6 with value: 0.01060341084892103.[0m


  0%|          | 0/35 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:28:05,989][0m Trial 8 finished with value: 0.007884301411912903 and parameters: {'factors': 78, 'regularization': 0.011191689291008946, 'iterations': 35}. Best is trial 6 with value: 0.01060341084892103.[0m


  0%|          | 0/20 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:28:26,429][0m Trial 9 finished with value: 0.009590461107243235 and parameters: {'factors': 235, 'regularization': 0.05849662009934217, 'iterations': 20}. Best is trial 6 with value: 0.01060341084892103.[0m


  0%|          | 0/50 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:28:35,764][0m Trial 10 finished with value: 0.005117779482674535 and parameters: {'factors': 9, 'regularization': 0.0375343482955951, 'iterations': 50}. Best is trial 6 with value: 0.01060341084892103.[0m


  0%|          | 0/34 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:28:54,783][0m Trial 11 finished with value: 0.009156730805348541 and parameters: {'factors': 174, 'regularization': 0.04391978907583533, 'iterations': 34}. Best is trial 6 with value: 0.01060341084892103.[0m


  0%|          | 0/8 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:29:10,277][0m Trial 12 finished with value: 0.010924538604971768 and parameters: {'factors': 166, 'regularization': 0.09963978621255527, 'iterations': 8}. Best is trial 12 with value: 0.010924538604971768.[0m


  0%|          | 0/8 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:29:25,302][0m Trial 13 finished with value: 0.009288141288544854 and parameters: {'factors': 149, 'regularization': 0.06497280287627577, 'iterations': 8}. Best is trial 12 with value: 0.010924538604971768.[0m


  0%|          | 0/45 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:29:45,990][0m Trial 14 finished with value: 0.008627878589874955 and parameters: {'factors': 146, 'regularization': 0.029640300935685218, 'iterations': 45}. Best is trial 12 with value: 0.010924538604971768.[0m


  0%|          | 0/27 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:30:09,935][0m Trial 15 finished with value: 0.009770025830764377 and parameters: {'factors': 248, 'regularization': 0.05447601459215451, 'iterations': 27}. Best is trial 12 with value: 0.010924538604971768.[0m


  0%|          | 0/47 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:30:31,615][0m Trial 16 finished with value: 0.009882494918144333 and parameters: {'factors': 177, 'regularization': 0.07461224682801348, 'iterations': 47}. Best is trial 12 with value: 0.010924538604971768.[0m


  0%|          | 0/63 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:30:49,959][0m Trial 17 finished with value: 0.008841679049185612 and parameters: {'factors': 124, 'regularization': 0.0971354040147796, 'iterations': 63}. Best is trial 12 with value: 0.010924538604971768.[0m


  0%|          | 0/41 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:31:14,123][0m Trial 18 finished with value: 0.008704634192291422 and parameters: {'factors': 223, 'regularization': 0.04745639476905117, 'iterations': 41}. Best is trial 12 with value: 0.010924538604971768.[0m


  0%|          | 0/56 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:31:35,930][0m Trial 19 finished with value: 0.008304695573478777 and parameters: {'factors': 167, 'regularization': 0.07218739676794197, 'iterations': 56}. Best is trial 12 with value: 0.010924538604971768.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:31:56,869][0m Trial 20 finished with value: 0.008832827113246835 and parameters: {'factors': 199, 'regularization': 0.08793453606429749, 'iterations': 30}. Best is trial 12 with value: 0.010924538604971768.[0m


  0%|          | 0/8 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:32:14,272][0m Trial 21 finished with value: 0.011404314503460246 and parameters: {'factors': 200, 'regularization': 0.09807384401645601, 'iterations': 8}. Best is trial 21 with value: 0.011404314503460246.[0m


  0%|          | 0/8 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:32:29,483][0m Trial 22 finished with value: 0.008449791615527806 and parameters: {'factors': 144, 'regularization': 0.08635397057276926, 'iterations': 8}. Best is trial 21 with value: 0.011404314503460246.[0m


  0%|          | 0/11 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:32:48,383][0m Trial 23 finished with value: 0.010431489655610178 and parameters: {'factors': 213, 'regularization': 0.09496149438746018, 'iterations': 11}. Best is trial 21 with value: 0.011404314503460246.[0m


  0%|          | 0/29 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:33:07,646][0m Trial 24 finished with value: 0.010246312397048915 and parameters: {'factors': 182, 'regularization': 0.09766598116763801, 'iterations': 29}. Best is trial 21 with value: 0.011404314503460246.[0m


  0%|          | 0/13 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:33:23,711][0m Trial 25 finished with value: 0.009509024752517036 and parameters: {'factors': 160, 'regularization': 0.09946132054890841, 'iterations': 13}. Best is trial 21 with value: 0.011404314503460246.[0m


  0%|          | 0/40 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:33:40,205][0m Trial 26 finished with value: 0.008871343225954112 and parameters: {'factors': 127, 'regularization': 0.0860939450510295, 'iterations': 40}. Best is trial 21 with value: 0.011404314503460246.[0m


  0%|          | 0/24 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:34:03,353][0m Trial 27 finished with value: 0.010511440981609802 and parameters: {'factors': 252, 'regularization': 0.07566923011478209, 'iterations': 24}. Best is trial 21 with value: 0.011404314503460246.[0m


  0%|          | 0/12 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:34:22,723][0m Trial 28 finished with value: 0.010078919085443595 and parameters: {'factors': 226, 'regularization': 0.06772303347100911, 'iterations': 12}. Best is trial 21 with value: 0.011404314503460246.[0m


  0%|          | 0/53 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:34:46,233][0m Trial 29 finished with value: 0.008596569234223101 and parameters: {'factors': 190, 'regularization': 0.09985623744814402, 'iterations': 53}. Best is trial 21 with value: 0.011404314503460246.[0m


  0%|          | 0/8 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:35:01,903][0m Trial 30 finished with value: 0.009020726924473475 and parameters: {'factors': 159, 'regularization': 0.09053485732499203, 'iterations': 8}. Best is trial 21 with value: 0.011404314503460246.[0m


  0%|          | 0/25 [00:00<?, ?it/s]

[32m[I 2023-02-21 21:35:25,758][0m Trial 31 finished with value: 0.010030910436170402 and parameters: {'factors': 251, 'regularization': 0.07871978908718259, 'iterations': 25}. Best is trial 21 with value: 0.011404314503460246.[0m
EarlyStopping Exceeded: No new best scores on iters 10
