In [1]:
import os
import pathlib
import sys

import pandas as pd
from implicit.evaluation import mean_average_precision_at_k, precision_at_k
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

# Put the parent of the current working directory on sys.path so that the
# `src.*` imports that follow can be resolved.
project_path = pathlib.Path.cwd().parent
sys.path.append(project_path.as_posix())

from src.model.collaborative import ALS
from src.model.collaborative.utils import create_mappings, save_csr_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Raw input files, expressed relative to the notebook's working directory.
DATA = pathlib.Path("../data")
DESC_DATA = DATA.joinpath("raw", "rec_aaa_title_desc.pq")

BUYER_DATA = DATA.joinpath("raw", "rec_aaa_buyer_stream.pq")
BUYER_DATA_EXTENDED = DATA.joinpath("raw", "buyer_stream_extended.parquet")

In [3]:
# Output locations: trained artefacts next to the model code, matrices under
# the processed data directory.
TRAINED = pathlib.Path("..", "src", "model", "collaborative", "trained")
DATA_COLLABORATIVE = DATA.joinpath("processed", "collaborative")

MAPPING = TRAINED.joinpath("mapping.pkl")
MODEL = TRAINED.joinpath("als_model.pkl")
TRAIN_MATRIX = DATA_COLLABORATIVE.joinpath("train_matrix.pkl")
EVAL_MATRIX = DATA_COLLABORATIVE.joinpath("eval_matrix.pkl")

In [6]:
# Read the base buyer event stream and the extended one from their parquet files.
buyer_stream = pd.read_parquet(path=BUYER_DATA)
buyer_stream_extended = pd.read_parquet(path=BUYER_DATA_EXTENDED)

In [9]:
# Show the smallest and largest event_date in the base buyer stream, one per line.
base_event_dates = buyer_stream.event_date
print(base_event_dates.min(), base_event_dates.max(), sep="\n")

2024-09-01 00:01:45
2024-10-25 23:59:00


In [None]:
# Show the smallest and largest event_date in the extended buyer stream, one per line.
extended_event_dates = buyer_stream_extended.event_date
print(extended_event_dates.min(), extended_event_dates.max(), sep="\n")

2025-02-25 00:00:00
2025-02-28 23:59:59


In [5]:
# Rebind buyer_stream to the frame read from BUYER_DATA_PROCESSED; this
# overwrites the frame that was read from BUYER_DATA earlier in the notebook.
# NOTE(review): BUYER_DATA_PROCESSED is not defined in any cell visible here —
# confirm it is declared alongside the other path constants, otherwise this
# cell raises NameError on a fresh kernel.
buyer_stream = pd.read_parquet(BUYER_DATA_PROCESSED)

In [7]:
# Display the buyer event stream for a quick visual check.
buyer_stream

Unnamed: 0,user_id,event_date,eid,category_id,microcat_id,internal_item_id,item_id,user_hash,x,conctact,session_id
50313,15850,2024-09-01 09:19:06,4813,29,2179585,1880802250341,4126988312,6,,True,1
50314,15850,2024-09-01 13:49:05,4813,29,2179585,1880802250341,4126988312,6,,True,1
54366,15850,2024-09-01 16:56:45,4813,29,2179579,1881675250689,4208903128,6,,True,1
17038,15850,2024-09-01 18:03:10,4813,106,19,1753602251163,4305669889,6,,True,1
65080,15850,2024-09-06 07:13:25,4675,27,1144483,1891517757037,4293355912,6,8.055083e+12,True,2
...,...,...,...,...,...,...,...,...,...,...,...
65079,1702546250012,2024-10-16 23:38:16,857,101,3841,1930517250112,4359474466,39,8.264518e+12,True,19850
11942,1702546250012,2024-10-16 23:39:28,857,27,1178044,1930266250050,4168550717,39,8.264513e+12,True,19850
1339,1702546250012,2024-10-16 23:43:17,857,9,21777,1669856001035,3792232410,39,8.264666e+12,True,19850
40876,1702546250012,2024-10-16 23:46:00,857,9,21753,1912986503800,4357178651,39,,True,19850


### Data split

In [8]:
# Collapse the event stream to one row per (user_id, item_id) pair and record
# how many events each pair produced in `interaction_count`.
pair_event_counts = buyer_stream.groupby(["user_id", "item_id"]).size()
interactions_df = pair_event_counts.rename("interaction_count").reset_index()

In [9]:
# Display the per-(user, item) interaction counts.
interactions_df

Unnamed: 0,user_id,item_id,interaction_count
0,15850,2046139836,2
1,15850,2408812176,3
2,15850,2828827745,1
3,15850,3240409049,1
4,15850,3558668724,1
...,...,...,...
46392,1702546250012,4168550717,1
46393,1702546250012,4357178651,1
46394,1702546250012,4359474466,1
46395,1702546250012,4420513948,1


In [10]:
# Number of rows per item and per user in interactions_df. Because that frame
# holds one row per (user, item) pair, these are distinct-partner counts.
item_counts = interactions_df.loc[:, "item_id"].value_counts()
user_counts = interactions_df.loc[:, "user_id"].value_counts()

In [6]:
# Rebuild the per-(user, item) interaction counts and the per-item / per-user
# row counts from the current buyer_stream (same computation as the cells above).
pair_sizes = buyer_stream.groupby(["user_id", "item_id"]).size()
interactions_df = pair_sizes.rename("interaction_count").reset_index()

item_counts = interactions_df.loc[:, "item_id"].value_counts()
user_counts = interactions_df.loc[:, "user_id"].value_counts()

# Keep only rows whose item AND user both reach their minimum activity threshold.
# NOTE(review): MIN_ITEM_INTERACTIONS and MIN_USER_INTERACTIONS are not defined
# in any cell visible here — confirm they are set before this cell runs.
frequent_item_ids = item_counts[item_counts >= MIN_ITEM_INTERACTIONS].index
active_user_ids = user_counts[user_counts >= MIN_USER_INTERACTIONS].index
keep_row = interactions_df["item_id"].isin(frequent_item_ids) & interactions_df[
    "user_id"
].isin(active_user_ids)
filtered_interactions = interactions_df[keep_row]

In [12]:
# Hold out 20% of the filtered interactions for evaluation, stratified by user,
# with a fixed seed so the split is repeatable.
split_frames = train_test_split(
    filtered_interactions,
    stratify=filtered_interactions["user_id"],
    test_size=0.2,
    random_state=42,
)
train_data, eval_data = split_frames

# Drop evaluation rows whose item never appears in the training split.
train_item_ids = train_data["item_id"].unique()
eval_data = eval_data.loc[eval_data["item_id"].isin(train_item_ids)]

In [13]:
# Display the training split.
train_data

Unnamed: 0,user_id,item_id,interaction_count
19885,452514250022,4169368198,2
8350,7054500081,4201542389,1
4020,372399197,4372012068,1
5576,380018120,4387712026,1
28914,766398000061,2240201661,1
...,...,...,...
13284,217631250094,4016121656,1
31707,862495000018,4489910755,2
33577,936693000004,4293704977,1
20002,453743500173,3861267764,1


### Mapping

In [33]:
# Build the user and item index mappings from the training split. The helper
# returns a frame (rebound to train_data) plus the two mappings.
# NOTE(review): save_path=MAPPING presumably persists the mappings to disk, and
# the returned frame presumably carries the user_idx / item_idx columns used
# when the sparse matrices are built — confirm in create_mappings.
train_data, user_to_index, item_to_index = create_mappings(
    train_data, save_path=MAPPING
)
# Pass the training mappings in for the evaluation split instead of creating new
# ones (presumably so both splits share one index space — TODO confirm).
eval_data, _, _ = create_mappings(eval_data, user_to_index, item_to_index)

In [34]:
# Build the sparse interaction matrices: rows are user_idx, columns are
# item_idx, values are interaction_count.
train_matrix = csr_matrix(
    (
        train_data["interaction_count"].values,
        (train_data["user_idx"].values, train_data["item_idx"].values),
    )
)
# Pin the evaluation matrix to the training matrix's shape. Without an explicit
# shape scipy infers the dimensions from the largest index present in
# eval_data, so the two matrices could silently end up with different sizes.
# If an evaluation index falls outside the training shape, scipy raises here
# instead of producing a mismatched matrix.
eval_matrix = csr_matrix(
    (
        eval_data["interaction_count"].values,
        (eval_data["user_idx"].values, eval_data["item_idx"].values),
    ),
    shape=train_matrix.shape,
)
# Hand both matrices to the project helper together with their target paths.
save_csr_matrix(
    train_matrix, eval_matrix, save_path_eval=EVAL_MATRIX, save_path_train=TRAIN_MATRIX
)

### Train

In [35]:
# Instantiate the project's ALS wrapper with no arguments.
als = ALS()

In [36]:
# Fit the model on the training interaction matrix with a progress bar.
# NOTE(review): save_path=MODEL presumably persists the fitted model — confirm in ALS.train.
als.train(train_matrix, show_progress=True, save_path=MODEL)

100%|██████████| 10/10 [00:01<00:00,  7.40it/s]


### Eval

In [37]:
# Score the fitted model on the held-out interactions at a cutoff of 5.
# Both metrics take the same (model, train, eval) triple.
metric_inputs = (als.model, train_matrix, eval_matrix)
precision = precision_at_k(*metric_inputs, K=5)
map_score = mean_average_precision_at_k(*metric_inputs, K=5)
print(f"precision_at_k: {precision}\nmean_average_precision_at_k: {map_score}")

100%|██████████| 66/66 [00:00<00:00, 4148.48it/s]
100%|██████████| 66/66 [00:00<00:00, 1995.39it/s]

precision_at_k: 0.11235955056179775
mean_average_precision_at_k: 0.027954545454545454





In [38]:
# Load the mapping stored at MAPPING into the ALS wrapper.
# NOTE(review): presumably required so the lookups below accept raw ids — confirm.
als.load_mapping(MAPPING)

In [39]:
sample_item_id = 4802219291

# Query the model only when the sample item occurs in the training split;
# otherwise report that it is missing.
train_items = set(train_data["item_id"].unique())
if sample_item_id in train_items:
    similar_items = als.get_similar_items(sample_item_id, verbose=True)
    print(f"\nТовары, похожие на {sample_item_id}:")
    for item_id, score in similar_items:
        print(f"{item_id}: {score:.3f}")
else:
    print(f"Товар {sample_item_id} отсутствует в обучающих данных")


Товары, похожие на 4802219291:
3742813120: 1.000
4683851490: 1.000
4452170017: 1.000
3453494208: 1.000
4217601482: 1.000


In [40]:
# Ask the model for five recommendations for one sample user and print each
# (item, score) pair.
sample_user_id = 15850
recommend_options = dict(train_matrix=train_matrix, N=5, verbose=True)
user_recommendations = als.get_user_recommendations(
    sample_user_id, **recommend_options
)

print(f"\nРекомендации для пользователя {sample_user_id}:")
for item_id, score in user_recommendations:
    print(f"{item_id}: {score:.3f}")


Рекомендации для пользователя 15850:
2229440076: 0.002
3661786505: 0.002
4191096072: 0.002
3675416651: 0.002
4370010818: 0.002


In [41]:
# Same sample user, this time through the similar-users recommendation method;
# print each (item, score) pair.
sample_user_id = 15850
similar_user_options = dict(train_matrix=train_matrix, N=5, verbose=True)
user_recommendations = als.get_user_recommendations_with_similar_users(
    sample_user_id, **similar_user_options
)

print(f"\nРекомендации для пользователя {sample_user_id}:")
for item_id, score in user_recommendations:
    print(f"{item_id}: {score:.3f}")


Рекомендации для пользователя 15850:
4191096072: 0.005
2229440076: 0.005
3661786505: 0.004
4181371927: 0.004
4490385824: 0.004
