In [15]:
import os
import pathlib
import sys

import numpy as np
import pandas as pd

# Treat the parent of the current working directory as the project root and
# put it on sys.path so the `src.*` imports below can be resolved.
project_path = pathlib.Path.cwd().parent
sys.path.append(project_path.as_posix())

from src.data import prepare_df_min_len_count, train_test_split_stratify
from src.eval import common_metrics, diversity_at_k
from src.model.baseline import RecWord2Vec

In [16]:
# Data locations, expressed relative to the notebook's working directory.
DATA = pathlib.Path("../data")
# Raw parquet file; judging by its name it holds item titles/descriptions
# (not referenced by the cells visible in this part of the notebook).
DESC_DATA = DATA.joinpath("raw", "rec_aaa_title_desc.pq")

# Processed buyer stream parquet that the notebook loads next.
BUYER_DATA_PROCESSED = DATA.joinpath("processed", "rec_aaa_buyer_stream_processed.pq")

In [17]:
# Filtering thresholds handed to prepare_df_min_len_count further down.
# presumably the minimum number of events a session must contain — TODO confirm
MIN_LEN_SESSION: int = 2
# presumably the minimum number of occurrences an item must have — TODO confirm
MIN_COUNT_ITEM: int = 5

In [18]:
# Load the processed buyer stream from parquet and preview its first rows.
buyer_stream = pd.read_parquet(BUYER_DATA_PROCESSED)
buyer_stream.head()

Unnamed: 0_level_0,user_id,event_date,eid,category_id,microcat_id,internal_item_id,item_id,user_hash,x,conctact,session_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
50313,15850,2024-09-01 09:19:06,4813,29,2179585,1880802250341,4126988312,6,,True,1
50314,15850,2024-09-01 13:49:05,4813,29,2179585,1880802250341,4126988312,6,,True,1
54366,15850,2024-09-01 16:56:45,4813,29,2179579,1881675250689,4208903128,6,,True,1
17038,15850,2024-09-01 18:03:10,4813,106,19,1753602251163,4305669889,6,,True,1
65080,15850,2024-09-06 07:13:25,4675,27,1144483,1891517757037,4293355912,6,8055083000000.0,True,2


### Data split

In [19]:
# Filter the buyer stream using the session-length and item-count thresholds.
# NOTE(review): the per-iteration record counts printed by the helper suggest
# it re-applies the filters until the row count settles — confirm in src.data.
X = prepare_df_min_len_count(
    buyer_stream, min_len_session=MIN_LEN_SESSION, min_count_item=MIN_COUNT_ITEM
)

Итерация 0: 42438 записей
Итерация 1: 1549 записей
Итерация 2: 1402 записей


In [20]:
# Ratio forwarded to the split helper as `test_train_ratio`
# (presumably the share of data assigned to the test part — TODO confirm).
# NOTE(review): no random seed is passed here; if the helper shuffles, the
# split and every metric below may change between runs — verify in src.data.
test_train_ratio = 0.3
train_sessions, test_sessions = train_test_split_stratify(
    X, test_train_ratio=test_train_ratio
)

In [21]:
def _sessions_to_str_lists(sessions):
    """Return sessions as a list of lists of string item ids.

    Args:
        sessions: Either the object produced by the split cell (its
            ``"item_id"`` column holds one iterable of item ids per row) or a
            list of sessions that has already been converted by this helper.

    Returns:
        A list with one inner list per session, every item id cast to ``str``.
    """
    # The names below are rebound from a table to a plain list. Accepting the
    # already converted list keeps this cell safe to re-run: without the guard
    # a second execution would fail on ``list["item_id"]``.
    if not isinstance(sessions, list):
        sessions = sessions["item_id"].values.tolist()
    return [[str(item_id) for item_id in session] for session in sessions]


train_sessions = _sessions_to_str_lists(train_sessions)
test_sessions = _sessions_to_str_lists(test_sessions)

### Train

In [22]:
# Instantiate the RecWord2Vec baseline with verbose output enabled.
# NOTE(review): no seed is passed; if RecWord2Vec exposes one, set it so the
# metrics below are reproducible — TODO confirm against src.model.baseline.
model = RecWord2Vec(verbose=True)

In [23]:
# Fit the model on the training sessions (lists of string item ids).
# The epoch count is a literal here; consider moving it next to the other
# tunable constants if it needs to be adjusted.
model.fit(sessions=train_sessions, epochs=30)

Всего уникальных объявлений: 115, 7 наиболее встречаемых: 770555412, 4530492123, 4522091877, 4457644771, 4371360089, 4354204098, 4349556599
Время постройки словаря: 0.0 сек
Время обучения: 0.16 сек


### Eval

In [24]:
# Cut-off shared by the evaluation cells: used as `topn` for prediction and as
# `k` for the metric helpers.
K: int = 5

In [25]:
# Ask the model for topn=K recommendations per test session.
predict_rec = model.predict(test_sessions, topn=K)
# Reference items per session: up to K items immediately preceding the final
# item (the final item itself is excluded by the `-1` upper bound).
# NOTE(review): confirm this window matches what model.predict conditions on.
true_rec = [session[-(K + 1) : -1] for session in test_sessions]

In [26]:
# Look up the embedding vector of every recommended item and stack the
# per-session lists of vectors into a single array.
# presumably shaped (n_sessions, K, dim) when each session receives exactly K
# recommendations — TODO confirm
recs_emb = np.array(
    [list(map(model.model.wv.get_vector, recs)) for recs in predict_rec]
)

In [27]:
# Print what common_metrics returns for the reference vs. predicted item lists
# at cut-off K (the recorded output lists precision, recall and ndcg).
print(common_metrics(true_rec, predict_rec, k=K))

precision: 0.035
recall: 0.045
ndcg: 0.1061521163675112


In [28]:
# Print diversity_at_k computed on the recommendation embeddings at cut-off K.
print(f"diversity: {diversity_at_k(recs_emb, k=K)}")

diversity: 0.002120077610015869
