In [1]:
import os
import pathlib
import sys

import pandas as pd

project_path = pathlib.Path(os.getcwd()).parent
sys.path.append(project_path.as_posix())

from collections import defaultdict
from itertools import combinations

from sklearn.model_selection import train_test_split

from src.data import prepare_df_min_len_count

In [2]:
DATA = pathlib.Path("../data")
DESC_DATA = DATA / "raw" / "rec_aaa_title_desc.pq"

BUYER_DATA_PROCESSED = DATA / "processed" / "rec_aaa_buyer_stream_processed.pq"

In [8]:
MIN_LEN_SESSION = 2
MIN_COUNT_ITEM = 1

In [3]:
buyer_stream = pd.read_parquet(BUYER_DATA_PROCESSED)
buyer_stream.head()

Unnamed: 0,user_id,event_date,eid,category_id,microcat_id,internal_item_id,item_id,user_hash,x,conctact,session_id
50313,15850,2024-09-01 09:19:06,4813,29,2179585,1880802250341,4126988312,6,,True,1
50314,15850,2024-09-01 13:49:05,4813,29,2179585,1880802250341,4126988312,6,,True,1
54366,15850,2024-09-01 16:56:45,4813,29,2179579,1881675250689,4208903128,6,,True,1
17038,15850,2024-09-01 18:03:10,4813,106,19,1753602251163,4305669889,6,,True,1
65080,15850,2024-09-06 07:13:25,4675,27,1144483,1891517757037,4293355912,6,8055083000000.0,True,2


In [6]:
buyer_stream

Unnamed: 0,user_id,event_date,eid,category_id,microcat_id,internal_item_id,item_id,user_hash,x,conctact,session_id
50313,15850,2024-09-01 09:19:06,4813,29,2179585,1880802250341,4126988312,6,,True,1
50314,15850,2024-09-01 13:49:05,4813,29,2179585,1880802250341,4126988312,6,,True,1
54366,15850,2024-09-01 16:56:45,4813,29,2179579,1881675250689,4208903128,6,,True,1
17038,15850,2024-09-01 18:03:10,4813,106,19,1753602251163,4305669889,6,,True,1
65080,15850,2024-09-06 07:13:25,4675,27,1144483,1891517757037,4293355912,6,8.055083e+12,True,2
...,...,...,...,...,...,...,...,...,...,...,...
65079,1702546250012,2024-10-16 23:38:16,857,101,3841,1930517250112,4359474466,39,8.264518e+12,True,19850
11942,1702546250012,2024-10-16 23:39:28,857,27,1178044,1930266250050,4168550717,39,8.264513e+12,True,19850
1339,1702546250012,2024-10-16 23:43:17,857,9,21777,1669856001035,3792232410,39,8.264666e+12,True,19850
40876,1702546250012,2024-10-16 23:46:00,857,9,21753,1912986503800,4357178651,39,,True,19850


In [9]:
X = prepare_df_min_len_count(
    buyer_stream, min_len_session=MIN_LEN_SESSION, min_count_item=MIN_COUNT_ITEM
)

Итерация 0: 42438 записей


In [10]:
X

Unnamed: 0,session_id,item_id
0,1,"{4126988312, 4305669889, 4208903128}"
1,3,"{4343722368, 2828827745, 4293355912, 435552692..."
2,4,"{2408812176, 4293355912}"
3,12,"{4307926362, 3880933091, 3881465182}"
4,13,"{4227138946, 3802442791}"
...,...,...
9703,19841,"{4151373521, 2734374846}"
9704,19842,"{3958391463, 3911734122, 4024406062, 370951624..."
9705,19847,"{4536092264, 3862734537, 3831233703}"
9706,19849,"{4464294056, 4384518868}"


In [11]:
# соберем каждой микрокатегории по одному самому популярному товару (здесь можно дорабатывать логику того, какой товар показывать из микрокатегории)
item_counts = (
    buyer_stream.groupby(["microcat_id", "item_id"]).size().reset_index(name="count")
)

most_frequent_items = (
    item_counts.sort_values(["microcat_id", "count"], ascending=[True, False])
    .drop_duplicates("microcat_id")
    .drop("count", axis=1)
)

microcat_to_popular_item = dict(
    zip(most_frequent_items["microcat_id"], most_frequent_items["item_id"])
)

In [12]:
# каждому товару мапим микрокатегорию
needed_info = buyer_stream.drop_duplicates(subset=["item_id"])
item_to_microcat = dict(zip(needed_info["item_id"], needed_info["microcat_id"]))

In [13]:
X["microcats"] = X.item_id.apply(lambda x: [item_to_microcat[item] for item in x])
X_train, X_val = train_test_split(X, test_size=0.1, random_state=42)

In [15]:
micro_cooccur = defaultdict(lambda: defaultdict(int))

for session in X_train.microcats.tolist():
    micros = set(session)
    for i, j in combinations(micros, 2):
        micro_cooccur[i][j] += 1
        micro_cooccur[j][i] += 1

In [16]:
def get_baseline_recommendations(microcat, top_n=5):
    related = micro_cooccur[microcat]
    return sorted(related.items(), key=lambda x: -x[1])[:top_n]


model_answers = []
for session in X_val.microcats.tolist():
    main_cat = session[1]
    predictions = get_baseline_recommendations(main_cat)
    final_answer = [
        microcat_to_popular_item[prediction[0]] for prediction in predictions
    ]
    model_answers.append(final_answer)

In [18]:
K = 10

val_sessions = list(map(lambda x: list(map(str, x)), X_val.item_id.tolist()))
true_rec = list(map(lambda x: x[-(K + 1) : -1], val_sessions))

In [19]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def precision_at_k(y_true_list, y_pred_matrix, k: int) -> float:
    precisions = []
    for i in range(len(y_true_list)):
        y_true = set(y_true_list[i])
        y_pred = set(y_pred_matrix[i][:k])
        intersection = len(y_true & y_pred)
        precisions.append(intersection / k)

    return float(np.mean(precisions))


def recall_at_k(y_true_list, y_pred_matrix, k: int) -> float:
    recalls = []
    for i in range(len(y_true_list)):
        y_true = set(y_true_list[i])
        if not y_true:
            continue
        y_pred = set(y_pred_matrix[i][:k])
        intersection = len(y_true & y_pred)
        recalls.append(intersection / len(y_true))

    return float(np.mean(recalls)) if recalls else 0.0


def ndcg_at_k(y_true_list, y_pred_matrix, k: int, relevance_scores=None) -> float:
    ndcgs = []
    for i in range(len(y_true_list)):
        if relevance_scores is None:
            y_true = set(y_true_list[i])
            rel = np.array(
                [1 if item in y_true else 0 for item in y_pred_matrix[i][:k]]
            )
        else:
            rel = np.array(
                [relevance_scores[i].get(item, 0) for item in y_pred_matrix[i][:k]]
            )

        discounts = np.log2(np.arange(2, k + 2))
        dcg = np.sum(rel / discounts)

        ideal_rel = np.sort(rel)[::-1]
        idcg = np.sum(ideal_rel / discounts)

        ndcgs.append(dcg / idcg if idcg > 0 else 0.0)

    return np.mean(ndcgs) if ndcgs else 0.0


def diversity_at_k(recommendations_embeddings, k=10) -> float:
    top_k_embeddings = recommendations_embeddings[:, :k, :]

    similarities = []
    for user_recs in top_k_embeddings:
        sim_matrix = cosine_similarity(user_recs)
        upper_tri = sim_matrix[np.triu_indices(k, 1)]
        similarities.extend(upper_tri)

    avg_similarity = np.mean(similarities)

    return 1 - avg_similarity


def common_metrics(y_true_list, y_pred_matrix, k: int, relevance_scores=None) -> str:
    precision = precision_at_k(y_true_list, y_pred_matrix, k=k)
    recall = recall_at_k(y_true_list, y_pred_matrix, k=k)
    # ndcg = ndcg_at_k(y_true_list, y_pred_matrix, k=k, relevance_scores=relevance_scores)

    return f"precision: {precision}\nrecall: {recall}\n"


print(common_metrics(true_rec, model_answers, k=K))

precision: 0.0
recall: 0.0

