In [1]:
import numpy as np

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

In [3]:
import torch.nn as nn

In [4]:
import torch.optim as optim

In [5]:
import pandas as pd
from google.colab import files
uploaded = files.upload()

Saving data_thesis.csv to data_thesis.csv


In [6]:
import time

In [7]:
df = pd.read_csv('data_thesis.csv', parse_dates=['TRANSACTION_DT'])

df['MONTH'] = df['TRANSACTION_DT'].dt.to_period('M')

def filter_data(df):
    user_counts = df['CUSTOMER_ID'].value_counts()
    item_counts = df['PRODUCT_ID'].value_counts()

    df_filtered = df[df['CUSTOMER_ID'].isin(user_counts[user_counts >= 10].index)]
    df_filtered = df_filtered[df_filtered['PRODUCT_ID'].isin(item_counts[item_counts >= 10].index)]

    total_months = df_filtered['MONTH'].nunique()

    user_monthly_counts = df_filtered.groupby(['CUSTOMER_ID', 'MONTH']).size().groupby('CUSTOMER_ID').mean()

    valid_users = user_monthly_counts[user_monthly_counts >= 1].index
    df_filtered = df_filtered[df_filtered['CUSTOMER_ID'].isin(valid_users)]

    return df_filtered

prev_shape = None
current_df = df.copy()

while prev_shape != current_df.shape:
    prev_shape = current_df.shape
    current_df = filter_data(current_df)

current_df = current_df.drop(columns=['TRANSACTION_DT', 'ASSET'])
current_df

Unnamed: 0,CUSTOMER_ID,AGE_GROUP,PIN_CODE,PRODUCT_SUBCLASS,PRODUCT_ID,AMOUNT,SALES_PRICE,MONTH
0,1104905,45-49,115,110411,4710199010372,2,30,2000-11
1,418683,45-49,115,120107,4710857472535,1,46,2000-11
2,1057331,35-39,115,100407,4710043654103,2,166,2000-11
3,1849332,45-49,Others,120108,4710126092129,1,38,2000-11
4,1981995,50-54,115,100205,4710176021445,1,18,2000-11
...,...,...,...,...,...,...,...,...
817734,234658,45-49,Unknown,530104,4710168182031,1,149,2001-02
817735,556941,35-39,115,712901,8888021800401,1,150,2001-02
817737,57486,40-44,115,530209,4710731060124,1,55,2001-02
817738,733526,>65,Unknown,510539,4716340052307,1,115,2001-02


In [8]:
for col in current_df.columns:
    unique_count = current_df[col].nunique()
    print(f"Column '{col}': {unique_count} unique values")

Column 'CUSTOMER_ID': 19745 unique values
Column 'AGE_GROUP': 10 unique values
Column 'PIN_CODE': 8 unique values
Column 'PRODUCT_SUBCLASS': 1342 unique values
Column 'PRODUCT_ID': 10721 unique values
Column 'AMOUNT': 80 unique values
Column 'SALES_PRICE': 1879 unique values
Column 'MONTH': 4 unique values


In [9]:
target_month = pd.Period('2001-02', freq='M')
top_n = 50

In [10]:
history_df = current_df[current_df['MONTH'] < target_month]
target_df = current_df[current_df['MONTH'] == target_month]
history_df = history_df.sort_values(['CUSTOMER_ID', 'MONTH'])

def get_last_n_interactions(df, n=100):
    user_sequences = df.groupby('CUSTOMER_ID')['PRODUCT_ID'].apply(lambda x: list(x)[-n:])
    return user_sequences

user_histories = get_last_n_interactions(history_df, n=100)

user_targets = target_df.groupby('CUSTOMER_ID')['PRODUCT_ID'].apply(set)

valid_users = user_histories.index.intersection(user_targets.index)
user_histories = user_histories.loc[valid_users]
user_targets = user_targets.loc[valid_users]

In [11]:
unique_items = current_df['PRODUCT_ID'].unique()
item2idx = {item: idx for idx, item in enumerate(unique_items)}

def encode_sequence(seq):
    return [item2idx[i] for i in seq if i in item2idx]

user_histories_idx = user_histories.apply(encode_sequence)
user_targets_idx = user_targets.apply(lambda items: set(item2idx[i] for i in items if i in item2idx))

In [12]:
class NextMonthDataset(Dataset):
    def __init__(self, user_histories, user_targets):
        self.users = list(user_histories.index)
        self.histories = user_histories
        self.targets = user_targets

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        user = self.users[idx]
        seq = self.histories[user]
        target = self.targets[user]
        return torch.tensor(seq, dtype=torch.long), torch.tensor(list(target), dtype=torch.long)

def collate_fn(batch):
    sequences, targets = zip(*batch)
    max_len = max(len(seq) for seq in sequences)

    padded_seqs = [torch.cat([seq, torch.zeros(max_len - len(seq), dtype=torch.long)]) for seq in sequences]
    padded_seqs = torch.stack(padded_seqs)

    batch_size = len(targets)
    num_items = len(item2idx)
    target_multi_hot = torch.zeros(batch_size, num_items)
    for i, t in enumerate(targets):
        target_multi_hot[i, t] = 1

    return padded_seqs, target_multi_hot

dataset = NextMonthDataset(user_histories_idx, user_targets_idx)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

In [13]:
class EMAE(nn.Module):
    def __init__(self, num_items, embed_dim=64, memory_size=1000):
        super().__init__()
        self.item_embedding = nn.Embedding(num_items, embed_dim, padding_idx=0)
        self.encoder = nn.GRU(embed_dim, embed_dim, batch_first=True)
        self.memory = nn.Parameter(torch.randn(memory_size, embed_dim))
        self.linear = nn.Linear(embed_dim * 2, num_items)

    def forward(self, x):
        emb = self.item_embedding(x)
        _, h = self.encoder(emb)
        h = h.squeeze(0)

        mem = self.memory.mean(dim=0)
        mem = mem.unsqueeze(0).expand(h.size(0), -1)

        combined = torch.cat([h, mem], dim=1)
        logits = self.linear(combined)
        return logits

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EMAE(num_items=len(item2idx)).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

epochs = 10

start = time.time()

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")

Epoch 1, Loss: 0.1088
Epoch 2, Loss: 0.0094
Epoch 3, Loss: 0.0084
Epoch 4, Loss: 0.0081
Epoch 5, Loss: 0.0080
Epoch 6, Loss: 0.0080
Epoch 7, Loss: 0.0080
Epoch 8, Loss: 0.0080
Epoch 9, Loss: 0.0080
Epoch 10, Loss: 0.0080


In [15]:
end = time.time()
print(f"Время выполнения: {(end - start):.2f} секунд")

Время выполнения: 163.48 секунд


In [16]:
model.eval()
with torch.no_grad():
    logits = model(inputs.to(device))
    probs = torch.sigmoid(logits)
    top_n = 50
    top_preds = torch.topk(probs, top_n, dim=1).indices.cpu().numpy()

In [17]:
def recall_at_k(pred_items, true_items, k):
    pred_k = pred_items[:k]
    hits = len(set(pred_k) & true_items)
    recall = hits / len(true_items) if len(true_items) > 0 else 0.0
    return recall

def dcg_at_k(pred_items, true_items, k):
    dcg = 0.0
    for i, p in enumerate(pred_items[:k]):
        if p in true_items:
            dcg += 1 / np.log2(i + 2)
    return dcg

def idcg_at_k(true_items, k):
    n_relevant = min(len(true_items), k)
    idcg = sum(1 / np.log2(i + 2) for i in range(n_relevant))
    return idcg

def ndcg_at_k(pred_items, true_items, k):
    idcg = idcg_at_k(true_items, k)
    if idcg == 0:
        return 0.0
    dcg = dcg_at_k(pred_items, true_items, k)
    return dcg / idcg

def average_precision_at_k(pred_items, true_items, k):
    hits = 0
    sum_precisions = 0.0
    for i, p in enumerate(pred_items[:k]):
        if p in true_items:
            hits += 1
            precision_at_i = hits / (i + 1)
            sum_precisions += precision_at_i
    if hits == 0:
        return 0.0
    return sum_precisions / hits

def mean_reciprocal_rank_at_k(pred_items, true_items, k):
    for rank, p in enumerate(pred_items[:k], start=1):
        if p in true_items:
            return 1.0 / rank
    return 0.0

def hit_rate_at_k(pred_items, true_items, k):
    pred_k = pred_items[:k]
    return int(any(p in true_items for p in pred_k))

def evaluate_batch(preds, targets, top_k):
    batch_size = preds.shape[0]
    recall_list = []
    ndcg_list = []
    map_list = []
    mrr_list = []
    hitrate_list = []

    for i in range(batch_size):
        true_items = set(np.where(targets[i] == 1)[0])
        if len(true_items) == 0:
            continue

        top_pred_items = preds[i].argsort()[::-1][:top_k]

        recall = recall_at_k(top_pred_items, true_items, top_k)
        ndcg = ndcg_at_k(top_pred_items, true_items, top_k)
        ap = average_precision_at_k(top_pred_items, true_items, top_k)
        mrr = mean_reciprocal_rank_at_k(top_pred_items, true_items, top_k)
        hit = hit_rate_at_k(top_pred_items, true_items, top_k)

        recall_list.append(recall)
        ndcg_list.append(ndcg)
        map_list.append(ap)
        mrr_list.append(mrr)
        hitrate_list.append(hit)

    avg_recall = np.mean(recall_list) if recall_list else 0.0
    avg_ndcg = np.mean(ndcg_list) if ndcg_list else 0.0
    avg_map = np.mean(map_list) if map_list else 0.0
    avg_mrr = np.mean(mrr_list) if mrr_list else 0.0
    avg_hit = np.mean(hitrate_list) if hitrate_list else 0.0

    return avg_recall, avg_ndcg, avg_map, avg_mrr, avg_hit


model.eval()

for K in [10, 20, 50]:
    all_recalls, all_ndcgs, all_maps = [], [], []
    all_mrrs, all_hits = [], []

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            logits = model(inputs)
            probs = torch.sigmoid(logits).cpu().numpy()
            targets_np = targets.cpu().numpy()

            recall, ndcg, map_score, mrr, hit = evaluate_batch(probs, targets_np, top_k=K)

            all_recalls.append(recall)
            all_ndcgs.append(ndcg)
            all_maps.append(map_score)
            all_mrrs.append(mrr)
            all_hits.append(hit)

    print(f"Metrics @ {K}:")
    print(f"  Recall: {np.mean(all_recalls):.4f}")
    print(f"  NDCG:   {np.mean(all_ndcgs):.4f}")
    print(f"  MAP:    {np.mean(all_maps):.4f}")
    print(f"  MRR:    {np.mean(all_mrrs):.4f}")
    print(f"  HitRate:{np.mean(all_hits):.4f}")
    print()

Metrics @ 10:
  Recall: 0.0731
  NDCG:   0.1108
  MAP:    0.2546
  MRR:    0.2847
  HitRate:0.4430

Metrics @ 20:
  Recall: 0.1060
  NDCG:   0.1130
  MAP:    0.2381
  MRR:    0.2921
  HitRate:0.5558

Metrics @ 50:
  Recall: 0.1730
  NDCG:   0.1365
  MAP:    0.1970
  MRR:    0.2972
  HitRate:0.7053



Recall@K = 7–17% — достаточно низкий, но типичный результат для сложных задач с большим количеством товаров и очень разреженными данными.

NDCG@K около 0.11–0.14 — указывает, что релевантные товары не всегда находятся в самых верхних позициях списка, ранжирование можно улучшить.

MAP@K от 0.25 до 0.20 — средняя точность по релевантным элементам умеренная, показывает, что релевантные товары разбросаны по списку.

MRR около 0.28–0.30 — в среднем первый релевантный товар находится примерно в позиции 3–4 (т.к. MRR = 1/позиция).

Hit Rate 44–70% — модель покрывает релевантные товары для примерно половины пользователей в топ-10 и до 70% в топ-50. Модель часто угадывает хотя бы один релевантный товар.

