In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
import time

In [2]:
# Colab-only helper: files.upload() opens a file picker in the notebook UI.
# The cell output shows the picked file being saved as data_thesis.csv, which
# is the path the read_csv call in the next cell expects.
from google.colab import files
uploaded = files.upload()

Saving data_thesis.csv to data_thesis.csv


In [3]:
# Load the transaction log; TRANSACTION_DT is parsed to datetime so that the
# .dt accessor on the next line works.
df = pd.read_csv('data_thesis.csv', parse_dates=['TRANSACTION_DT'])
# Monthly period of each transaction; used by filter_data and by the
# train/test split further down.
df['MONTH'] = df['TRANSACTION_DT'].dt.to_period('M')

def filter_data(df, min_user_count=10, min_item_count=10, min_monthly_avg=1):
    """Run one pass of activity filtering over a transaction frame.

    Both frequency tables are computed on the incoming frame, so dropping
    users can push items below their threshold (and vice versa); callers
    that need a stable result should re-apply this function until the shape
    stops changing.

    Args:
        df: frame with 'CUSTOMER_ID', 'PRODUCT_ID' and 'MONTH' columns.
        min_user_count: minimum number of rows a customer must have.
        min_item_count: minimum number of rows a product must have.
        min_monthly_avg: minimum mean number of rows per month for a
            customer, averaged over the months in which that customer
            appears.

    Returns:
        The subset of ``df`` rows that survive all three filters.
    """
    user_counts = df['CUSTOMER_ID'].value_counts()
    item_counts = df['PRODUCT_ID'].value_counts()

    active_users = user_counts[user_counts >= min_user_count].index
    popular_items = item_counts[item_counts >= min_item_count].index

    df_filtered = df[df['CUSTOMER_ID'].isin(active_users)]
    df_filtered = df_filtered[df_filtered['PRODUCT_ID'].isin(popular_items)]

    # NOTE(review): size() only yields (customer, month) pairs that exist, so
    # every per-month count is >= 1 and the mean is always >= 1. With the
    # default min_monthly_avg=1 this filter therefore never removes anyone.
    # If the intent was "average over ALL months in the data", the counts
    # would have to be divided by the total number of months instead.
    per_month = df_filtered.groupby(['CUSTOMER_ID', 'MONTH']).size()
    user_monthly_counts = per_month.groupby('CUSTOMER_ID').mean()
    valid_users = user_monthly_counts[user_monthly_counts >= min_monthly_avg].index

    return df_filtered[df_filtered['CUSTOMER_ID'].isin(valid_users)]

# Re-apply filter_data until the frame's shape stops changing: each pass can
# push more users/items below the thresholds, so a single pass is not enough.
prev_shape = None
current_df = df.copy()
while prev_shape != current_df.shape:
    prev_shape = current_df.shape
    current_df = filter_data(current_df)
# Drop columns that none of the visible later cells read (MONTH already
# carries the time information used for the split).
current_df = current_df.drop(columns=['TRANSACTION_DT', 'ASSET'])

In [4]:
# Show the filtered frame; the saved output below is the truncated head/tail view.
current_df

Unnamed: 0,CUSTOMER_ID,AGE_GROUP,PIN_CODE,PRODUCT_SUBCLASS,PRODUCT_ID,AMOUNT,SALES_PRICE,MONTH
0,1104905,45-49,115,110411,4710199010372,2,30,2000-11
1,418683,45-49,115,120107,4710857472535,1,46,2000-11
2,1057331,35-39,115,100407,4710043654103,2,166,2000-11
3,1849332,45-49,Others,120108,4710126092129,1,38,2000-11
4,1981995,50-54,115,100205,4710176021445,1,18,2000-11
...,...,...,...,...,...,...,...,...
817734,234658,45-49,Unknown,530104,4710168182031,1,149,2001-02
817735,556941,35-39,115,712901,8888021800401,1,150,2001-02
817737,57486,40-44,115,530209,4710731060124,1,55,2001-02
817738,733526,>65,Unknown,510539,4716340052307,1,115,2001-02


In [5]:
# Map raw customer/product identifiers to contiguous integer ids starting at 0,
# so they can be used directly as row indices into nn.Embedding tables.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

current_df['user_id'] = user_encoder.fit_transform(current_df['CUSTOMER_ID'])
current_df['item_id'] = item_encoder.fit_transform(current_df['PRODUCT_ID'])

# Embedding table sizes: number of distinct encoded users / items.
num_users = current_df['user_id'].nunique()
num_items = current_df['item_id'].nunique()

In [6]:
# Temporal hold-out: the chronologically last month is the test set and all
# earlier months form the training set.
months = sorted(current_df['MONTH'].unique())
train_months = months[:-1]
test_month = months[-1]

train_df = current_df[current_df['MONTH'].isin(train_months)]
test_df = current_df[current_df['MONTH'] == test_month]

# user_id -> set of item_ids the user interacted with in each split.
# A user present only in the test month has no entry in train_user_items.
train_user_items = train_df.groupby('user_id')['item_id'].apply(set).to_dict()
test_user_items = test_df.groupby('user_id')['item_id'].apply(set).to_dict()

In [7]:
class NCFDataset(Dataset):
    """Pointwise training set of (user, item, label) triples for implicit feedback.

    Every observed (user, item) pair becomes a positive example (label 1) and
    is followed by ``negative_samples`` uniformly drawn items the user has not
    interacted with (label 0). The triples are generated once, at
    construction time, and stored in ``self.data``.

    Args:
        user_item_dict: mapping user id -> collection of positive item ids.
        num_items: size of the item catalogue; negatives are drawn from
            ``range(num_items)``.
        negative_samples: number of negatives generated per positive.
        seed: optional integer. When given, negatives are drawn from a
            private ``np.random.RandomState(seed)`` so the dataset is
            reproducible; when ``None`` the global NumPy generator is used,
            as before.
    """

    def __init__(self, user_item_dict, num_items, negative_samples=4, seed=None):
        self.user_item_dict = user_item_dict
        self.users = list(user_item_dict.keys())
        self.num_items = num_items
        self.negative_samples = negative_samples
        # Only the seed is stored (not a generator or the np.random module)
        # so the dataset stays picklable for DataLoader worker processes.
        self.seed = seed
        self.data = []
        self.prepare()

    def prepare(self):
        """Fill ``self.data`` with positives and their sampled negatives."""
        rng = np.random if self.seed is None else np.random.RandomState(self.seed)
        for user in self.users:
            pos_items = self.user_item_dict[user]
            # If the positives already cover the whole catalogue there is no
            # valid negative and rejection sampling would never terminate.
            covered = sum(1 for item in set(pos_items) if 0 <= item < self.num_items)
            can_sample = covered < self.num_items
            for pos_item in pos_items:
                self.data.append((user, pos_item, 1))
                if not can_sample:
                    continue
                for _ in range(self.negative_samples):
                    neg_item = rng.randint(self.num_items)
                    # Rejection sampling: redraw until the item is unseen.
                    while neg_item in pos_items:
                        neg_item = rng.randint(self.num_items)
                    self.data.append((user, neg_item, 0))

    def __len__(self):
        """Number of stored (user, item, label) triples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th (user, item, label) triple."""
        return self.data[idx]

# Materialise the training triples once (default: 4 negatives per positive)
# and serve them in shuffled mini-batches of 1024 using two loader workers.
train_dataset = NCFDataset(train_user_items, num_items)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True, num_workers=2)

In [8]:
class NCF(nn.Module):
    """MLP-based neural collaborative filtering scorer.

    A user embedding and an item embedding are concatenated and passed
    through a stack of Linear+ReLU layers, then a final Linear layer and a
    sigmoid, producing an interaction probability per (user, item) pair.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        emb_size: dimensionality of each embedding.
        hidden_layers: output widths of the hidden Linear layers, in order.
    """

    def __init__(self, num_users, num_items, emb_size=32, hidden_layers=(64, 32, 16, 8)):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)

        layers = []
        input_size = emb_size * 2  # user and item embeddings are concatenated
        for h in hidden_layers:
            layers.append(nn.Linear(input_size, h))
            layers.append(nn.ReLU())
            input_size = h
        layers.append(nn.Linear(input_size, 1))
        self.mlp = nn.Sequential(*layers)
        self.sigmoid = nn.Sigmoid()

    def forward(self, user, item):
        """Score (user, item) pairs.

        Args:
            user: integer tensor of user ids.
            item: integer tensor of item ids, same shape as ``user``.

        Returns:
            Tensor of probabilities with the same shape as ``user``.
        """
        u = self.user_emb(user)
        i = self.item_emb(item)
        x = torch.cat([u, i], dim=-1)
        x = self.mlp(x)
        # Squeeze only the trailing size-1 output dimension. A bare squeeze()
        # would also collapse a batch of size 1 to a 0-d tensor, which no
        # longer matches the (1,)-shaped label tensor passed to BCELoss.
        return self.sigmoid(x).squeeze(-1)

# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NCF(num_users, num_items).to(device)
# The model already applies a sigmoid, so plain BCELoss (on probabilities) is used.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [9]:
# Train for a fixed number of epochs and report the per-example mean loss.
epochs = 5
model.train()
start_time = time.time()

for epoch in range(epochs):
    total_loss = 0
    for user, item, label in train_loader:
        user = user.to(device)
        item = item.to(device)
        # BCELoss needs float targets; the dataset stores labels as ints.
        label = label.float().to(device)
        optimizer.zero_grad()
        preds = model(user, item)
        loss = criterion(preds, label)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight by batch size so that the
        # epoch figure is an exact per-example mean even with a short last batch.
        total_loss += loss.item() * user.size(0)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader.dataset):.4f}")

# Wall-clock time of the whole training loop, in seconds.
train_time = time.time() - start_time
print(f"Training time: {train_time:.2f} sec")

Epoch 1/5, Loss: 0.4420
Epoch 2/5, Loss: 0.4021
Epoch 3/5, Loss: 0.3988
Epoch 4/5, Loss: 0.3967
Epoch 5/5, Loss: 0.3942
Training time: 176.77 sec


In [10]:
# Switch the module to evaluation mode before scoring candidates.
model.eval()

def get_user_recommendations(model, user, train_items, all_items, top_k):
    """Return up to ``top_k`` unseen items for ``user``, best score first.

    Args:
        model: callable module mapping (user_tensor, item_tensor) to scores.
        user: integer user id.
        train_items: items the user already interacted with; these are
            excluded from the candidates.
        all_items: iterable of every item id in the catalogue.
        top_k: maximum number of recommendations to return.

    Returns:
        List of item ids ordered by descending score. It is shorter than
        ``top_k`` when fewer candidates exist and empty when there are none.
    """
    candidates = list(set(all_items) - set(train_items))
    # np.argpartition requires kth < len(array); clamp so that users with few
    # remaining candidates do not raise.
    top_k = min(top_k, len(candidates))
    if top_k <= 0:
        return []

    # Score on whatever device the model lives on rather than relying on a
    # module-level `device` variable.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    user_tensor = torch.tensor([user] * len(candidates), dtype=torch.long).to(device)
    item_tensor = torch.tensor(candidates, dtype=torch.long).to(device)
    with torch.no_grad():
        # reshape(-1) keeps a 1-d array even if the model returns a 0-d
        # tensor for a single candidate.
        scores = model(user_tensor, item_tensor).cpu().numpy().reshape(-1)

    if top_k < len(candidates):
        # Partial selection of the top_k best scores (unordered) ...
        top_indices = np.argpartition(-scores, top_k)[:top_k]
    else:
        top_indices = np.arange(len(candidates))
    # ... followed by a full sort of just those top_k entries.
    ranked = top_indices[np.argsort(-scores[top_indices])]
    return [candidates[i] for i in ranked]

def recall_at_k(recommended, relevant, k):
    """Fraction of the relevant items that appear in the first ``k`` recommendations.

    ``relevant`` is expected to be a set; an empty set yields 0.
    """
    top_k_items = set(recommended[:k])
    n_hits = len(top_k_items & relevant)
    if not relevant:
        return 0
    return n_hits / len(relevant)

def dcg_at_k(recommended, relevant, k):
    """Discounted cumulative gain of the first ``k`` recommendations.

    Uses binary relevance: a hit at zero-based position ``p`` contributes
    ``1 / log2(p + 2)``.
    """
    return sum(
        1 / np.log2(position + 2)
        for position, candidate in enumerate(recommended[:k])
        if candidate in relevant
    )

def idcg_at_k(relevant, k):
    """Ideal DCG: the DCG of a ranking whose first min(len(relevant), k) slots are all hits."""
    ideal_hits = min(len(relevant), k)
    total = 0
    for position in range(ideal_hits):
        total += 1 / np.log2(position + 2)
    return total

def ndcg_at_k(recommended, relevant, k):
    """DCG@k normalised by the ideal DCG@k; 0 when the ideal DCG is 0."""
    ideal = idcg_at_k(relevant, k)
    if ideal != 0:
        return dcg_at_k(recommended, relevant, k) / ideal
    return 0

def average_precision_at_k(recommended, relevant, k):
    """Average precision over the first ``k`` recommendations.

    Precision is accumulated at every rank where a hit occurs and the sum is
    divided by ``min(len(relevant), k)``. Returns 0 when ``relevant`` is empty.
    """
    running_hits = 0
    precision_total = 0
    rank = 0
    for candidate in recommended[:k]:
        rank += 1
        if candidate not in relevant:
            continue
        running_hits += 1
        precision_total += running_hits / rank
    if relevant:
        return precision_total / min(len(relevant), k)
    return 0

def mrr_at_k(recommended, relevant, k):
    """Reciprocal rank (1-based) of the first hit within the top ``k``; 0.0 if there is none."""
    reciprocal_ranks = (
        1.0 / rank
        for rank, candidate in enumerate(recommended[:k], 1)
        if candidate in relevant
    )
    return next(reciprocal_ranks, 0.0)

def hit_rate_at_k(recommended, relevant, k):
    """1.0 if at least one of the first ``k`` recommendations is relevant, otherwise 0.0."""
    top_k_items = set(recommended[:k])
    if top_k_items & relevant:
        return 1.0
    return 0.0

# Cut-offs at which every metric is reported.
ks = [10, 20, 50]

all_items = set(range(num_items))
users_in_test = set(test_user_items.keys())

# metrics[k][name] collects one value per evaluated user.
metrics = {k: {'recall': [], 'ndcg': [], 'map': [], 'mrr': [], 'hitrate': []} for k in ks}

for user in users_in_test:
    # Users that appear only in the test month have no training history.
    train_items = train_user_items.get(user, set())
    relevant = test_user_items[user]
    # Rank once at the largest cut-off; smaller k values reuse its prefix.
    recommended = get_user_recommendations(model, user, train_items, all_items, max(ks))
    for k in ks:
        metrics[k]['recall'].append(recall_at_k(recommended, relevant, k))
        metrics[k]['ndcg'].append(ndcg_at_k(recommended, relevant, k))
        metrics[k]['map'].append(average_precision_at_k(recommended, relevant, k))
        metrics[k]['mrr'].append(mrr_at_k(recommended, relevant, k))
        metrics[k]['hitrate'].append(hit_rate_at_k(recommended, relevant, k))

# Report the unweighted mean over users for each metric and cut-off.
print("\nEvaluation metrics:")
for k in ks:
    recall = np.mean(metrics[k]['recall'])
    ndcg = np.mean(metrics[k]['ndcg'])
    map_ = np.mean(metrics[k]['map'])
    mrr = np.mean(metrics[k]['mrr'])
    hitrate = np.mean(metrics[k]['hitrate'])
    print(f"Recall@{k}: {recall:.4f}")
    print(f"NDCG@{k}: {ndcg:.4f}")
    print(f"MAP@{k}: {map_:.4f}")
    print(f"MRR@{k}: {mrr:.4f}")
    print(f"HitRate@{k}: {hitrate:.4f}")


Evaluation metrics:
Recall@10: 0.0290
NDCG@10: 0.0365
MAP@10: 0.0147
MRR@10: 0.0888
HitRate@10: 0.2405
Recall@20: 0.0447
NDCG@20: 0.0396
MAP@20: 0.0133
MRR@20: 0.0962
HitRate@20: 0.3491
Recall@50: 0.0770
NDCG@50: 0.0516
MAP@50: 0.0143
MRR@50: 0.1013
HitRate@50: 0.5086


Recall@10 = 2.9%, @20 = 4.5%, @50 = 7.7%: очень низкий показатель полноты.

NDCG@K около 0.03–0.05: релевантные товары расположены далеко в списке или отсутствуют.

MAP около 0.014: средняя точность низкая, что дополнительно подтверждает, что релевантные рекомендации либо редки, либо распределены по низким позициям.

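MRR около 0.09–0.10: усреднённый обратный ранг первого релевантного товара низкий. Это не означает, что первый релевантный товар в среднем стоит на 10-й позиции: пользователи без попаданий в топ-K дают вклад 0. При HitRate@10 ≈ 24% средний обратный ранг среди пользователей с попаданием составляет около 0.37 (0.0888 / 0.2405), то есть первый релевантный товар у них оказывается примерно на 2–3-й позиции.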

HitRate от 24% до 51%: модель угадывает хотя бы один релевантный товар примерно у четверти пользователей в топ-10 и у половины — в топ-50.