In [1]:
import time

In [3]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2


In [4]:
import pandas as pd
from google.colab import files
from scipy import sparse
from implicit.als import AlternatingLeastSquares
import numpy as np
uploaded = files.upload()

Saving data_thesis.csv to data_thesis.csv


In [5]:
df = pd.read_csv('data_thesis.csv', parse_dates=['TRANSACTION_DT'])

df['MONTH'] = df['TRANSACTION_DT'].dt.to_period('M')

def filter_data(df):
    user_counts = df['CUSTOMER_ID'].value_counts()
    item_counts = df['PRODUCT_ID'].value_counts()

    df_filtered = df[df['CUSTOMER_ID'].isin(user_counts[user_counts >= 10].index)]
    df_filtered = df_filtered[df_filtered['PRODUCT_ID'].isin(item_counts[item_counts >= 10].index)]

    total_months = df_filtered['MONTH'].nunique()

    user_monthly_counts = df_filtered.groupby(['CUSTOMER_ID', 'MONTH']).size().groupby('CUSTOMER_ID').mean()

    valid_users = user_monthly_counts[user_monthly_counts >= 1].index
    df_filtered = df_filtered[df_filtered['CUSTOMER_ID'].isin(valid_users)]

    return df_filtered

prev_shape = None
current_df = df.copy()

while prev_shape != current_df.shape:
    prev_shape = current_df.shape
    current_df = filter_data(current_df)

current_df = current_df.drop(columns=['TRANSACTION_DT', 'ASSET'])
current_df

Unnamed: 0,CUSTOMER_ID,AGE_GROUP,PIN_CODE,PRODUCT_SUBCLASS,PRODUCT_ID,AMOUNT,SALES_PRICE,MONTH
0,1104905,45-49,115,110411,4710199010372,2,30,2000-11
1,418683,45-49,115,120107,4710857472535,1,46,2000-11
2,1057331,35-39,115,100407,4710043654103,2,166,2000-11
3,1849332,45-49,Others,120108,4710126092129,1,38,2000-11
4,1981995,50-54,115,100205,4710176021445,1,18,2000-11
...,...,...,...,...,...,...,...,...
817734,234658,45-49,Unknown,530104,4710168182031,1,149,2001-02
817735,556941,35-39,115,712901,8888021800401,1,150,2001-02
817737,57486,40-44,115,530209,4710731060124,1,55,2001-02
817738,733526,>65,Unknown,510539,4716340052307,1,115,2001-02


In [6]:
for col in current_df.columns:
    unique_count = current_df[col].nunique()
    print(f"Column '{col}': {unique_count} unique values")

Column 'CUSTOMER_ID': 19745 unique values
Column 'AGE_GROUP': 10 unique values
Column 'PIN_CODE': 8 unique values
Column 'PRODUCT_SUBCLASS': 1342 unique values
Column 'PRODUCT_ID': 10721 unique values
Column 'AMOUNT': 80 unique values
Column 'SALES_PRICE': 1879 unique values
Column 'MONTH': 4 unique values


In [7]:
target_month = pd.Period('2001-02', freq='M')
top_n = 50

In [8]:
history_df = current_df[current_df['MONTH'] < target_month]
target_df = current_df[current_df['MONTH'] == target_month]

all_users = current_df['CUSTOMER_ID'].unique()
all_items = current_df['PRODUCT_ID'].unique()

user2idx = {u: i for i, u in enumerate(all_users)}
item2idx = {i: j for j, i in enumerate(all_items)}

idx2user = {v: k for k, v in user2idx.items()}
idx2item = {v: k for k, v in item2idx.items()}

In [9]:
def build_interaction_matrix(df):
    rows = df['PRODUCT_ID'].map(item2idx)
    cols = df['CUSTOMER_ID'].map(user2idx)
    data = np.ones(len(df), dtype=np.float32)

    interaction_matrix = sparse.coo_matrix((data, (rows, cols)),
                                           shape=(len(all_items), len(all_users)))
    return interaction_matrix.tocsr()

history_matrix = build_interaction_matrix(history_df)

In [10]:
model = AlternatingLeastSquares(factors=64, regularization=0.1, iterations=20, use_gpu=False)
start = time.time()
model.fit(history_matrix)

  check_blas_config()


  0%|          | 0/20 [00:00<?, ?it/s]

In [11]:
end = time.time()
print(f"Время выполнения: {(end - start):.2f} секунд")

Время выполнения: 8.07 секунд


In [12]:
target_matrix = build_interaction_matrix(target_df)

def get_user_true_items(matrix):
    user_true = {}
    for user_idx in range(matrix.shape[0]):
        user_true[user_idx] = set(matrix[user_idx].indices)
    return user_true

user_true_items = get_user_true_items(target_matrix)

In [13]:
def recommend_for_user(user_idx, N=top_n):
    user_items = history_matrix.T[user_idx, :].tocsr()
    recs = model.recommend(user_idx, user_items, N=N, filter_already_liked_items=True)
    return [x[0] for x in recs]

user_recs = {}
for user_idx in user_true_items:
    rec_items = recommend_for_user(user_idx)
    user_recs[user_idx] = rec_items

In [15]:
def recall_at_k(pred_items, true_items, k):
    pred_k = pred_items[:k]
    hits = len(set(pred_k) & true_items)
    if len(true_items) == 0:
        return 0.0
    return hits / len(true_items)

def dcg_at_k(pred_items, true_items, k):
    pred_k = pred_items[:k]
    dcg = 0.0
    for i, item in enumerate(pred_k):
        if item in true_items:
            dcg += 1.0 / np.log2(i + 2)  # позиция с 1, поэтому +2 в логарифме
    return dcg

def idcg_at_k(true_items, k):
    # Меньшая из длины релевантных элементов и k — идеальный релевантный порядок
    max_rel = min(len(true_items), k)
    idcg = 0.0
    for i in range(max_rel):
        idcg += 1.0 / np.log2(i + 2)
    return idcg

def ndcg_at_k(pred_items, true_items, k):
    idcg = idcg_at_k(true_items, k)
    if idcg == 0:
        return 0.0
    dcg = dcg_at_k(pred_items, true_items, k)
    return dcg / idcg

def average_precision_at_k(pred_items, true_items, k):
    pred_k = pred_items[:k]
    hits = 0
    sum_precisions = 0.0
    for i, item in enumerate(pred_k, start=1):
        if item in true_items:
            hits += 1
            sum_precisions += hits / i
    if hits == 0:
        return 0.0
    return sum_precisions / hits

def mean_reciprocal_rank(pred_items, true_items):
    for rank, item in enumerate(pred_items, start=1):
        if item in true_items:
            return 1.0 / rank
    return 0.0

def hit_rate(pred_items, true_items):
    return int(any(item in true_items for item in pred_items))

In [16]:
def evaluate_recommendations(user_recs, user_true_items, top_k):
    recalls = []
    ndcgs = []
    maps = []
    mrrs = []
    hits = []

    for user_idx, true_items in user_true_items.items():
        pred_items = user_recs.get(user_idx, [])[:top_k]

        recall = recall_at_k(pred_items, true_items, top_k)
        ndcg = ndcg_at_k(pred_items, true_items, top_k)
        ap = average_precision_at_k(pred_items, true_items, top_k)
        mrr = mean_reciprocal_rank(pred_items, true_items)
        hit = hit_rate(pred_items, true_items)

        recalls.append(recall)
        ndcgs.append(ndcg)
        maps.append(ap)
        mrrs.append(mrr)
        hits.append(hit)

    print(f"Recall@{top_k}: {np.mean(recalls):.4f}")
    print(f"NDCG@{top_k}: {np.mean(ndcgs):.4f}")
    print(f"MAP@{top_k}: {np.mean(maps):.4f}")
    print(f"MRR@{top_k}: {np.mean(mrrs):.4f}")
    print(f"HitRate@{top_k}: {np.mean(hits):.4f}")

evaluate_recommendations(user_recs, user_true_items, top_k=10)
evaluate_recommendations(user_recs, user_true_items, top_k=20)
evaluate_recommendations(user_recs, user_true_items, top_k=50)

Recall@10: 0.0033
NDCG@10: 0.0083
MAP@10: 0.0302
MRR@10: 0.0302
HitRate@10: 0.0302
Recall@20: 0.0033
NDCG@20: 0.0070
MAP@20: 0.0302
MRR@20: 0.0302
HitRate@20: 0.0302
Recall@50: 0.0033
NDCG@50: 0.0063
MAP@50: 0.0302
MRR@50: 0.0302
HitRate@50: 0.0302


Recall@K ≈ 0.0033 — из всех релевантных товаров в среднем рекомендовано только 0.33%.

NDCG@K < 0.01 — качество ранжирования очень низкое.

MAP@K, MRR@K, HitRate@K ≈ 0.0302 — вероятность встретить релевантный товар в рекомендациях около 3%, а средний ранг первого релевантного элемента очень низкий.