In [1]:
import os
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1" 
os.environ["OMP_NUM_THREADS"] = "1"

import threadpoolctl
threadpoolctl.threadpool_limits(1, "blas")

import implicit
import numpy as np
import pandas as pd
import polars as pl
from tqdm import tqdm
from lightfm import LightFM
from abc import ABC, abstractmethod
from typing import Dict, List
from collections import defaultdict, Counter 

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from lightfm.data import Dataset as LightFMDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load both interaction splits up front; later cells reuse these globals.
train = pl.read_parquet("../../data/train.pq")
test = pl.read_parquet("../../data/test.pq")

# Item vocabularies per split; "cold" items are those seen in test but never in train.
train_items = set(train.get_column("item_id").unique())
test_items = set(test.get_column("item_id").unique())
cold_items = test_items.difference(train_items)

### Метрики оценивания моделей

In [3]:
class Validator(ABC):
    """Base class holding the data splits and the per-user top-k ranking metrics.

    Subclasses implement :meth:`evaluate`. All metric helpers treat the ground
    truth as a *set* of item ids, so an item that appears several times in
    ``y_true`` is counted once in every metric.
    """

    def __init__(self, train: "pl.DataFrame", test: "pl.DataFrame", cold_items: set = None):
        """
        train: training interactions; only ``train["item_id"].unique()`` is read (by ``coverage``)
        test: held-out interactions, stored for subclasses
        cold_items: item ids regarded as cold; ``None`` or an empty set means "no cold items"
        """
        self.train = train
        self.test = test
        self.cold_items = cold_items or set()

    @abstractmethod
    def evaluate(self, predictions: Dict[int, List[int]]) -> Dict[str, float]:
        """
        predictions: dict user_id -> list of recommended item_ids
        """
        pass

    def recall_at_k(self, y_true: List[int], y_pred: List[int], k: int = 10) -> float:
        """Share of the distinct relevant items found in the first ``k`` predictions (0.0 if none are relevant)."""
        relevant = set(y_true)
        if not relevant:
            return 0.0
        return len(relevant & set(y_pred[:k])) / len(relevant)

    def precision_at_k(self, y_true: List[int], y_pred: List[int], k: int = 10) -> float:
        """Number of distinct relevant items in the first ``k`` predictions divided by ``k``."""
        relevant = set(y_true)
        # k <= 0 would divide by zero (or slice from the wrong end), so report 0.0 instead.
        if not relevant or k <= 0:
            return 0.0
        return len(relevant & set(y_pred[:k])) / k

    def hitrate_at_k(self, y_true: List[int], y_pred: List[int], k: int = 10) -> float:
        """1.0 if at least one relevant item is among the first ``k`` predictions, else 0.0."""
        return 0.0 if set(y_true).isdisjoint(y_pred[:k]) else 1.0

    def ndcg_at_k(self, y_true: List[int], y_pred: List[int], k: int = 10) -> float:
        """Binary-relevance NDCG over the first ``k`` predictions.

        The ideal DCG is built from the number of *distinct* relevant items
        (matching ``recall_at_k``), and a relevant item repeated in ``y_pred``
        is rewarded only at its first position, which keeps the value in [0, 1].
        """
        relevant = set(y_true)
        if not relevant:
            return 0.0
        dcg = 0.0
        rewarded = set()
        for rank, item in enumerate(y_pred[:k]):
            if item in relevant and item not in rewarded:
                rewarded.add(item)
                dcg += 1 / np.log2(rank + 2)
        idcg = sum(1 / np.log2(rank + 2) for rank in range(min(len(relevant), k)))
        return dcg / idcg if idcg > 0 else 0.0

    def mrr_at_k(self, y_true: List[int], y_pred: List[int], k: int = 10) -> float:
        """Reciprocal rank of the first relevant item within the first ``k`` predictions (0.0 if there is none)."""
        relevant = set(y_true)
        for rank, item in enumerate(y_pred[:k], start=1):
            if item in relevant:
                return 1 / rank
        return 0.0

    def coverage(self, predictions: Dict[int, List[int]]) -> float:
        """Number of distinct recommended items divided by the number of distinct train items.

        Returns 0.0 when the train split has no items instead of dividing by zero.
        """
        recommended = {item for recs in predictions.values() for item in recs}
        catalogue = set(self.train["item_id"].unique())
        if not catalogue:
            return 0.0
        return len(recommended) / len(catalogue)

    @staticmethod
    def print_metrics(metrics: Dict[str, float]):
        """Print every metric as ``name: value`` with four decimals inside a banner."""
        print("\n=== Evaluation Results ===")
        for key, value in metrics.items():
            print(f"{key:<15}: {value:.4f}")
        print("==========================\n")

Стоит разделить валидацию на две версии, чтобы их можно было сравнить. Моя гипотеза: совместная валидация warm- и cold-айтемов может быть не совсем честной. Например, если в тесте 90% warm и 10% cold, то усреднённый Recall@10 будет определяться в основном warm-айтемами. Модель может полностью «забыть» про cold-айтемы, но в отчёте всё равно будут хорошие цифры. Это вводит в заблуждение: кажется, что модель универсальная, хотя на самом деле проблема cold-start не решена.

#### Блок совместной валидации (baseline)

In [4]:
class JointValidator(Validator):
    """Validator that averages the top-10 metrics over all predicted users, warm and cold items together."""

    def __init__(self, train: pl.DataFrame, test: pl.DataFrame, cold_items: set = None):
        super().__init__(train, test, cold_items)
        # Ground truth lookup: user_id -> list of item_ids that user has in `test`.
        grouped = test.group_by("user_id").agg(pl.col("item_id")).to_dict(as_series=False)
        self.user2items = dict(zip(grouped["user_id"], grouped["item_id"]))

    def evaluate(self, predictions: Dict[int, List[int]]) -> Dict[str, float]:
        """Compute, print and return the averaged metrics.

        predictions: dict user_id -> list of recommended item_ids

        Every user present in ``predictions`` contributes one value per
        metric; a user without test interactions is scored against an empty
        ground-truth list. Returns a dict metric name -> value, with 0.0 for
        the averaged metrics when ``predictions`` is empty.
        """
        recalls, precisions, hits, ndcgs, mrrs = [], [], [], [], []
        for user_id, y_pred in predictions.items():
            y_true = self.user2items.get(user_id, [])
            recalls.append(self.recall_at_k(y_true, y_pred))
            precisions.append(self.precision_at_k(y_true, y_pred))
            hits.append(self.hitrate_at_k(y_true, y_pred))
            ndcgs.append(self.ndcg_at_k(y_true, y_pred))
            mrrs.append(self.mrr_at_k(y_true, y_pred))

        def mean_or_zero(values):
            # np.mean([]) yields NaN plus a RuntimeWarning; report 0.0 like SplitValidator does.
            return np.mean(values) if values else 0.0

        results = {
            "Recall@10": mean_or_zero(recalls),
            "Precision@10": mean_or_zero(precisions),
            "HitRate@10": mean_or_zero(hits),
            "NDCG@10": mean_or_zero(ndcgs),
            "MRR@10": mean_or_zero(mrrs),
            "Coverage": self.coverage(predictions),
        }
        self.print_metrics(results)
        # The signature promises a dict; previously the method fell through and returned None.
        return results

#### Разделенная валидация (cold vs warm)

In [5]:
class SplitValidator(Validator):
    """Validator that reports the top-10 metrics separately for cold and for warm ground-truth items."""

    def __init__(self, train: pl.DataFrame, test: pl.DataFrame, cold_items: set = None):
        super().__init__(train, test, cold_items)
        # Ground truth lookup: user_id -> list of item_ids that user has in `test`.
        grouped = test.group_by("user_id").agg(pl.col("item_id")).to_dict(as_series=False)
        self.user2items = dict(zip(grouped["user_id"], grouped["item_id"]))

    def evaluate(self, predictions: Dict[int, List[int]]) -> Dict[str, float]:
        """Compute, print and return per-subset metrics plus overall coverage.

        predictions: dict user_id -> list of recommended item_ids

        For each subset ("cold", then "warm") a user's ground truth is
        restricted to the items inside / outside ``self.cold_items``; users
        left with no ground truth in that subset are skipped. A subset with
        no contributing users reports 0.0 for every metric.
        """
        results = {}
        for subset in ["cold", "warm"]:
            want_cold = subset == "cold"
            recalls, precisions, hits, ndcgs, mrrs = [], [], [], [], []
            for user_id, y_pred in predictions.items():
                y_true = [
                    item
                    for item in self.user2items.get(user_id, [])
                    if (item in self.cold_items) == want_cold
                ]
                if not y_true:
                    continue
                recalls.append(self.recall_at_k(y_true, y_pred))
                precisions.append(self.precision_at_k(y_true, y_pred))
                hits.append(self.hitrate_at_k(y_true, y_pred))
                ndcgs.append(self.ndcg_at_k(y_true, y_pred))
                mrrs.append(self.mrr_at_k(y_true, y_pred))
            results[f"Recall@10_{subset}"] = np.mean(recalls) if recalls else 0.0
            results[f"Precision@10_{subset}"] = np.mean(precisions) if precisions else 0.0
            results[f"HitRate@10_{subset}"] = np.mean(hits) if hits else 0.0
            results[f"NDCG@10_{subset}"] = np.mean(ndcgs) if ndcgs else 0.0
            results[f"MRR@10_{subset}"] = np.mean(mrrs) if mrrs else 0.0
        results["Coverage"] = self.coverage(predictions)
        self.print_metrics(results)
        # The signature promises a dict; previously the method fell through and returned None.
        return results

#### Вспомогательные функции для удобного представления результатов

Необходимы для составления табличек

In [6]:
def _shorten_list(lst, max_len=10):
    """Обрезает длинные списки для красивого вывода"""
    if lst is None:
        return []
    return lst[:max_len] if len(lst) > max_len else lst

def show_predictions(models: dict, data: pl.DataFrame, n=5, verbose=True, is_val=False, seed=None):
    """Sample rows from ``data`` and attach each model's recommendations for the sampled users.

    models: dict model name -> predictions (dict user_id -> list of item_ids)
    data: frame with at least ``user_id`` and ``item_id`` columns
    n: number of rows to sample; capped at the number of rows in ``data``
    verbose: print the resulting frame and its shape
    is_val: rename ``item_id`` to ``true_items`` in the output
    seed: optional seed forwarded to the sampler so the same rows can be drawn again

    Returns the sampled frame with one extra list column per model; users
    without predictions get an empty list, and lists are cut by
    ``_shorten_list``.
    """
    # Asking for more rows than the frame holds (without replacement) is an error, so cap n.
    n_rows = min(n, data.height)
    df = data.sample(n=n_rows, seed=seed).select(["user_id", "item_id"])
    if is_val:
        df = df.rename({"item_id": "true_items"})

    # One list column per model, looked up by user_id.
    for name, preds in models.items():

        def recs_for(user, preds=preds):
            return _shorten_list(preds.get(user, []))

        df = df.with_columns(
            pl.col("user_id")
            .map_elements(recs_for, return_dtype=pl.List(pl.Int64))
            .alias(name)
        )

    if verbose:
        print(df.shape)
        print(df)

    return df


def val_predictions(models: dict, val: pl.DataFrame, validator: Validator, k: int = 10, verbose: bool = True):
    """Build a one-row-per-model table of top-``k`` metrics.

    models: dict model name -> predictions (dict user_id -> list of item_ids)
    val: frame with ``user_id`` and ``item_id`` columns used as ground truth
    validator: provides the per-user metric functions and ``coverage``
    k: cut-off passed to every metric and used in the column labels
    verbose: print the resulting frame

    Metrics are averaged over the users present in ``val``; a user missing
    from a model's predictions is scored against an empty recommendation
    list. Returns a polars DataFrame with the columns ``model``,
    ``Recall@k``, ``Precision@k``, ``HitRate@k``, ``NDCG@k``, ``MRR@k`` and
    ``Coverage`` (with the actual value of ``k`` in the names).
    """
    grouped = val.group_by("user_id").agg(pl.col("item_id")).to_dict(as_series=False)
    user2items = dict(zip(grouped["user_id"], grouped["item_id"]))

    def mean_or_zero(values):
        # np.mean([]) yields NaN plus a RuntimeWarning when `val` has no users.
        return np.mean(values) if values else 0.0

    results = []
    for model_name, preds in models.items():
        recalls, precisions, hits, ndcgs, mrrs = [], [], [], [], []
        for user, y_true in user2items.items():
            y_pred = preds.get(user, [])
            recalls.append(validator.recall_at_k(y_true, y_pred, k))
            precisions.append(validator.precision_at_k(y_true, y_pred, k))
            hits.append(validator.hitrate_at_k(y_true, y_pred, k))
            ndcgs.append(validator.ndcg_at_k(y_true, y_pred, k))
            mrrs.append(validator.mrr_at_k(y_true, y_pred, k))
        # Label columns with the real cut-off; they used to read "@10" for any k.
        results.append({
            "model": model_name,
            f"Recall@{k}": mean_or_zero(recalls),
            f"Precision@{k}": mean_or_zero(precisions),
            f"HitRate@{k}": mean_or_zero(hits),
            f"NDCG@{k}": mean_or_zero(ndcgs),
            f"MRR@{k}": mean_or_zero(mrrs),
            "Coverage": validator.coverage(preds),
        })

    df = pl.DataFrame(results)
    if verbose:
        print(df)
    return df


### ALS и LightFM 

#### Определение базового класса для методов

In [7]:
class Recommender(ABC):
    """Interface shared by the recommendation models in this notebook."""

    @abstractmethod
    def fit(self, train_data: pl.DataFrame):
        """Train the model on the interactions in ``train_data``."""

    @abstractmethod
    def predict(self, users: List[int], k: int = 10) -> Dict[int, List[int]]:
        """Return a dict user_id -> list of recommended item_ids for ``users``, with ``k`` as the list-length cut-off."""

#### ALS

In [8]:
class ALSRecommender(Recommender):
    """ALS recommender (``implicit``) with a most-popular fallback for unknown users.

    factors, iterations, regularization, num_threads, random_state are passed
    straight to ``AlternatingLeastSquares``; ``random_state=None`` keeps the
    library's default (unseeded) initialisation.
    """

    def __init__(self, factors=32, iterations=10, regularization=0.01, num_threads=4, random_state=None):
        self.factors = factors
        self.iterations = iterations
        self.regularization = regularization
        self.num_threads = num_threads
        self.random_state = random_state
        self.model = None
        self.user_map = None       # user_id -> row index of the interaction matrix
        self.item_map = None       # item_id -> column index of the interaction matrix
        self.items = None          # column index -> item_id (sorted item ids)
        self.users = None          # row index -> user_id (sorted user ids)
        self.popular = None        # ten most frequent train items (kept for compatibility)
        self._popular_ranked = None  # every train item, most frequent first
        self.user_items = None     # user x item CSR matrix the model was fitted on

    def fit(self, train_data: pl.DataFrame):
        """Fit ALS on the ``user_id`` / ``item_id`` pairs of ``train_data``.

        Every row contributes a weight of 1; repeated pairs are summed by the
        sparse-matrix constructor.
        """
        train_pd = train_data.to_pandas()

        # Dense index <-> id mappings, in sorted id order.
        self.users = sorted(train_pd['user_id'].unique())
        self.items = sorted(train_pd['item_id'].unique())
        self.user_map = {u: i for i, u in enumerate(self.users)}
        self.item_map = {it: j for j, it in enumerate(self.items)}

        # Popularity ranking for the fallback. The item_id tie-breaker makes the
        # order independent of the (unordered) group_by output.
        item_counts = (
            train_data.group_by("item_id")
            .agg(pl.len().alias("count"))
            .sort(["count", "item_id"], descending=[True, False])
        )
        self._popular_ranked = item_counts["item_id"].to_list()
        self.popular = self._popular_ranked[:10]

        # Sparse interaction matrix: rows = users, cols = items.
        rows = train_pd['user_id'].map(self.user_map).to_numpy()
        cols = train_pd['item_id'].map(self.item_map).to_numpy()
        data = np.ones(len(train_pd))
        user_items = csr_matrix((data, (rows, cols)), shape=(len(self.users), len(self.items)))

        self.model = AlternatingLeastSquares(
            factors=self.factors,
            iterations=self.iterations,
            regularization=self.regularization,
            num_threads=self.num_threads,
            random_state=self.random_state,
        )
        self.model.fit(user_items)
        self.user_items = user_items

    def predict(self, users: List[int], k: int = 10) -> Dict[int, List[int]]:
        """Recommend ``k`` items per user, excluding items the user has in train.

        Users unseen during ``fit`` receive the ``k`` most popular train items.
        Each user gets an independent list object.
        """
        # Honour k in the fallback as well (it used to be a fixed top-10 list).
        fallback = self._popular_ranked[:k]
        predictions = {}
        for user in tqdm(users, desc="ALS predictions"):
            userid = self.user_map.get(user)
            if userid is None:
                predictions[user] = list(fallback)
                continue
            try:
                recs, _ = self.model.recommend(
                    userid,
                    self.user_items[userid],
                    N=k,
                    filter_already_liked_items=True
                )
                predictions[user] = [self.items[r] for r in recs]
            except IndexError:
                # Best-effort: keep the run going with popular items if an index is out of range.
                predictions[user] = list(fallback)
        return predictions

#### LightFM

In [9]:
class LightFMRecommender(Recommender):
    """LightFM recommender (pure collaborative, no side features) with a most-popular fallback.

    no_components, loss, learning_rate, random_state are passed to ``LightFM``;
    ``epochs`` and ``num_threads`` are passed to ``LightFM.fit``, and
    ``num_threads`` also to ``LightFM.predict``.
    """

    def __init__(self, no_components=16, loss='warp', learning_rate=0.05, num_threads=1,
                 epochs=5, random_state=None):
        self.no_components = no_components
        self.loss = loss
        self.learning_rate = learning_rate
        self.num_threads = num_threads
        self.epochs = epochs
        self.random_state = random_state
        self.model = None
        self.dataset = None
        self.user_map = None       # user_id -> internal index (sorted user ids)
        self.item_map = None       # item_id -> internal index (sorted item ids)
        self.items = None          # internal index -> item_id
        self.popular = None        # ten most frequent train items (kept for compatibility)
        self._popular_ranked = None  # every train item, most frequent first

    def fit(self, train_data: pl.DataFrame):
        """Fit LightFM on the ``user_id`` / ``item_id`` pairs of ``train_data``."""
        train_pd = train_data.to_pandas()

        users = sorted(train_pd['user_id'].unique())
        self.items = sorted(train_pd['item_id'].unique())
        self.user_map = {u: i for i, u in enumerate(users)}
        self.item_map = {it: j for j, it in enumerate(self.items)}

        # Popularity ranking for the fallback. The item_id tie-breaker makes the
        # order independent of the (unordered) group_by output.
        item_counts = (
            train_data.group_by("item_id")
            .agg(pl.len().alias("count"))
            .sort(["count", "item_id"], descending=[True, False])
        )
        self._popular_ranked = item_counts["item_id"].to_list()
        self.popular = self._popular_ranked[:10]

        # NOTE(review): predict() indexes the model with user_map / self.items, which
        # presumes LightFMDataset numbers ids in the order they are passed here — confirm.
        self.dataset = LightFMDataset()
        self.dataset.fit(users=users, items=self.items)

        # Accumulate the interaction matrix chunk by chunk.
        interactions = None
        batch_size = 100000
        for start in tqdm(range(0, len(train_pd), batch_size), desc="Building interactions"):
            batch = train_pd[start:start + batch_size][['user_id', 'item_id']]
            batch_interactions = self.dataset.build_interactions(zip(batch['user_id'], batch['item_id']))[0]
            if interactions is None:
                interactions = batch_interactions
            else:
                interactions = interactions + batch_interactions

        self.model = LightFM(
            no_components=self.no_components,
            loss=self.loss,
            learning_rate=self.learning_rate,
            learning_schedule='adagrad',
            random_state=self.random_state,
        )
        self.model.fit(interactions, epochs=self.epochs, num_threads=self.num_threads)

    def predict(self, users: List[int], k: int = 10) -> Dict[int, List[int]]:
        """Return the ``k`` highest-scoring train items per user.

        Items the user already has in train are not filtered out. Users unseen
        during ``fit`` receive the ``k`` most popular train items, each as an
        independent list object.
        """
        predictions = {}
        n_items = len(self.items)
        item_ids_mapped = np.arange(n_items)

        # Warm users: score every item, in chunks of users to bound memory.
        warm_users = [u for u in users if u in self.user_map]
        userids_mapped = np.array([self.user_map[u] for u in warm_users])

        if len(userids_mapped) > 0:
            batch_size = 1000
            for start in tqdm(range(0, len(userids_mapped), batch_size), desc="LightFM predictions"):
                batch_users = userids_mapped[start:start + batch_size]
                batch_warm_users = warm_users[start:start + batch_size]
                user_ids_arr = np.repeat(batch_users, n_items)
                item_ids_arr = np.tile(item_ids_mapped, len(batch_users))
                # Score with the configured thread count; this call previously
                # ignored self.num_threads.
                scores_flat = self.model.predict(
                    user_ids_arr, item_ids_arr, num_threads=self.num_threads
                )
                scores = scores_flat.reshape(len(batch_users), n_items)
                top_items = np.argsort(-scores, axis=1)[:, :k]
                for idx, u in enumerate(batch_warm_users):
                    predictions[u] = [self.items[r] for r in top_items[idx]]

        # Cold users: popularity fallback that honours k (it used to be a fixed top-10 list).
        fallback = self._popular_ranked[:k]
        for user in users:
            if user not in predictions:
                predictions[user] = list(fallback)

        return predictions

### Pipeline

In [10]:
# Fixed seed so the 10% training subsample, and therefore the reported metrics,
# can be reproduced on a fresh kernel.
SEED = 42

print(f"Train users: {train['user_id'].n_unique()}, items: {len(train_items)}")
print(f"Test users: {test['user_id'].n_unique()}, items: {len(test_items)}")
print(f"Cold items: {len(cold_items)}")

# Users to recommend for; computed once (it used to be built twice under two names).
test_users = test["user_id"].unique().to_list()
user_ids = test_users  # alias kept in case other cells use the old name
train_sample = train.sample(fraction=0.1, seed=SEED)

als_recommender = ALSRecommender(factors=32, iterations=10, regularization=0.01, num_threads=4)
lightfm_recommender = LightFMRecommender(no_components=32, loss='warp', learning_rate=0.05, num_threads=4)
als_recommender.fit(train_sample)
lightfm_recommender.fit(train_sample)

pred_als = als_recommender.predict(test_users)
pred_lightfm = lightfm_recommender.predict(test_users)

models = {
    "ALS": pred_als,
    "LightFM": pred_lightfm,
}


# Rows are sampled from train, so users absent from test show empty recommendation lists.
print("\nРекомендации моделей:")
train_df = show_predictions(models, train, n=5, verbose=True)

Train users: 349719, items: 31300
Test users: 185828, items: 27367
Cold items: 1775


100%|██████████| 10/10 [00:04<00:00,  2.06it/s]
Building interactions: 100%|██████████| 12/12 [00:01<00:00,  9.82it/s]
ALS predictions: 100%|██████████| 185828/185828 [00:13<00:00, 14203.53it/s]
LightFM predictions: 100%|██████████| 136/136 [11:09<00:00,  4.92s/it]


Рекомендации моделей:
(5, 4)
shape: (5, 4)
┌─────────────────────────────────┬─────────┬────────────────────────┬─────────────────────────┐
│ user_id                         ┆ item_id ┆ ALS                    ┆ LightFM                 │
│ ---                             ┆ ---     ┆ ---                    ┆ ---                     │
│ str                             ┆ i64     ┆ list[i64]              ┆ list[i64]               │
╞═════════════════════════════════╪═════════╪════════════════════════╪═════════════════════════╡
│ a4e96406524159b6861d75b2e96bcc… ┆ 14656   ┆ [1375, 2672, … 25039]  ┆ [15514, 18150, … 28386] │
│ 34d07ca7cb4b1f164e160affaac11b… ┆ 4126    ┆ [27524, 9722, … 1274]  ┆ [18150, 15514, … 9147]  │
│ 55e247ca12f56ebab6ad9fa125e9f6… ┆ 3513    ┆ []                     ┆ []                      │
│ c3ce4d3f53fdc6f4448748f34142d3… ┆ 7851    ┆ [11592, 4917, … 29055] ┆ [18150, 33851, … 24417] │
│ 73ac133c8a3d1534afb28e66579b3f… ┆ 18150   ┆ []                     ┆ []          




In [11]:
# Score every model against the test split and print one row of metrics per model.
print("\nСтатистика по метрикам для каждой модели:")
validator = JointValidator(train=train, test=test, cold_items=cold_items)
metrics_df = val_predictions(
    models=models,
    val=test,
    validator=validator,
    k=10,
    verbose=True,
)


Статистика по метрикам для каждой модели:
shape: (2, 7)
┌─────────┬───────────┬──────────────┬────────────┬──────────┬──────────┬──────────┐
│ model   ┆ Recall@10 ┆ Precision@10 ┆ HitRate@10 ┆ NDCG@10  ┆ MRR@10   ┆ Coverage │
│ ---     ┆ ---       ┆ ---          ┆ ---        ┆ ---      ┆ ---      ┆ ---      │
│ str     ┆ f64       ┆ f64          ┆ f64        ┆ f64      ┆ f64      ┆ f64      │
╞═════════╪═══════════╪══════════════╪════════════╪══════════╪══════════╪══════════╡
│ ALS     ┆ 0.036348  ┆ 0.033381     ┆ 0.217454   ┆ 0.043746 ┆ 0.082427 ┆ 0.005048 │
│ LightFM ┆ 0.03382   ┆ 0.028511     ┆ 0.183325   ┆ 0.036579 ┆ 0.062809 ┆ 0.029936 │
└─────────┴───────────┴──────────────┴────────────┴──────────┴──────────┴──────────┘
