In [None]:
import os
import joblib
import polars as pl
import pandas as pd
import numpy as np
import networkx as nx


from tqdm import tqdm
from collections import Counter
from scipy.sparse import csr_matrix
from surprise import Dataset, Reader, SVD
from sklearn.decomposition import IncrementalPCA
from sklearn.decomposition import TruncatedSVD
from typing import List, Dict


# Optional dependency: assume `implicit` is available and clear the flag if
# importing it fails for any reason (missing package, broken native build, ...).
HAS_IMPLICIT = True
try:
    import implicit
except Exception:
    HAS_IMPLICIT = False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the parquet inputs, relative to the notebook.
data_folder = "../../../data/"

# Read the three input tables in one pass.
train, test_exploded, books = (
    pl.read_parquet(data_folder + file_name)
    for file_name in ("train.pq", "test.pq", "books.pq")
)

# Collapse the exploded test table to one row per user, with that user's
# item ids gathered into a list; first-seen user order is kept.
test = test_exploded.group_by("user_id", maintain_order=True).agg(pl.col("item_id"))

In [3]:
# Convert to pandas and normalise the id columns to strings (works whether the
# source ids are numeric or already textual).
# Only test_pd["user_id"] is cast here; the per-user item lists in
# test_pd["item_id"] keep the element type they had in the parquet file.
train_pd = train.to_pandas().astype({"user_id": str, "item_id": str})
test_pd = test.to_pandas().astype({"user_id": str})
books_pd = books.to_pandas().astype({"item_id": str})

# Distinct ids seen in training, in first-appearance order.
user_ids = train_pd["user_id"].unique()
item_ids = train_pd["item_id"].unique()

# Bidirectional id <-> dense index lookups used to build sparse matrices.
user_to_idx = dict(zip(user_ids, range(len(user_ids))))
idx_to_user = dict(enumerate(user_ids))
item_to_idx = dict(zip(item_ids, range(len(item_ids))))
idx_to_item = dict(enumerate(item_ids))


In [4]:
class PopularityRecommender:
    """Non-personalised baseline that ranks items by interaction count."""

    def __init__(self):
        # item_id -> number of interactions seen so far
        self.counter = Counter()

    def partial_fit(self, df: pd.DataFrame):
        """Add the ``item_id`` occurrences of ``df`` to the running counts.

        Returns ``self`` so calls can be chained.
        """
        seen_items = df["item_id"].tolist()
        self.counter.update(seen_items)
        return self

    def fit(self, df: pd.DataFrame, batch_size: int = 100000):
        """Count interactions over ``df`` in slices of ``batch_size`` rows.

        Counts accumulate on top of whatever ``self.counter`` already holds.
        Returns ``self``.
        """
        n_rows = len(df)
        for lo in range(0, n_rows, batch_size):
            hi = lo + batch_size
            self.partial_fit(df.iloc[lo:hi])
        return self

    def recommend(self, user_id: str, top_k: int = 10):
        """Return the ``top_k`` most frequent item ids.

        ``user_id`` is accepted for interface parity with the other
        recommenders but is not used: every user gets the same list.
        """
        ranked = self.counter.most_common(top_k)
        return [item for item, _count in ranked]


In [None]:
class SVDRecommender:
    """Matrix-factorisation recommender backed by Surprise's ``SVD``.

    Parameters
    ----------
    n_factors : int
        Number of latent factors.
    n_epochs : int
        Number of SGD epochs run by a single ``fit`` call.
    lr_all : float
        Learning rate applied to all parameters.
    reg_all : float
        Regularisation term applied to all parameters.
    """

    def __init__(self, n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.02):
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.lr_all = lr_all
        self.reg_all = reg_all
        # The model is given the real epoch count. Re-fitting a 1-epoch model
        # in a loop does not accumulate training: each fit starts again from a
        # fresh initialisation, so only one epoch would ever be learned.
        # verbose=True makes Surprise report progress per epoch.
        self.model = SVD(
            n_factors=n_factors,
            n_epochs=n_epochs,
            lr_all=lr_all,
            reg_all=reg_all,
            verbose=True,
        )
        self.trainset = None
        self.user_item_map = None

    def fit(self, df: pd.DataFrame):
        """Train on a frame with ``user_id`` and ``item_id`` columns.

        If ``df`` has no ``rating`` column, every interaction is given the
        rating 1.0.
        NOTE(review): with a constant rating there is no negative signal for
        the factorisation to learn from — confirm this is the intended setup.

        Returns ``self``.
        """
        if "rating" in df.columns:
            ratings = df[["user_id", "item_id", "rating"]]
        else:
            # Build only the three needed columns instead of copying all of df.
            ratings = df[["user_id", "item_id"]].assign(rating=1.0)

        reader = Reader(rating_scale=(0, 1))
        data = Dataset.load_from_df(ratings[["user_id", "item_id", "rating"]], reader)
        self.trainset = data.build_full_trainset()
        self.model.fit(self.trainset)

        # user_id -> set of item ids that user interacted with in training.
        self.user_item_map = df.groupby("user_id")["item_id"].apply(set).to_dict()
        return self

    def recommend(self, user_id: str, top_k: int = 10):
        """Return up to ``top_k`` raw item ids with the highest predicted rating.

        Items the user already has in the training set are skipped. An empty
        list is returned when the model is not fitted or the user is unknown
        to the training set (cold start).
        """
        if self.trainset is None:
            return []

        try:
            inner_uid = self.trainset.to_inner_uid(user_id)
        except ValueError:
            return []  # cold-start user

        already_rated = {inner_iid for inner_iid, _ in self.trainset.ur[inner_uid]}

        scored = []
        for inner_iid in self.trainset.all_items():
            if inner_iid in already_rated:
                continue
            raw_iid = self.trainset.to_raw_iid(inner_iid)
            scored.append((raw_iid, self.model.predict(user_id, raw_iid).est))

        # Stable sort: ties keep the training-set item order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return [raw_iid for raw_iid, _ in scored[:top_k]]

In [6]:
if HAS_IMPLICIT:
    from scipy.sparse import csr_matrix
    from implicit.als import AlternatingLeastSquares

    class ALSRecommender:
        """Implicit-feedback ALS recommender (only defined when ``implicit`` imports).

        Relies on the notebook-level ``user_ids`` / ``item_ids`` arrays and the
        ``user_to_idx`` / ``item_to_idx`` / ``idx_to_item`` lookups.
        """

        def __init__(self, factors=50):
            self.factors = factors
            self.model = AlternatingLeastSquares(
                factors=factors,
                iterations=15,
                use_gpu=False,
                calculate_training_loss=True,
                num_threads=4,
            )

        def fit(self, df: pd.DataFrame):
            """Build the user x item interaction matrix from ``df`` and train.

            Every row contributes a weight of 1; ``csr_matrix`` sums repeated
            (user, item) pairs, so duplicates become larger weights.

            Raises ``ValueError`` if ``df`` holds ids missing from the index maps.
            Returns ``self``.
            """
            rows = df["user_id"].map(user_to_idx)
            cols = df["item_id"].map(item_to_idx)
            if rows.isna().any() or cols.isna().any():
                raise ValueError(
                    "df contains user_id/item_id values missing from user_to_idx/item_to_idx"
                )
            data = np.ones(len(df))
            mat = csr_matrix((data, (rows, cols)), shape=(len(user_ids), len(item_ids)))
            # ALS `fit` has no `batch_size` keyword; passing one raises TypeError.
            self.model.fit(mat, show_progress=True)
            return self

        def recommend(self, user_id: str, top_k: int = 10):
            """Return up to ``top_k`` item ids for ``user_id`` ([] for unknown users).

            An empty 1 x n_items history row is handed to the model.
            NOTE(review): this presumably means nothing is filtered out as
            already-liked — confirm against the implicit documentation.
            """
            if user_id not in user_to_idx:
                return []
            no_history = csr_matrix((1, len(item_ids)))
            recs, _ = self.model.recommend(user_to_idx[user_id], no_history, N=top_k)
            return [idx_to_item[i] for i in recs]

        def save(self, path: str):
            """Serialise the fitted model to ``path`` with joblib."""
            joblib.dump(self.model, path)

        def load(self, path: str):
            """Replace the model with the one stored at ``path``.

            joblib files are pickles: only load files from a trusted source.
            """
            self.model = joblib.load(path)

In [7]:
class GraphRecommender:
    """Personalised-PageRank recommender on the bipartite user-item graph.

    Users are stored as ``"u_<user_id>"`` nodes and items as ``"i_<item_id>"``
    nodes; each interaction is an undirected edge between the two.
    """

    def __init__(self):
        self.G = nx.Graph()

    def fit(self, df: pd.DataFrame, batch_size: int = 100000):
        """Add one edge per (user_id, item_id) row of ``df``, in batches.

        Edges are added on top of whatever the graph already contains.
        Returns ``self``.
        """
        for start in range(0, len(df), batch_size):
            chunk = df.iloc[start:start + batch_size]
            # Bulk insert avoids building a namedtuple per row.
            self.G.add_edges_from(
                (f"u_{user}", f"i_{item}")
                for user, item in zip(chunk["user_id"], chunk["item_id"])
            )
        return self

    def recommend(self, user_id: str, top_k: int = 10):
        """Return the ``top_k`` item ids with the highest PageRank from ``user_id``.

        Returns [] when the user has no node in the graph. Item ids are
        returned as ``int``, so a non-numeric item id raises ``ValueError``.
        Items the user already interacted with are not excluded.

        Note: a full PageRank over the whole graph is computed on every call.
        """
        start_node = f"u_{user_id}"
        if start_node not in self.G:
            return []
        pr = nx.pagerank(self.G, alpha=0.85, personalization={start_node: 1})
        item_nodes = [node for node in pr if node.startswith("i_")]
        item_nodes.sort(key=pr.get, reverse=True)
        return [int(node[2:]) for node in item_nodes[:top_k]]

    def save(self, path: str):
        """Pickle the graph to ``path``.

        ``nx.write_gpickle`` was removed in NetworkX 3.0; plain ``pickle`` is
        the documented replacement.
        """
        import pickle

        with open(path, "wb") as handle:
            pickle.dump(self.G, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def load(self, path: str):
        """Load a pickled graph from ``path`` (replacement for ``nx.read_gpickle``).

        Unpickling runs arbitrary code: only load files from a trusted source.
        """
        import pickle

        with open(path, "rb") as handle:
            self.G = pickle.load(handle)

In [8]:
def recommend_cold(item_id: "int | str", top_k: int = 10, books: "pd.DataFrame | None" = None):
    """Content-based fallback: other catalogue items sharing ``item_id``'s genre.

    Parameters
    ----------
    item_id
        Query item. Compared as a string, so ``123`` and ``"123"`` both match
        a catalogue whose ``item_id`` column was cast to ``str``.
    top_k
        Maximum number of ids to return.
    books
        Catalogue with ``item_id`` and ``genre`` columns. Defaults to the
        notebook-level ``books_pd``.

    Returns
    -------
    list
        Up to ``top_k`` item ids (in catalogue order, query item excluded),
        or [] when ``item_id`` is not in the catalogue.
    """
    catalog = books_pd if books is None else books
    query = str(item_id)
    catalog_ids = catalog["item_id"].astype(str)

    # Single lookup serves both the membership check and the genre fetch.
    query_genres = catalog.loc[catalog_ids == query, "genre"]
    if query_genres.empty:
        return []
    genre = query_genres.iloc[0]

    # Never recommend the query item to itself.
    same_genre = (catalog["genre"] == genre) & (catalog_ids != query)
    return catalog.loc[same_genre, "item_id"].tolist()[:top_k]

In [9]:
def precision_at_k(pred: List[int], true: List[int], k: int = 10) -> float:
    """Fraction of the ``k`` slots filled by a relevant item.

    Distinct items among the first ``k`` predictions that also occur in
    ``true`` are counted, then divided by ``k`` (not by the number of
    predictions actually returned).
    """
    hits = set(true).intersection(pred[:k])
    return len(hits) / k

def recall_at_k(pred: List[int], true: List[int], k: int = 10) -> float:
    """Share of the relevant items recovered within the first ``k`` predictions.

    The denominator is ``len(true)``, floored at 1 so an empty ground-truth
    list yields 0.0 instead of a division error.
    """
    hits = set(true).intersection(pred[:k])
    denominator = len(true) if len(true) > 1 else 1
    return len(hits) / denominator

In [10]:
from tqdm import tqdm

def evaluate_models(train_df, test_df, models, top_k=10, save_dir="models"):
    """Fit every model on ``train_df`` and score it on ``test_df``.

    Parameters
    ----------
    train_df : pd.DataFrame
        Passed unchanged to each model's ``fit``.
    test_df : pd.DataFrame
        One row per user, with ``user_id`` and ``item_id`` columns;
        ``item_id`` holds that user's ground-truth items (list, tuple,
        ndarray or a single scalar).
    models : dict
        ``name -> model``; each model exposes ``fit(df)`` and
        ``recommend(user_id, top_k=...)``.
    top_k : int
        Cut-off used for the recommendations and both metrics.
    save_dir : str
        Directory created if missing. Nothing is written into it here.

    Returns
    -------
    pd.DataFrame
        One row per model with ``model``, ``precision@k`` and ``recall@k``
        (NaN when ``test_df`` is empty).
    """
    os.makedirs(save_dir, exist_ok=True)
    results = []

    def ensure_list(x):
        if isinstance(x, (list, tuple)):
            return list(x)
        if isinstance(x, np.ndarray):
            return x.tolist()
        return [x]

    def mean_or_nan(values):
        # np.mean([]) emits a RuntimeWarning; make the empty case explicit.
        return float(np.mean(values)) if values else float("nan")

    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(train_df)

        precisions, recalls = [], []
        for row in tqdm(test_df.itertuples(), total=len(test_df), desc=f"Evaluating {name}"):
            uid = row.user_id
            # Compare ids as strings on both sides: the recommenders do not all
            # return the same id type (some str, some int), and the
            # ground-truth lists keep whatever dtype the test data had, so a
            # raw set intersection can silently miss every hit.
            true_items = [str(item) for item in ensure_list(row.item_id)]
            preds = [str(item) for item in model.recommend(uid, top_k=top_k)]
            precisions.append(precision_at_k(preds, true_items, top_k))
            recalls.append(recall_at_k(preds, true_items, top_k))

        results.append({
            "model": name,
            "precision@k": mean_or_nan(precisions),
            "recall@k": mean_or_nan(recalls),
        })

    return pd.DataFrame(results)



In [11]:
# Registry of the recommenders to compare, keyed by the name used in the report.
models = {
    "popularity": PopularityRecommender(),
    # SVDRecommender's parameter is `n_factors`; `n_components` raises TypeError.
    "svd": SVDRecommender(n_factors=50),
    "graph": GraphRecommender(),
}
# ALS is only available when the optional `implicit` package imported cleanly.
if HAS_IMPLICIT:
    models["als"] = ALSRecommender(factors=50)

results = evaluate_models(train_pd, test_pd, models, top_k=10, save_dir="models")
print(results)

  check_blas_config()


Training popularity...


Evaluating popularity: 100%|██████████| 185828/185828 [03:27<00:00, 895.05it/s]


Training svd...


KeyboardInterrupt: 