In [1]:
import os
import joblib
import polars as pl
import pandas as pd
import numpy as np
import networkx as nx


from collections import Counter
from sklearn.decomposition import TruncatedSVD
from typing import List, Dict


try:
    import implicit
    HAS_IMPLICIT = True
except Exception:
    HAS_IMPLICIT = False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_folder = "../../../data/"
train = pl.read_parquet(data_folder + "train.pq")
test_exploded = pl.read_parquet(data_folder + "test.pq")
test = test_exploded.group_by("user_id", maintain_order=True).agg(pl.col("item_id"))
books = pl.read_parquet(data_folder + "books.pq")

In [4]:
train_pd = train.to_pandas()
test_pd = test.to_pandas()
books_pd = books.to_pandas()

# Приведение типов
# Преобразуем user_id и item_id в строку (если вдруг числа — всё равно ок)
train_pd["user_id"] = train_pd["user_id"].astype(str)
train_pd["item_id"] = train_pd["item_id"].astype(str)
test_pd["user_id"] = test_pd["user_id"].astype(str)
books_pd["item_id"] = books_pd["item_id"].astype(str)

# Собираем уникальные id
user_ids = train_pd["user_id"].unique()
item_ids = train_pd["item_id"].unique()

# Маппинги
user_to_idx = {u: i for i, u in enumerate(user_ids)}
idx_to_user = {i: u for u, i in user_to_idx.items()}
item_to_idx = {i: j for j, i in enumerate(item_ids)}
idx_to_item = {j: i for i, j in item_to_idx.items()}


In [5]:
class PopularityRecommender:
    def __init__(self):
        self.popularity = None

    def fit(self, df: pd.DataFrame):
        self.popularity = df["item_id"].value_counts().to_dict()
        return self

    def recommend(self, user_id: int, top_k: int = 10):
        return [i for i, _ in Counter(self.popularity).most_common(top_k)]

    def save(self, path: str):
        joblib.dump(self.popularity, path)

    def load(self, path: str):
        self.popularity = joblib.load(path)

In [6]:
class SVDRecommender:
    def __init__(self, n_components=50):
        self.n_components = n_components
        self.svd = None
        self.user_factors = None
        self.item_factors = None

    def fit(self, df: pd.DataFrame):
        matrix = np.zeros((len(user_ids), len(item_ids)))
        for row in df.itertuples():
            matrix[user_to_idx[row.user_id], item_to_idx[row.item_id]] = 1

        self.svd = TruncatedSVD(n_components=self.n_components, random_state=42)
        self.user_factors = self.svd.fit_transform(matrix)
        self.item_factors = self.svd.components_.T
        return self

    def recommend(self, user_id: int, top_k: int = 10):
        if user_id not in user_to_idx:
            return []
        uvec = self.user_factors[user_to_idx[user_id]]
        scores = self.item_factors.dot(uvec)
        top_items = np.argsort(scores)[::-1][:top_k]
        return [item_ids[i] for i in top_items]

    def save(self, path: str):
        joblib.dump(self.svd, path)

    def load(self, path: str):
        self.svd = joblib.load(path)

In [7]:
if HAS_IMPLICIT:
    from scipy.sparse import csr_matrix
    from implicit.als import AlternatingLeastSquares

    class ALSRecommender:
        def __init__(self, factors=50):
            self.factors = factors
            self.model = AlternatingLeastSquares(factors=factors)

        def fit(self, df: pd.DataFrame):
            rows = df["user_id"].map(user_to_idx)
            cols = df["item_id"].map(item_to_idx)
            data = np.ones(len(df))
            mat = csr_matrix((data, (rows, cols)), shape=(len(user_ids), len(item_ids)))
            self.model.fit(mat)
            return self

        def recommend(self, user_id: int, top_k: int = 10):
            if user_id not in user_to_idx:
                return []
            recs, _ = self.model.recommend(user_to_idx[user_id], csr_matrix((1, len(item_ids))), N=top_k)
            return [idx_to_item[i] for i in recs]

        def save(self, path: str):
            joblib.dump(self.model, path)

        def load(self, path: str):
            self.model = joblib.load(path)

In [8]:
class GraphRecommender:
    def __init__(self):
        self.G = nx.Graph()

    def fit(self, df: pd.DataFrame):
        for r in df.itertuples():
            self.G.add_edge(f"u_{r.user_id}", f"i_{r.item_id}")
        return self

    def recommend(self, user_id: int, top_k: int = 10):
        start_node = f"u_{user_id}"
        if start_node not in self.G:
            return []
        pr = nx.pagerank(self.G, alpha=0.85, personalization={start_node: 1})
        recs = [n for n in sorted(pr, key=pr.get, reverse=True) if n.startswith("i_")]
        return [int(n[2:]) for n in recs[:top_k]]

    def save(self, path: str):
        nx.write_gpickle(self.G, path)

    def load(self, path: str):
        self.G = nx.read_gpickle(path)

In [9]:
def recommend_cold(item_id: int, top_k: int = 10):
    if item_id not in books_pd["item_id"].values:
        return []
    genre = books_pd.loc[books_pd["item_id"] == item_id, "genre"].values[0]
    candidates = books_pd[books_pd["genre"] == genre]["item_id"].tolist()
    return candidates[:top_k]

In [10]:
def precision_at_k(pred: List[int], true: List[int], k: int = 10) -> float:
    return len(set(pred[:k]) & set(true)) / k

def recall_at_k(pred: List[int], true: List[int], k: int = 10) -> float:
    return len(set(pred[:k]) & set(true)) / max(1, len(true))

In [14]:
def evaluate_models(train_df: pd.DataFrame, test_df: pd.DataFrame, models: Dict[str, object], top_k: int = 10, save_dir: str = "models"):
    os.makedirs(save_dir, exist_ok=True)
    results = []

    def ensure_list(x):
        if isinstance(x, (list, tuple)):
            return list(x)
        if isinstance(x, np.ndarray):
            return x.tolist()
        return [x]

    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(train_df)
        model_path = os.path.join(save_dir, f"{name}.joblib")
        try:
            model.save(model_path)
        except Exception:
            pass

        precisions, recalls = [], []
        for row in test_df.itertuples():
            uid = row.user_id
            true_items = ensure_list(row.item_id)
            preds = model.recommend(uid, top_k=top_k)
            precisions.append(precision_at_k(preds, true_items, top_k))
            recalls.append(recall_at_k(preds, true_items, top_k))

        results.append({
            "model": name,
            "precision@k": np.mean(precisions),
            "recall@k": np.mean(recalls)
        })

    return pd.DataFrame(results)


In [15]:
models = {
    "popularity": PopularityRecommender(),
    "svd": SVDRecommender(n_components=50),
    "graph": GraphRecommender(),
}
if HAS_IMPLICIT:
    models["als"] = ALSRecommender(factors=50)

results = evaluate_models(train_pd, test_pd, models, top_k=10, save_dir="models")
print(results)

Training popularity...
Training svd...


  Q, _ = normalizer(A @ Q)
  Q, _ = normalizer(A @ Q)
  Q, _ = normalizer(A @ Q)
  Q, _ = normalizer(A.T @ Q)
  Q, _ = normalizer(A.T @ Q)
  Q, _ = normalizer(A.T @ Q)
  Q, _ = qr_normalizer(A @ Q)
  Q, _ = qr_normalizer(A @ Q)
  Q, _ = qr_normalizer(A @ Q)
  B = Q.T @ M
  B = Q.T @ M
  B = Q.T @ M
  U = Q @ Uhat
  U = Q @ Uhat
  U = Q @ Uhat
  ret = a @ b
  ret = a @ b
  ret = a @ b


: 