In [None]:
import os
import gc
import joblib
import polars as pl
import numpy as np

from tqdm import tqdm
from collections import defaultdict
from scipy.sparse import csr_matrix
from lightfm import LightFM
from lightfm.data import Dataset
from catboost import CatBoostRanker
from lightgbm import LGBMRanker

## Recommender base class

In [None]:
class Recommender:
    """Base class shared by the recommenders in this notebook.

    Loads the book catalogue and the train/test interaction tables, builds
    contiguous integer indices for users and items, and offers helpers for
    persisting models/predictions and for scoring prediction files.
    Callers in this notebook invoke ``train()`` and then ``predict()`` on
    subclasses.
    """

    def __init__(self, data_path="data/"):
        """Read the parquet inputs and build the id -> index maps.

        Args:
            data_path: Directory holding ``books.pq``, ``train.pq`` and
                ``test.pq``. A trailing separator is optional.
        """
        self.data_path = data_path
        self.books = pl.read_parquet(os.path.join(data_path, "books.pq"))
        self.train_interactions = pl.read_parquet(os.path.join(data_path, "train.pq"))
        self.test_interactions = pl.read_parquet(os.path.join(data_path, "test.pq"))

        # maintain_order=True keeps first-appearance order, so the index assigned
        # to each id is reproducible across runs (plain unique() gives no order
        # guarantee, which would make saved models impossible to re-index).
        train_users = self.train_interactions["user_id"].unique(maintain_order=True)
        catalogue_items = self.books["item_id"].unique(maintain_order=True)
        self.user_map = {uid: idx for idx, uid in enumerate(train_users)}
        self.item_map = {iid: idx for idx, iid in enumerate(catalogue_items)}
        self.n_users = len(self.user_map)
        self.n_items = len(self.item_map)
        self.models = {}

    def _clear_memory(self):
        """Drop the loaded tables and then run the garbage collector."""
        # References must be released *before* collecting, otherwise the
        # collector has nothing to reclaim.
        self.books = None
        self.train_interactions = None
        self.test_interactions = None
        gc.collect()

    def _prepare_features(self, min_tag_count=100):
        """Add a ``filtered_tags`` column that keeps only frequent tags.

        Args:
            min_tag_count: A tag is kept when it occurs strictly more than this
                many times across the catalogue.

        Returns:
            The updated ``self.books`` frame (also stored on the instance).
        """
        # Exploding an empty tag list yields a null row; drop those so that
        # "null" never becomes a tag of its own.
        tag_counts = self.books["tags"].explode().drop_nulls().value_counts()
        top_tags = tag_counts.filter(pl.col("count") > min_tag_count)["tags"].to_list()
        self.books = self.books.with_columns([
            pl.col("tags").list.set_intersection(top_tags).alias("filtered_tags")
        ])
        return self.books

    def save_model(self, model_name, model):
        """Serialise ``model`` with joblib to ``models/<model_name>.joblib``."""
        os.makedirs("models", exist_ok=True)
        joblib.dump(model, f"models/{model_name}.joblib")

    def save_predictions(self, predictions, file_name):
        """Write the user/item/score columns to ``predictions/<file_name>.pq``."""
        os.makedirs("predictions", exist_ok=True)
        frame = pl.DataFrame({
            "user_id": predictions["user_id"],
            "item_id": predictions["item_id"],
            "score": predictions["score"],
        })
        frame.write_parquet(f"predictions/{file_name}.pq")

    @staticmethod
    def evaluate_predictions(test_data, pred_data, k=10):
        """Score a prediction frame against the test interactions.

        Args:
            test_data: Frame with ``user_id`` and ``item_id`` columns; every
                row is treated as a relevant (user, item) pair.
            pred_data: Frame with ``user_id``, ``item_id`` and ``score``.
            k: Ranking cutoff. It is reflected in the metric names.

        Returns:
            Dict with ``Precision@k``, ``Recall@k``, ``HitRate@k``, ``NDCG@k``,
            ``MRR@k`` and ``Coverage``, each averaged over the test users.
        """
        # Ground truth: items each user interacted with in the test set.
        test_dict = defaultdict(set)
        for user_id, item_id in zip(test_data["user_id"].to_list(), test_data["item_id"].to_list()):
            test_dict[user_id].add(item_id)

        # Highest score first within each user, then keep the first k rows per user.
        ranked = pred_data.sort(["user_id", "score"], descending=[False, True])
        top_k = ranked.group_by("user_id", maintain_order=True).head(k)
        pred_dict = defaultdict(list)
        for user_id, item_id in zip(top_k["user_id"].to_list(), top_k["item_id"].to_list()):
            pred_dict[user_id].append(item_id)

        relevant_items = set(test_data["item_id"].unique().to_list())
        return Recommender._compute_metrics(test_dict, pred_dict, relevant_items, k)

    @staticmethod
    def _compute_metrics(test_dict, pred_dict, relevant_items, k=10):
        """Compute the ranking metrics from plain Python containers.

        Args:
            test_dict: Mapping user -> set of relevant items.
            pred_dict: Mapping user -> ranked list of predicted items.
            relevant_items: Set of all items that occur in the test data.
            k: Ranking cutoff; longer prediction lists are truncated to k.

        Returns:
            Dict of metric name -> value. All values are 0.0 when there are no
            test users.
        """
        n_users = len(test_dict)
        precision = recall = hit_rate = ndcg = mrr = coverage = 0.0

        for user_id, true_items in test_dict.items():
            pred_items = pred_dict.get(user_id, [])[:k]
            pred_set = set(pred_items)
            hits = len(pred_set & true_items)

            # Precision is taken over the items actually returned (at most k).
            if pred_items:
                precision += hits / len(pred_items)
            if true_items:
                recall += hits / len(true_items)
            if hits > 0:
                hit_rate += 1

            if pred_items:
                idcg = sum(1.0 / np.log2(rank + 2) for rank in range(min(len(true_items), k)))
                dcg = sum(
                    1.0 / np.log2(rank + 2)
                    for rank, item in enumerate(pred_items)
                    if item in true_items
                )
                if idcg > 0:
                    ndcg += dcg / idcg
                # Reciprocal rank of the first relevant item, if any.
                for position, item in enumerate(pred_items, 1):
                    if item in true_items:
                        mrr += 1.0 / position
                        break

            # Per-user share of the test-set items that appear in this user's
            # list; the reported "Coverage" is the mean of these shares.
            if relevant_items:
                coverage += len(pred_set & relevant_items) / len(relevant_items)

        # Guard against an empty test set instead of dividing by zero.
        denom = n_users if n_users else 1
        return {
            f"Precision@{k}": float(precision / denom),
            f"Recall@{k}": float(recall / denom),
            f"HitRate@{k}": float(hit_rate / denom),
            f"NDCG@{k}": float(ndcg / denom),
            f"MRR@{k}": float(mrr / denom),
            "Coverage": float(coverage / denom),
        }

## ALS

In [None]:
class ALSRecommender(Recommender):
    """Matrix-factorisation recommender built on ``implicit``'s ALS."""

    @staticmethod
    def _to_numpy(factors):
        """Return a factor matrix as a NumPy array.

        GPU builds of ``implicit`` expose factors as matrix objects with a
        ``to_numpy()`` method; CPU builds already hold NumPy arrays.
        """
        return factors.to_numpy() if hasattr(factors, "to_numpy") else np.asarray(factors)

    def train(self, factors=50, iterations=15):
        """Fit ALS on the user x item rating matrix and store/save the model.

        Args:
            factors: Number of latent factors.
            iterations: Number of ALS sweeps.

        Returns:
            The fitted ``AlternatingLeastSquares`` model.
        """
        # `implicit` is imported lazily, where it is used.
        from implicit.als import AlternatingLeastSquares

        # item_map is built from the catalogue, so keep only interactions whose
        # item is present there (semi join: filters without duplicating rows).
        catalogue = self.books.select("item_id").unique()
        interactions = self.train_interactions.join(catalogue, on="item_id", how="semi")

        rows = np.array([self.user_map[u] for u in interactions["user_id"].to_list()], dtype=np.int64)
        cols = np.array([self.item_map[i] for i in interactions["item_id"].to_list()], dtype=np.int64)
        # Missing ratings become 0, i.e. no signal for that cell.
        ratings = interactions["rating"].fill_null(0).cast(pl.Float32).to_numpy()
        R = csr_matrix((ratings, (rows, cols)), shape=(self.n_users, self.n_items))

        model = AlternatingLeastSquares(factors=factors, iterations=iterations)
        # fit() already runs all `iterations` sweeps and draws its own progress
        # bar, so it is called exactly once.
        model.fit(R, show_progress=True)

        self.models["ALS"] = model
        self.save_model("ALS", model)
        return model

    def predict(self):
        """Score the test (user, item) pairs with the trained ALS factors.

        Pairs whose user was not seen in training, or whose item is not in the
        catalogue, cannot be scored and are left out of the result.

        Returns:
            Frame with ``user_id``, ``item_id`` and ``score``. The frame is also
            written to ``predictions/ALS_predictions.pq``; afterwards the loaded
            tables are released via ``_clear_memory()``.
        """
        model = self.models["ALS"]
        test = self.test_interactions

        user_idx = np.array([self.user_map.get(u, -1) for u in test["user_id"].to_list()], dtype=np.int64)
        item_idx = np.array([self.item_map.get(i, -1) for i in test["item_id"].to_list()], dtype=np.int64)
        scorable = (user_idx >= 0) & (item_idx >= 0)
        test = test.filter(pl.Series("scorable", scorable))
        user_idx = user_idx[scorable]
        item_idx = item_idx[scorable]

        # The ALS model has no pairwise predict(); the score of a pair is the
        # dot product of its user and item factor rows.
        user_factors = self._to_numpy(model.user_factors)
        item_factors = self._to_numpy(model.item_factors)
        scores = np.einsum("ij,ij->i", user_factors[user_idx], item_factors[item_idx])

        predictions = pl.DataFrame({
            "user_id": test["user_id"],
            "item_id": test["item_id"],
            "score": scores
        })
        self.save_predictions(predictions, "ALS_predictions")
        self._clear_memory()
        return predictions

## EASE

In [None]:
class EASERecommender(Recommender):
    """Placeholder for an EASE recommender.

    No model is fitted yet: ``train()`` registers ``None`` and ``predict()``
    emits uniform random scores, so any metric reported for "EASE" is a random
    baseline rather than a real EASE result.
    TODO: replace with an actual EASE implementation.
    """

    def train(self):
        """Register and save the ``None`` placeholder model.

        Returns:
            None, because there is no fitted model yet.
        """
        # The interaction matrix is intentionally not built here: nothing
        # consumes it until a real EASE solver is added.
        self.models["EASE"] = None
        with tqdm(total=1, desc="EASE Training") as pbar:
            pbar.update(1)  # Placeholder progress
        self.save_model("EASE", None)
        return None

    def predict(self, seed=42):
        """Assign a random score to every test (user, item) pair.

        Args:
            seed: Seed for the random generator, so that repeated runs write
                the same prediction file.

        Returns:
            Frame with ``user_id``, ``item_id`` and ``score``. The frame is also
            written to ``predictions/EASE_predictions.pq``; afterwards the
            loaded tables are released via ``_clear_memory()``.
        """
        test = self.test_interactions
        rng = np.random.default_rng(seed)
        scores = rng.random(len(test["user_id"]))
        predictions = pl.DataFrame({
            "user_id": test["user_id"],
            "item_id": test["item_id"],
            "score": scores
        })
        self.save_predictions(predictions, "EASE_predictions")
        self._clear_memory()
        return predictions

## LightFM

In [None]:
class LightFMRecommender(Recommender):
    """Hybrid LightFM recommender (WARP loss) using book tags as item features."""

    def train(self, no_components=50, epochs=15):
        """Fit LightFM on the training interactions with tag item features.

        Args:
            no_components: Dimensionality of the latent embeddings.
            epochs: Number of training epochs.

        Returns:
            The fitted ``LightFM`` model.
        """
        books = self._prepare_features()

        # (item id, [tags]) pairs; null lists and null tags are dropped.
        item_tags = [
            (iid, [tag for tag in (tags or []) if tag is not None])
            for iid, tags in zip(books["item_id"].to_list(), books["filtered_tags"].to_list())
        ]
        # Distinct tag names in first-seen order.
        tag_vocab = list(dict.fromkeys(tag for _, tags in item_tags for tag in tags))

        dataset = Dataset()
        # Feeding the ids in the iteration order of user_map / item_map makes
        # the dataset's internal indices identical to those maps, which
        # predict() relies on. item_features must be the plain feature names.
        dataset.fit(
            users=list(self.user_map),
            items=list(self.item_map),
            item_features=tag_vocab
        )

        # Only interactions whose item exists in the catalogue can be indexed.
        catalogue = books.select("item_id").unique()
        interactions = self.train_interactions.join(catalogue, on="item_id", how="semi")
        # fit_partial below is called without sample_weight, so only the binary
        # interaction matrix matters; ratings (which may be null) are not passed.
        interactions_mat, _weights = dataset.build_interactions(
            zip(interactions["user_id"].to_list(), interactions["item_id"].to_list())
        )
        item_features = dataset.build_item_features(item_tags)

        model = LightFM(no_components=no_components, loss='warp')
        with tqdm(total=epochs, desc="LightFM Training") as pbar:
            for _ in range(epochs):
                model.fit_partial(interactions_mat, item_features=item_features, epochs=1)
                pbar.update(1)

        # Kept for predict(): a model trained with item features has to be
        # given the same feature matrix when scoring.
        self.item_features = item_features
        self.models["LightFM"] = model
        self.save_model("LightFM", model)
        return model

    def predict(self):
        """Score the test (user, item) pairs with the trained LightFM model.

        Pairs whose user was not seen in training, or whose item is not in the
        catalogue, cannot be scored and are left out of the result.

        Returns:
            Frame with ``user_id``, ``item_id`` and ``score``. The frame is also
            written to ``predictions/LightFM_predictions.pq``; afterwards the
            loaded tables are released via ``_clear_memory()``.
        """
        model = self.models["LightFM"]
        test = self.test_interactions

        # LightFM expects int32 NumPy arrays of equal length for pairwise scoring.
        user_idx = np.array([self.user_map.get(u, -1) for u in test["user_id"].to_list()], dtype=np.int32)
        item_idx = np.array([self.item_map.get(i, -1) for i in test["item_id"].to_list()], dtype=np.int32)
        scorable = (user_idx >= 0) & (item_idx >= 0)
        test = test.filter(pl.Series("scorable", scorable))
        user_idx = user_idx[scorable]
        item_idx = item_idx[scorable]

        if len(user_idx):
            scores = model.predict(user_idx, item_idx, item_features=self.item_features)
        else:
            scores = np.empty(0, dtype=np.float32)

        predictions = pl.DataFrame({
            "user_id": test["user_id"],
            "item_id": test["item_id"],
            "score": scores
        })
        self.save_predictions(predictions, "LightFM_predictions")
        self._clear_memory()
        return predictions

## CatBoost

In [None]:
class CatBoostLTRRecommender(Recommender):
    """Learning-to-rank recommender: CatBoost YetiRank with one group per user."""

    # Feature columns handed to CatBoost, and the subset treated as categorical.
    _FEATURES = ["item_cat", "tags_key", "is_read"]
    _CAT_FEATURES = ["item_cat", "tags_key"]

    class _ProgressCallback:
        """CatBoost training callback that advances a tqdm bar once per iteration."""

        def __init__(self, pbar):
            self._pbar = pbar

        def after_iteration(self, info):
            self._pbar.update(1)
            return True  # True tells CatBoost to keep training

    def _build_frame(self, interactions):
        """Join interactions with book tags and derive the model features.

        The original ``user_id`` / ``item_id`` columns are left untouched so
        that predictions can be matched against the test data later.
        """
        if "filtered_tags" not in self.books.columns:
            self._prepare_features()
        book_tags = self.books.select(["item_id", "filtered_tags"])
        return interactions.join(book_tags, on="item_id").with_columns([
            # CatBoost cannot consume list-valued cells, so the tag set is
            # collapsed into one canonical (sorted) string.
            pl.col("filtered_tags").cast(pl.List(pl.Utf8)).list.sort().list.join("|")
              .fill_null("").alias("tags_key"),
            # The id is an identifier, not a quantity: expose it as a string category.
            pl.col("item_id").cast(pl.Utf8).fill_null("").alias("item_cat"),
            # assumes is_read is boolean or numeric — TODO confirm
            pl.col("is_read").cast(pl.Float32),
        ])

    def train(self, iterations=100, learning_rate=0.1):
        """Fit a YetiRank model with ``rating`` as the relevance label.

        Args:
            iterations: Number of boosting iterations.
            learning_rate: Boosting learning rate.

        Returns:
            The fitted ``CatBoostRanker``.
        """
        # Rows of one group must be contiguous for CatBoost, hence the sort.
        frame = self._build_frame(self.train_interactions).sort("user_id")
        X = frame.select(self._FEATURES).to_pandas()
        # Missing ratings become 0, matching the other recommenders.
        y = frame["rating"].fill_null(0).to_numpy()
        # group_id is one id per row (not a list of group sizes).
        group_id = frame["user_id"].to_list()

        model = CatBoostRanker(iterations=iterations, learning_rate=learning_rate, loss_function='YetiRank')
        with tqdm(total=iterations, desc="CatBoostLTR Training") as pbar:
            model.fit(
                X, y,
                group_id=group_id,
                cat_features=self._CAT_FEATURES,
                verbose=False,
                callbacks=[self._ProgressCallback(pbar)],
            )
        self.models["CatBoostLTR"] = model
        self.save_model("CatBoostLTR", model)
        return model

    def predict(self):
        """Score the test interactions with the trained ranker.

        Test rows whose item is missing from the catalogue are dropped by the
        inner join with the book features.

        Returns:
            Frame with ``user_id``, ``item_id`` and ``score``. The frame is also
            written to ``predictions/CatBoostLTR_predictions.pq``; afterwards
            the loaded tables are released via ``_clear_memory()``.
        """
        model = self.models["CatBoostLTR"]
        test_data = self._build_frame(self.test_interactions)
        X_test = test_data.select(self._FEATURES).to_pandas()
        scores = model.predict(X_test)
        predictions = pl.DataFrame({
            "user_id": test_data["user_id"],
            "item_id": test_data["item_id"],
            "score": scores
        })
        self.save_predictions(predictions, "CatBoostLTR_predictions")
        self._clear_memory()
        return predictions

## Train Initial

In [None]:
def evaluate_all_models(data_path="data/", pred_path="predictions/", models=None, k=10):
    """Evaluate the saved prediction files of several models on the test set.

    Args:
        data_path: Directory containing ``test.pq``.
        pred_path: Directory containing ``<model>_predictions.pq`` files.
        models: Names of the models to evaluate. Defaults to the four models
            trained in this notebook.
        k: Ranking cutoff forwarded to ``Recommender.evaluate_predictions``.

    Returns:
        A polars DataFrame with one row per model: the model name followed by
        its metrics.
    """
    if models is None:
        models = ["ALS", "EASE", "LightFM", "CatBoostLTR"]

    # os.path.join works whether or not the directory ends with a separator.
    test_data = pl.read_parquet(os.path.join(data_path, "test.pq"))

    results = []
    for model in models:
        pred_data = pl.read_parquet(os.path.join(pred_path, f"{model}_predictions.pq"))
        metrics = Recommender.evaluate_predictions(test_data, pred_data, k=k)
        results.append({"model": model, **metrics})

    return pl.DataFrame(results)

In [None]:
# Train every recommender in turn and write its test-set predictions to disk.
# A fresh instance is created per model: each predict() ends by calling
# _clear_memory(), which drops the tables loaded in __init__.
recommender_types = [ALSRecommender, EASERecommender, LightFMRecommender, CatBoostLTRRecommender]
for recommender_class in recommender_types:
    recommender = recommender_class()
    print(f"Training {recommender_class.__name__}")
    recommender.train()
    print(f"Predicting with {recommender_class.__name__}")
    recommender.predict()

In [None]:
# Score the prediction files written by the training cell against the test set
# and show one row of metrics per model.
print('Evaluating Models:')
metrix_df = evaluate_all_models()
print(metrix_df)