# Метод ближайших соседей

In [1]:
import gc
import pickle 
import json 
import numpy as np
import pandas as pd
import polars as pl

from tqdm import tqdm
from annoy import AnnoyIndex
from lightfm import LightFM
from abc import ABC, abstractmethod
from typing import Dict, List
from collections import defaultdict, Counter 
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
# Загрузка данных
train = pl.read_parquet("../../data/train.pq")
test = pl.read_parquet("../../data/test.pq")

train_items = set(train["item_id"].unique())
test_items = set(test["item_id"].unique())
cold_items = test_items - train_items

# Уменьшаем выборку до 25%
train_sample = train.sample(fraction=0.10, seed=42)

# Проверка данных до фильтрации
print(f"Initial train_sample size: {train_sample.shape}")
print(f"Initial unique users in train_sample: {train_sample['user_id'].n_unique()}")
print(f"Initial unique items in train_sample: {train_sample['item_id'].n_unique()}")

# Фильтрация пользователей с >= 3 взаимодействиями
user_counts = train_sample.group_by("user_id").agg(pl.col("item_id").count().alias("count"))
active_users = user_counts.filter(pl.col("count") >= 3)["user_id"]
print(f"Users with >= 3 interactions: {len(active_users)}")
train_sample = train_sample.filter(pl.col("user_id").is_in(active_users))

# Фильтрация айтемов с >= 3 взаимодействиями
item_counts = train_sample.group_by("item_id").agg(pl.col("user_id").count().alias("count"))
popular_items = item_counts.filter(pl.col("count") >= 3)["item_id"]
print(f"Items with >= 3 interactions: {len(popular_items)}")
train_sample = train_sample.filter(pl.col("item_id").is_in(popular_items))

# Проверка размеров после фильтрации
print(f"Train sample size: {train_sample.shape}")
print(f"Unique users in train_sample: {train_sample['user_id'].n_unique()}")
print(f"Unique items in train_sample: {train_sample['item_id'].n_unique()}")

# Проверка, что train_sample не пустой
if train_sample.shape[0] == 0:
    print("ERROR: train_sample is empty after filtering. Relaxing constraints...")
    train_sample = train.sample(fraction=0.25, seed=42)  # Без фильтрации
    print(f"Reset train_sample size: {train_sample.shape}")

# Проверка пересечения пользователей
train_users = set(train_sample["user_id"].unique())
test_users = set(test["user_id"].unique())
common_users = train_users.intersection(test_users)
print(f"Common users between train_sample and test: {len(common_users)}")
print(f"Percentage of test users in train_sample: {len(common_users) / len(test_users) * 100:.2f}%")

Initial train_sample size: (1197143, 5)
Initial unique users in train_sample: 218650
Initial unique items in train_sample: 24094
Users with >= 3 interactions: 107589
Items with >= 3 interactions: 16386
Train sample size: (1038880, 5)
Unique users in train_sample: 107586
Unique items in train_sample: 16386


Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  train_sample = train_sample.filter(pl.col("user_id").is_in(active_users))
Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  train_sample = train_sample.filter(pl.col("item_id").is_in(popular_items))


Common users between train_sample and test: 80816
Percentage of test users in train_sample: 43.49%


### Метрики оценивания моделей

In [3]:
class Validator(ABC):
    def __init__(self, train: pd.DataFrame, test: pd.DataFrame, cold_items: set = None):
        self.train = train
        self.test = test
        self.cold_items = cold_items or set()

    @abstractmethod
    def evaluate(self, predictions: Dict[int, List[int]]) -> Dict[str, float]:
        """
        predictions: dict user_id -> list of recommended item_ids
        """
        pass

    def recall_at_k(self, y_true: List[int], y_pred: List[int], k: int = 10) -> float:
        return len(set(y_true) & set(y_pred[:k])) / len(set(y_true)) if y_true else 0.0

    def precision_at_k(self, y_true: List[int], y_pred: List[int], k: int = 10) -> float:
        return len(set(y_true) & set(y_pred[:k])) / k if y_true else 0.0

    def hitrate_at_k(self, y_true: List[int], y_pred: List[int], k: int = 10) -> float:
        return 1.0 if len(set(y_true) & set(y_pred[:k])) > 0 else 0.0

    def ndcg_at_k(self, y_true: List[int], y_pred: List[int], k: int = 10) -> float:
        dcg = 0.0
        for i, item in enumerate(y_pred[:k]):
            if item in y_true:
                dcg += 1 / np.log2(i + 2)
        idcg = sum(1 / np.log2(i + 2) for i in range(min(len(y_true), k)))
        return dcg / idcg if idcg > 0 else 0.0

    def mrr_at_k(self, y_true: List[int], y_pred: List[int], k: int = 10) -> float:
        for i, item in enumerate(y_pred[:k]):
            if item in y_true:
                return 1 / (i + 1)
        return 0.0

    def coverage(self, predictions: Dict[int, List[int]]) -> float:
        all_pred_items = set(item for recs in predictions.values() for item in recs)
        all_train_items = set(self.train["item_id"].unique())
        return len(all_pred_items) / len(all_train_items)

    @staticmethod
    def print_metrics(metrics: Dict[str, float]):
        print("\n=== Evaluation Results ===")
        for key, value in metrics.items():
            print(f"{key:<15}: {value:.4f}")
        print("==========================\n")

Стоит разделить валидацию на две версии, для сравнения. Моя гипотеза заключается в том, что совместная валидация warm и cold может быть не совсем честной. Например, если в тесте 90% warm и 10% cold, то Recall@10 в среднем будет определяться warm-айтемами. Модель может полностью «забыть» про cold items, но в отчёте всё равно будут хорошие цифры. Это вводит в заблуждение: кажется, что модель универсальная, хотя на самом деле cold-start не решён.

#### Блок совместной валидации (baseline)

In [4]:
class JointValidator(Validator):
    def __init__(self, train: pl.DataFrame, test: pl.DataFrame, cold_items: set = None):
        super().__init__(train, test, cold_items)
        self.user2items = (
            test.group_by("user_id").agg(pl.col("item_id")).to_dict(as_series=False)
        )
        self.user2items = dict(zip(self.user2items["user_id"], self.user2items["item_id"]))

    def evaluate(self, predictions: Dict[int, List[int]]) -> Dict[str, float]:
        recalls, precisions, hits, ndcgs, mrrs = [], [], [], [], []
        for user_id, y_pred in predictions.items():
            y_true = self.user2items.get(user_id, [])
            recalls.append(self.recall_at_k(y_true, y_pred))
            precisions.append(self.precision_at_k(y_true, y_pred))
            hits.append(self.hitrate_at_k(y_true, y_pred))
            ndcgs.append(self.ndcg_at_k(y_true, y_pred))
            mrrs.append(self.mrr_at_k(y_true, y_pred))
        results = {
            "Recall@10": np.mean(recalls),
            "Precision@10": np.mean(precisions),
            "HitRate@10": np.mean(hits),
            "NDCG@10": np.mean(ndcgs),
            "MRR@10": np.mean(mrrs),
            "Coverage": self.coverage(predictions),
        }
        self.print_metrics(results)

#### Разделенная валидация (cold vs warm)

In [5]:
class SplitValidator(Validator):
    def __init__(self, train: pl.DataFrame, test: pl.DataFrame, cold_items: set = None):
        super().__init__(train, test, cold_items)
        self.user2items = (
            test.group_by("user_id").agg(pl.col("item_id")).to_dict(as_series=False)
        )
        self.user2items = dict(zip(self.user2items["user_id"], self.user2items["item_id"]))

    def evaluate(self, predictions: Dict[int, List[int]]) -> Dict[str, float]:
        results = {}
        for subset in ["cold", "warm"]:
            recalls, precisions, hits, ndcgs, mrrs = [], [], [], [], []
            for user_id, y_pred in predictions.items():
                y_true = self.user2items.get(user_id, [])
                if not y_true:
                    continue
                if subset == "cold":
                    y_true = [i for i in y_true if i in self.cold_items]
                elif subset == "warm":
                    y_true = [i for i in y_true if i not in self.cold_items]
                if not y_true:
                    continue
                recalls.append(self.recall_at_k(y_true, y_pred))
                precisions.append(self.precision_at_k(y_true, y_pred))
                hits.append(self.hitrate_at_k(y_true, y_pred))
                ndcgs.append(self.ndcg_at_k(y_true, y_pred))
                mrrs.append(self.mrr_at_k(y_true, y_pred))
            results[f"Recall@10_{subset}"] = np.mean(recalls) if recalls else 0.0
            results[f"Precision@10_{subset}"] = np.mean(precisions) if precisions else 0.0
            results[f"HitRate@10_{subset}"] = np.mean(hits) if hits else 0.0
            results[f"NDCG@10_{subset}"] = np.mean(ndcgs) if ndcgs else 0.0
            results[f"MRR@10_{subset}"] = np.mean(mrrs) if mrrs else 0.0
        results["Coverage"] = self.coverage(predictions)
        self.print_metrics(results)

### Функции алгоритмов KNN 

In [6]:
class BaseRecommender(ABC):
    @abstractmethod
    def fit(self, data: pl.DataFrame):
        pass

    @abstractmethod
    def predict(self, users: List[int], k: int = 10) -> Dict[int, List[int]]:
        pass

    def save_predictions(self, predictions: Dict[int, List[int]], filename: str):
        with open(filename, 'wb') as f:
            pickle.dump(predictions, f)
        print(f"Predictions saved to {filename} (pickle format)")

        with open(filename.replace('.pkl', '.json'), 'w') as f:
            json.dump(predictions, f, indent=4)
        print(f"Predictions also saved to {filename.replace('.pkl', '.json')} (JSON format)")

    def load_predictions(self, filename: str) -> Dict[int, List[int]]:
        with open(filename, 'rb') as f:
            return pickle.load(f)

In [7]:
class UserKNN(BaseRecommender):
    def __init__(self, n_neighbors=20, n_trees=10):
        self.n_neighbors = n_neighbors
        self.n_trees = n_trees
        self.user_index = None
        self.user_item_matrix = None
        self.user_id_map = None
        self.item_id_map = None

    def fit(self, data: pl.DataFrame):
        unique_users = data['user_id'].unique().to_list()
        unique_items = data['item_id'].unique().to_list()
        self.user_id_map = {uid: idx for idx, uid in enumerate(unique_users)}
        self.item_id_map = {iid: idx for idx, iid in enumerate(unique_items)}
        print(f"Unique users: {len(unique_users)}, Unique items: {len(unique_items)}")

        rows = data['user_id'].map_elements(lambda u: self.user_id_map.get(u, -1), return_dtype=pl.Int32).to_numpy()
        cols = data['item_id'].map_elements(lambda i: self.item_id_map.get(i, -1), return_dtype=pl.Int32).to_numpy()
        valid_mask = (rows != -1) & (cols != -1)
        rows, cols = rows[valid_mask], cols[valid_mask]
        vals = np.ones(len(rows))
        self.user_item_matrix = csr_matrix((vals, (rows, cols)), shape=(len(unique_users), len(unique_items)))
        print(f"User-item matrix shape: {self.user_item_matrix.shape}")
        print(f"User-item matrix memory usage: {self.user_item_matrix.data.nbytes / 1024**2:.2f} MB")

        n_features = len(unique_items)
        self.user_index = AnnoyIndex(n_features, 'angular')
        for user_idx in tqdm(range(len(unique_users)), desc="Building Annoy index for users"):
            user_vector = self.user_item_matrix[user_idx].toarray().flatten()
            self.user_index.add_item(user_idx, user_vector)
        self.user_index.build(self.n_trees)
        print("Annoy index for users built.")
        gc.collect()

    def predict(self, users: List[int], k: int = 10) -> Dict[int, List[int]]:
        global popular_items
        predictions = {}
        reverse_item_map = {v: k for k, v in self.item_id_map.items()}
        for user_id in tqdm(users, desc="Predicting with UserKNN (Annoy)"):
            if user_id not in self.user_id_map:
                predictions[user_id] = popular_items[:k]
                continue
            user_idx = self.user_id_map[user_id]
            similar_users, distances = self.user_index.get_nns_by_item(user_idx, self.n_neighbors + 1, include_distances=True)
            similar_users = similar_users[1:]
            distances = distances[1:]
            item_scores = defaultdict(float)
            for sim_idx, dist in zip(similar_users, distances):
                sim_score = 1 - dist / 2
                user_items = self.user_item_matrix[sim_idx].indices
                for item_idx in user_items:
                    item_scores[item_idx] += sim_score
            user_known_items = set(self.user_item_matrix[user_idx].indices)
            top_items = sorted(item_scores.items(), key=lambda x: -x[1])[:k]
            # Для отладки: закомментировать фильтрацию
            # top_items = [item_idx for item_idx, _ in top_items if item_idx not in user_known_items]
            predictions[user_id] = [reverse_item_map[item_idx] for item_idx, _ in top_items] if top_items else popular_items[:k]
            if len(predictions[user_id]) == 0 and user_id == users[0]:
                print(f"Debug for user {user_id}: item_scores len: {len(item_scores)}, known_items len: {len(user_known_items)}, top_items before filter: {len(sorted(item_scores.items(), key=lambda x: -x[1])[:k])}")
        gc.collect()
        return predictions

In [8]:
class ItemKNN(BaseRecommender):
    def __init__(self, n_neighbors=20, n_trees=10):
        self.n_neighbors = n_neighbors
        self.n_trees = n_trees
        self.item_index = None
        self.user_item_matrix = None
        self.user_id_map = None
        self.item_id_map = None

    def fit(self, data: pl.DataFrame):
        unique_users = data['user_id'].unique().to_list()
        unique_items = data['item_id'].unique().to_list()
        self.user_id_map = {uid: idx for idx, uid in enumerate(unique_users)}
        self.item_id_map = {iid: idx for idx, iid in enumerate(unique_items)}
        print(f"Unique users: {len(unique_users)}, Unique items: {len(unique_items)}")

        rows = data['user_id'].map_elements(lambda u: self.user_id_map.get(u, -1), return_dtype=pl.Int32).to_numpy()
        cols = data['item_id'].map_elements(lambda i: self.item_id_map.get(i, -1), return_dtype=pl.Int32).to_numpy()
        valid_mask = (rows != -1) & (cols != -1)
        rows, cols = rows[valid_mask], cols[valid_mask]
        vals = np.ones(len(rows))
        self.user_item_matrix = csr_matrix((vals, (rows, cols)), shape=(len(unique_users), len(unique_items)))
        print(f"User-item matrix shape: {self.user_item_matrix.shape}")
        print(f"User-item matrix memory usage: {self.user_item_matrix.data.nbytes / 1024**2:.2f} MB")

        item_user_matrix = self.user_item_matrix.T
        n_features = item_user_matrix.shape[1]
        self.item_index = AnnoyIndex(n_features, 'angular')
        for item_idx in tqdm(range(len(unique_items)), desc="Building Annoy index for items"):
            item_vector = item_user_matrix[item_idx].toarray().flatten()
            self.item_index.add_item(item_idx, item_vector)
        self.item_index.build(self.n_trees)
        print("Annoy index for items built.")
        gc.collect()

    def predict(self, users: List[int], k: int = 10) -> Dict[int, List[int]]:
        global popular_items
        predictions = {}
        reverse_item_map = {v: k for k, v in self.item_id_map.items()}
        for user_id in tqdm(users, desc="Predicting with ItemKNN (Annoy)"):
            if user_id not in self.user_id_map:
                predictions[user_id] = popular_items[:k]
                continue
            user_idx = self.user_id_map[user_id]
            user_items = self.user_item_matrix[user_idx].indices
            item_scores = defaultdict(float)
            for item_idx in user_items:
                similar_items, distances = self.item_index.get_nns_by_item(item_idx, self.n_neighbors + 1, include_distances=True)
                similar_items = similar_items[1:]
                distances = distances[1:]
                for sim_idx, dist in zip(similar_items, distances):
                    sim_score = 1 - dist / 2
                    item_scores[sim_idx] += sim_score
            user_known_items = set(user_items)
            top_items = sorted(item_scores.items(), key=lambda x: -x[1])[:k]
            # Для отладки: закомментировать фильтрацию
            # top_items = [item_idx for item_idx, _ in top_items if item_idx not in user_known_items]
            predictions[user_id] = [reverse_item_map[item_idx] for item_idx, _ in top_items] if top_items else popular_items[:k]
            if len(predictions[user_id]) == 0 and user_id == users[0]:
                print(f"Debug for user {user_id}: item_scores len: {len(item_scores)}, known_items len: {len(user_known_items)}, top_items before filter: {len(sorted(item_scores.items(), key=lambda x: -x[1])[:k])}")
        gc.collect()
        return predictions

#### Вспомогательные функции для удобного представления результатов

Необходимы для составления табличек

In [9]:
def _shorten_list(lst, max_len=10):
    """Обрезает длинные списки для красивого вывода"""
    if lst is None:
        return []
    return lst[:max_len] if len(lst) > max_len else lst

def show_predictions(models: dict, data: pl.DataFrame, n=5, verbose=True, is_val=False):
    df = data.sample(n).select(["user_id", "item_id"])
    if is_val:
        df = df.rename({"item_id": "true_items"})

    # добавляем предсказания
    for name, preds in models.items():
        df = df.with_columns(
            pl.col("user_id").map_elements(
                lambda u: _shorten_list(preds.get(u, [])), 
                return_dtype=pl.List(pl.Int64)
            ).alias(name)
        )

    if verbose:
        print(df.shape)
        print(df)

    return df


def val_predictions(models: dict, val: pl.DataFrame, validator: Validator, k: int = 10, verbose: bool = True):
    results = []
    user2items = (
        val.group_by("user_id").agg(pl.col("item_id")).to_dict(as_series=False)
    )
    user2items = dict(zip(user2items["user_id"], user2items["item_id"]))

    for model_name, preds in models.items():
        recalls, precisions, hits, ndcgs, mrrs = [], [], [], [], []
        for u, y_true in user2items.items():
            y_pred = preds.get(u, [])
            recalls.append(validator.recall_at_k(y_true, y_pred, k))
            precisions.append(validator.precision_at_k(y_true, y_pred, k))
            hits.append(validator.hitrate_at_k(y_true, y_pred, k))
            ndcgs.append(validator.ndcg_at_k(y_true, y_pred, k))
            mrrs.append(validator.mrr_at_k(y_true, y_pred, k))
        metrics = {
            "model": model_name,
            "Recall@10": np.mean(recalls),
            "Precision@10": np.mean(precisions),
            "HitRate@10": np.mean(hits),
            "NDCG@10": np.mean(ndcgs),
            "MRR@10": np.mean(mrrs),
            "Coverage": validator.coverage(preds),
        }
        results.append(metrics)

    df = pl.DataFrame(results)
    if verbose:
        print(df)
    return df


### Pipeline

In [10]:
print(f"Train users: {train['user_id'].n_unique()}, items: {len(train_items)}")
print(f"Test users: {test['user_id'].n_unique()}, items: {len(test_items)}")
print(f"Cold items: {len(cold_items)}")

# Вычисляем популярные айтемы
popular_items = train_sample.group_by("item_id").agg(pl.col("user_id").count().alias("count")).sort("count", descending=True).head(10)["item_id"].to_list()
print(f"Popular items: {popular_items}")

# Ограничиваем user_ids общими пользователями
user_ids = list(common_users)
print(f"Using {len(user_ids)} common users for prediction")

# Диагностика sample пользователя
if user_ids:
    sample_user = user_ids[0]
else:
    sample_user = test["user_id"][0]
print(f"Sample user: {sample_user}")
print(f"In train_sample: {sample_user in train_sample['user_id']}")
print(f"Train interactions: {train_sample.filter(pl.col('user_id') == sample_user)['item_id'].to_list()}")
print(f"Test interactions: {test.filter(pl.col('user_id') == sample_user)['item_id'].to_list()}")

# Инициализируем и обучаем разные модели
user_knn = UserKNN(n_neighbors=5, n_trees=5)
user_knn.fit(train_sample)
pred_user_knn = user_knn.predict(user_ids, k=10)
user_knn.save_predictions(pred_user_knn, "../predictions/user_knn_predictions_sample.pkl")

item_knn = ItemKNN(n_neighbors=5, n_trees=5)
item_knn.fit(train_sample)
pred_item_knn = item_knn.predict(user_ids, k=10)
item_knn.save_predictions(pred_item_knn, "../predictions/item_knn_predictions_sample.pkl")

# Собираем предсказания в словарь
models = {
    "ItemKNN": pred_item_knn,
    "UserKNN": pred_user_knn,
}

print("\nРекомендации моделей:")
train_df = show_predictions(models, train, n=5, verbose=True)

Train users: 349719, items: 31300
Test users: 185828, items: 27367
Cold items: 1775
Popular items: [30197, 4058, 15514, 18150, 118, 13064, 33370, 15009, 960, 27745]
Using 80816 common users for prediction
Sample user: 6548c26be062f57a4105de9a5bfa358b
In train_sample: True
Train interactions: [3513, 13899, 28589, 31390, 28109, 18355, 12854, 24417]
Test interactions: [1948, 6061, 6201, 10024, 14425, 16848, 21421, 26498, 29110, 31820]
Unique users: 107586, Unique items: 16386
User-item matrix shape: (107586, 16386)
User-item matrix memory usage: 7.93 MB


Building Annoy index for users: 100%|██████████| 107586/107586 [01:06<00:00, 1624.34it/s]


Annoy index for users built.


Predicting with UserKNN (Annoy): 100%|██████████| 80816/80816 [00:03<00:00, 21945.30it/s]


Predictions saved to ../predictions/user_knn_predictions_sample.pkl (pickle format)
Predictions also saved to ../predictions/user_knn_predictions_sample.json (JSON format)
Unique users: 107586, Unique items: 16386
User-item matrix shape: (107586, 16386)
User-item matrix memory usage: 7.93 MB


Building Annoy index for items: 100%|██████████| 16386/16386 [01:33<00:00, 174.87it/s]


Annoy index for items built.


Predicting with ItemKNN (Annoy): 100%|██████████| 80816/80816 [01:54<00:00, 705.31it/s]


Predictions saved to ../predictions/item_knn_predictions_sample.pkl (pickle format)
Predictions also saved to ../predictions/item_knn_predictions_sample.json (JSON format)

Рекомендации моделей:
(5, 4)
shape: (5, 4)
┌─────────────────────────────────┬─────────┬────────────────────────┬────────────────────────┐
│ user_id                         ┆ item_id ┆ ItemKNN                ┆ UserKNN                │
│ ---                             ┆ ---     ┆ ---                    ┆ ---                    │
│ str                             ┆ i64     ┆ list[i64]              ┆ list[i64]              │
╞═════════════════════════════════╪═════════╪════════════════════════╪════════════════════════╡
│ 9e807829c237765ead132c67a7cbac… ┆ 19605   ┆ [30197, 4058, … 27745] ┆ [30197, 4058, … 27745] │
│ 7bf2aad1b2d5690aace6334ff99409… ┆ 4058    ┆ []                     ┆ []                     │
│ d9ffd88941bab3f6f89dcf41e5f2f7… ┆ 28438   ┆ []                     ┆ []                     │
│ f0e911936de211

In [11]:
print("\nСтатистика по метрикам для каждой модели:")
validator = JointValidator(train, test, cold_items)
metrics_df = val_predictions(models, test, validator, k=10, verbose=True)


Статистика по метрикам для каждой модели:
shape: (2, 7)
┌─────────┬───────────┬──────────────┬────────────┬──────────┬──────────┬──────────┐
│ model   ┆ Recall@10 ┆ Precision@10 ┆ HitRate@10 ┆ NDCG@10  ┆ MRR@10   ┆ Coverage │
│ ---     ┆ ---       ┆ ---          ┆ ---        ┆ ---      ┆ ---      ┆ ---      │
│ str     ┆ f64       ┆ f64          ┆ f64        ┆ f64      ┆ f64      ┆ f64      │
╞═════════╪═══════════╪══════════════╪════════════╪══════════╪══════════╪══════════╡
│ ItemKNN ┆ 0.007577  ┆ 0.011601     ┆ 0.08143    ┆ 0.013184 ┆ 0.030187 ┆ 0.000319 │
│ UserKNN ┆ 0.007577  ┆ 0.011601     ┆ 0.08143    ┆ 0.013184 ┆ 0.030187 ┆ 0.000319 │
└─────────┴───────────┴──────────────┴────────────┴──────────┴──────────┴──────────┘


In [12]:
train_users = set(train_sample["user_id"].unique())
test_users = set(test["user_id"].unique())
common_users = train_users.intersection(test_users)
print(f"Common users between train_sample and test: {len(common_users)}")
print(f"Percentage of test users in train_sample: {len(common_users) / len(test_users) * 100:.2f}%")

Common users between train_sample and test: 80816
Percentage of test users in train_sample: 43.49%
