In [1]:
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from collections import Counter

from scipy import sparse

## Подготовка данных

In [4]:
df_raw = pd.read_csv("path_to_dataset", index_col=0)
users = df_raw["user_id"].unique()
users = np.random.choice(users, len(users) // 10)
df_raw = df_raw[df_raw["user_id"].isin(users)]

In [5]:
threshold_subj2subj = 25
threshold_als = 30
user_counts = df_raw["user_id"].value_counts()
user_mask_bayes = df_raw['user_id'].isin(user_counts[threshold_subj2subj > user_counts].index)
user_mask_subj2subj = df_raw['user_id'].isin(user_counts[(threshold_als > user_counts) & (user_counts >= threshold_subj2subj)].index)
user_mask_als = df_raw['user_id'].isin(user_counts[user_counts >= threshold_als].index)

In [6]:
df_bayes = df_raw[user_mask_bayes]
df_subj2subj = df_raw[user_mask_subj2subj]
df_als = df_raw[user_mask_als]

In [7]:
unique_customers_als = df_als["user_id"].unique()
cust_ids_als = dict(zip(unique_customers_als, np.arange(unique_customers_als.shape[0], dtype=np.int32)))
reverse_cust_ids_als = dict(zip(np.arange(unique_customers_als.shape[0], dtype=np.int32), unique_customers_als))

unique_items_als = df_als["nm_id"].unique()
item_ids_als = dict(zip(unique_items_als, np.arange(unique_items_als.shape[0], dtype=np.int32)))
reverse_item_ids_als = dict(zip(np.arange(unique_items_als.shape[0], dtype=np.int32), unique_items_als))

unique_customers_subj2subj = df_subj2subj["user_id"].unique()
cust_ids_subj2subj = dict(zip(unique_customers_subj2subj, np.arange(unique_customers_subj2subj.shape[0], dtype=np.int32)))
reverse_cust_ids_subj2subj = dict(zip(np.arange(unique_customers_subj2subj.shape[0], dtype=np.int32), unique_customers_subj2subj))

unique_subjects_subj2subj = df_subj2subj["subject_id"].unique()
subject_ids_subj2subj = dict(zip(unique_subjects_subj2subj, np.arange(unique_subjects_subj2subj.shape[0], dtype=np.int32)))

In [8]:
def enumerate_cust_item_als(df_filter_cust_item):
    '''нумерует пользователей, товары и категории по порядку'''
    df_filter_cust_item["cust_id"] = df_filter_cust_item["user_id"].apply(
        lambda i: cust_ids_als[i]
    )
    df_filter_cust_item["item_id"] = df_filter_cust_item["nm_id"].apply(
        lambda i: item_ids_als[i]
    )
    return df_filter_cust_item

In [9]:
df_als = enumerate_cust_item_als(df_als)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filter_cust_item["cust_id"] = df_filter_cust_item["user_id"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filter_cust_item["item_id"] = df_filter_cust_item["nm_id"].apply(


In [10]:
def enumerate_cust_subj_subj2subj(df_filter_cust_item):
    '''нумерует пользователей, товары и категории по порядку'''
    df_filter_cust_item["cust_id"] = df_filter_cust_item["user_id"].apply(
        lambda i: cust_ids_subj2subj[i]
    )
    df_filter_cust_item["subj_id"] = df_filter_cust_item["subject_id"].apply(
        lambda i: subject_ids_subj2subj[i]
    )
    return df_filter_cust_item

In [11]:
df_subj2subj = enumerate_cust_subj_subj2subj(df_subj2subj)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filter_cust_item["cust_id"] = df_filter_cust_item["user_id"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filter_cust_item["subj_id"] = df_filter_cust_item["subject_id"].apply(


In [12]:
def get_user2item_sparse_matrix(df_filter_cust_item, shape):
    '''возвращает разреженную матрицу user2item по уже отфильтрованному датафрейму'''
    df_cust_item_qty = (
        df_filter_cust_item.groupby(["cust_id", "item_id"])
        .agg("count")
        .reset_index()
    )
    sparse_customer_item = sparse.csr_matrix((df_cust_item_qty["nm_id"].astype(float), (df_cust_item_qty["cust_id"], df_cust_item_qty["item_id"])), shape=shape)
    return sparse_customer_item

In [13]:
def get_user2subject_sparse_matrix(df_filter_cust_item, shape):
    '''возвращает разреженную матрицу user2subject по уже отфильтрованному датафрейму'''
    df_cust_item_qty = (
        df_filter_cust_item.groupby(["cust_id", "subj_id"])
        .agg("count")
        .reset_index()
    )
    sparse_customer_item = sparse.csr_matrix((df_cust_item_qty["subject_id"].astype(float), (df_cust_item_qty["cust_id"], df_cust_item_qty["subj_id"])), shape=shape)
    return sparse_customer_item

In [14]:
shape = (len(unique_customers_als), len(unique_items_als))
sparse_customer_item = get_user2item_sparse_matrix(df_als, shape)

In [16]:
shape = (len(unique_customers_subj2subj), len(unique_subjects_subj2subj))
sparse_customer_subj = get_user2subject_sparse_matrix(df_subj2subj, shape)

In [17]:
k = 200

## Bias prediction

In [18]:
submission1_user = np.array(df_bayes["user_id"].unique())

In [19]:
class Bayes:
    def __init__(self):
        pass
    def fit(self, X_train):
        self.top_items = X_train["nm_id"].value_counts().keys()
    def predict(self, n, k):
        return np.array([self.top_items[:k] for i in range(n)])

In [20]:
bayes_model = Bayes()
bayes_model.fit(df_bayes)
bayes_preds = bayes_model.predict(len(submission1_user), k)

In [21]:
submission1 = pd.DataFrame({'user_id': submission1_user, 'recommendation' : [x for x in bayes_preds]})

## AlternatingLeastSquares

In [22]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m67.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2


In [26]:
from implicit.als import AlternatingLeastSquares

class ALS:
    def __init__(self, factors, regularization, alpha, iterations):
        self.model = AlternatingLeastSquares(factors=factors, regularization=regularization,
                                                alpha=alpha, iterations=iterations, num_threads=4,
                                                calculate_training_loss=True, random_state=42)
    def fit(self, sparse_customer_item_train):
        self.model.fit(sparse_customer_item_train)
        
    def predict(self, sparse_customer_item_test, test_cust, k, filter_already_liked_items):
        preds, score = self.model.recommend(test_cust, sparse_customer_item_test[test_cust], N=k, filter_already_liked_items=filter_already_liked_items)
        preds = np.array([np.array([reverse_item_ids_als[item] for item in user]) for user in preds])
        return preds, score

In [27]:
als_model = ALS(factors=64, regularization=0.05, alpha=2, iterations=5)
als_model.fit(sparse_customer_item)
als_preds, als_score = als_model.predict(sparse_customer_item, np.arange(unique_customers_als.shape[0], dtype=np.int32), k, filter_already_liked_items=False)

  0%|          | 0/5 [00:00<?, ?it/s]

In [28]:
submission2_user = unique_customers_als
submission2 = pd.DataFrame({'user_id': submission2_user, 'recommendation' : [x for x in als_preds]})

## Subject2Subject

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

class Subject2Subject:
    def __init__(self):
        pass
    def fit(self, train, sparse_customer_subject_train):
        subject_similarity = cosine_similarity(sparse_customer_subject_train.T)
        self.sparse_customer_subject_train = sparse_customer_subject_train
        self.sparse_subject_similarity = sparse.csr_matrix(subject_similarity)
        self.subject_cnt = defaultdict(lambda: Counter())

        for idx, row in train.iterrows():
            self.subject_cnt[row["subj_id"]][row["nm_id"]] += 1
            
    def predict(self, test_cust, k, num_subjects, items_in_subject):
        ratings = self.sparse_customer_subject_train[test_cust] @ self.sparse_subject_similarity
        recommended_subjects = np.flip(np.argsort(ratings.toarray()), axis=1)[:,:num_subjects]
        preds = []

        for user_subj_recs in recommended_subjects:
            user_item_recs = np.array([], dtype=np.int64)
            for subj in user_subj_recs:
                if len(user_item_recs) >= k:
                    break
                item_recs = [i[0] for i in self.subject_cnt[subj].most_common(items_in_subject)]
                user_item_recs = np.append(user_item_recs, item_recs)
            user_item_recs = user_item_recs[:k]
            preds.append(user_item_recs)
        return preds

In [30]:
sub2sub_model = Subject2Subject()
sub2sub_model.fit(df_subj2subj, sparse_customer_subj)
sub2sub_preds = sub2sub_model.predict(np.arange(unique_customers_subj2subj.shape[0], dtype=np.int32), k, num_subjects=100, items_in_subject=10)

In [31]:
submission3_user = unique_customers_subj2subj
submission3 = pd.DataFrame({'user_id': submission3_user, 'recommendation' : [x for x in sub2sub_preds]})

In [32]:
submission = pd.concat([submission1, submission2, submission3], axis=0).reset_index(drop=True)

## Popularity_Ranker

In [34]:
class Popularity_Ranker:
    def __init__(self, k=10):
        """
        :param k: Количество товаров, которое необходимо возвращать для каждого пользователя.
        """
        self.popularity_dict = None
        self.k = k

    def fit(self, train_data):
        """
        Подготавливает словарь популярности товаров на основе обучающих данных.
        :param train_data: DataFrame с данными обучения, содержащий столбец 'item_id' с ID товаров.
        """
        self.popularity_dict = train_data.groupby('nm_id').size().to_dict()

    def rank(self, recommendations):
        """
        Ранжирует список рекомендованных товаров на основе их популярности.

        :param recommendations: Список ID товаров для ранжирования.
        :return: Отсортированный список топ-k самых популярных товаров.
        """
        sorted_recommendations = sorted(recommendations, key=lambda x: self.popularity_dict.get(x, 0), reverse=True)
        return sorted_recommendations[:self.k]

    def rank_users(self, user_recommendations):
        """
        Применяет ранжирование к спискам рекомендаций для нескольких пользователей.

        :param user_recommendations: Словарь, где ключи - ID пользователей, а значения - списки рекомендаций.
        :return: Словарь с отранжированными списками топ-k товаров для каждого пользователя.
        """
        predictions = {}
        for user, recommendations in user_recommendations.items():
            predictions[user] = self.rank(recommendations)
        return [predictions[cust] for cust in user_recommendations.keys()]

In [35]:
ranker = Popularity_Ranker(k)
ranker.fit(df_raw)
user2rec = submission.set_index("user_id").to_dict()["recommendation"]
ranked_recommendations = ranker.rank_users(user2rec)

In [37]:
submission["recommendation"] = ranked_recommendations

In [None]:
submission.to_csv("submission.csv")