## Курсовой проект по теме "Рекомендательные системы"

In [1]:
# Установка библиотек
!pip install lightfm
!pip install implicit

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=879167 sha256=23606b73068885f579d114c07151ab536f4f841273755f436e6e79ad763b3070
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17
Collecting implicit
  Downloading implicit-0.7.0-cp310-cp310-manylinux2014_x86_64.whl (9.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
sys.path.append('/content/drive/MyDrive/Rec_systems')
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender



In [4]:
data = pd.read_csv('/content/drive/MyDrive/Rec_systems/retail_train.csv')
item_features = pd.read_csv('/content/drive/MyDrive/Rec_systems/product.csv')
user_features = pd.read_csv('/content/drive/MyDrive/Rec_systems/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель --
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [5]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [6]:
param_grid = {
              'n_factors': [10, 20, 30],
              'regularization': [0.01, 0.05, 0.001],
              'iterations':[5, 10, 15],
              'n_factors': [10, 20],
              'num_threads': [0]
              }

params_df = pd.DataFrame([], columns=['n_factors', 'regularization', 'iterations', 'num_threads', 'recall_value'])

keys, values = zip(*param_grid.items())
for v in itertools.product(*values):
    params = dict(zip(keys, v))
    recommender = MainRecommender(data_train_lvl_1, model_attrs=params)
    params['recall_value'] = round(recommender.get_recall_at_k(data_val_lvl_1, k=5), 4)
    print(params)
    params_df = params_df.append(params, ignore_index=True)

params_df = params_df.astype({'n_factors':'int', 'regularization':'float', 'iterations':'int', 'num_threads':'int', 'recall_value':'float'})

  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 10, 'regularization': 0.01, 'iterations': 5, 'num_threads': 0, 'recall_value': 0.0199}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 10, 'regularization': 0.01, 'iterations': 10, 'num_threads': 0, 'recall_value': 0.0207}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 10, 'regularization': 0.01, 'iterations': 15, 'num_threads': 0, 'recall_value': 0.0193}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 10, 'regularization': 0.05, 'iterations': 5, 'num_threads': 0, 'recall_value': 0.0194}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 10, 'regularization': 0.05, 'iterations': 10, 'num_threads': 0, 'recall_value': 0.0199}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 10, 'regularization': 0.05, 'iterations': 15, 'num_threads': 0, 'recall_value': 0.0206}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 10, 'regularization': 0.001, 'iterations': 5, 'num_threads': 0, 'recall_value': 0.0191}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 10, 'regularization': 0.001, 'iterations': 10, 'num_threads': 0, 'recall_value': 0.0196}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 10, 'regularization': 0.001, 'iterations': 15, 'num_threads': 0, 'recall_value': 0.0205}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 20, 'regularization': 0.01, 'iterations': 5, 'num_threads': 0, 'recall_value': 0.0194}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 20, 'regularization': 0.01, 'iterations': 10, 'num_threads': 0, 'recall_value': 0.0197}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 20, 'regularization': 0.01, 'iterations': 15, 'num_threads': 0, 'recall_value': 0.02}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 20, 'regularization': 0.05, 'iterations': 5, 'num_threads': 0, 'recall_value': 0.0195}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 20, 'regularization': 0.05, 'iterations': 10, 'num_threads': 0, 'recall_value': 0.0198}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 20, 'regularization': 0.05, 'iterations': 15, 'num_threads': 0, 'recall_value': 0.02}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 20, 'regularization': 0.001, 'iterations': 5, 'num_threads': 0, 'recall_value': 0.0193}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 20, 'regularization': 0.001, 'iterations': 10, 'num_threads': 0, 'recall_value': 0.0198}


  params_df = params_df.append(params, ignore_index=True)


  0%|          | 0/5001 [00:00<?, ?it/s]

{'n_factors': 20, 'regularization': 0.001, 'iterations': 15, 'num_threads': 0, 'recall_value': 0.0196}


  params_df = params_df.append(params, ignore_index=True)


In [24]:
lvl_1_recommender = MainRecommender(data_train_lvl_1,
                              model_attrs=params_df.sort_values(by='recall_value', ascending=False).drop(columns=['recall_value']).to_dict(orient='records')[0])

  0%|          | 0/5001 [00:00<?, ?it/s]

In [7]:
# Модель второго уровня
class LGBMRecommender():
  def __init__(self, data, candidates, user_features, item_features):

    self.data = data
    self.candidates = candidates
    self.user_features = user_features
    self.item_features = item_features

    self.X_train, self.y_train = self._prepare_dataset(self.data, self.candidates)

    self.cat_feats = self.X_train.columns[2:].tolist()
    self.X_train[self.cat_feats] = self.X_train[self.cat_feats].astype('category')
    self.model = self.fit(self.X_train, self.y_train, self.cat_feats)

  def _prepare_dataset(self, data, candidates):
    s = candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
    s.name = 'item_id'
    candidates = candidates.drop('candidates', axis=1).join(s)

    targets = data.groupby(['user_id','item_id']).first().copy() # убираем дубликаты сочетаний user-item
    targets['target'] = 1  # тут только покупки

    targets = candidates.merge(targets, on=['user_id', 'item_id'], how='left')

    targets['target'].fillna(0, inplace= True)
    targets = targets.merge(self.item_features, on='item_id', how='left')
    targets = targets.merge(self.user_features, on='user_id', how='left')

    #Добавляем новые фичи
    targets = targets.merge(targets.groupby(['user_id', 'commodity_desc'])['sales_value'].mean().rename('mean_commodity_value').reset_index(), on=['user_id','commodity_desc'], how='left')
    targets = targets.merge(targets.groupby(['user_id', 'manufacturer'])['quantity'].mean().rename('mean_manufacturer_value').reset_index(), on=['user_id','manufacturer'], how='left')
    targets = targets.merge(targets.groupby(['week_no', 'item_id'])['sales_value'].mean().rename('mean_week_value').reset_index(), on=['week_no','item_id'], how='left')
    targets = targets.merge(targets.groupby(['store_id', 'item_id'])['sales_value'].mean().rename('mean_store_value').reset_index(), on=['store_id', 'item_id'], how='left')
    targets = targets.merge(targets.groupby(['user_id', 'basket_id'])['sales_value'].mean().rename('mean_basket_value').reset_index(), on=['user_id','basket_id'], how='left')
    targets = targets.merge(targets.groupby('user_id')['sales_value'].mean().rename('mean_check').reset_index(), how='left', on='user_id')

    X_train = targets.drop('target', axis=1)
    y_train = targets[['target']]

    return X_train, y_train

  @staticmethod
  def fit(X_train, y_train, cat_feats):
    model = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
    model.fit(X_train, y_train)
    return model

  def _get_recommendations(self, data, user_id, N=5):
    one_user = data[(data['user_id'] == user_id)]
    one_user['preds'] = self.model.predict_proba(data[(data['user_id'] == user_id)])[:,1]
    one_user = one_user[['item_id', 'preds']]
    recs = one_user.sort_values(by='preds', ascending=False)[:N]['item_id'].tolist()
    return recs

  def get_recall_at_k(self, val_data, candidates, k=5):
    val_data = self._prepare_dataset(val_data, candidates)[0]
    result = val_data.groupby('user_id')['item_id'].unique().reset_index()
    result.columns=['user_id', 'actual']
    result = result[result['user_id'].isin(self.data['user_id'])]
    result['pred'] = result.apply(lambda x: self._get_recommendations(self.X_train, x['user_id'], N=k), axis=1)
    recall_value = result.apply(lambda x: recall_at_k(x['pred'], x['actual'], k=k), axis=1).mean()

    return recall_value

  def get_precision_at_k(self, val_data, candidates, k=5):
    val_data = self._prepare_dataset(val_data, candidates)[0]
    result = val_data.groupby('user_id')['item_id'].unique().reset_index()
    result.columns=['user_id', 'actual']
    result = result[result['user_id'].isin(self.data['user_id'])]
    result['pred'] = result.apply(lambda x: self._get_recommendations(self.X_train, x['user_id'], N=k), axis=1)
    recall_value = result.apply(lambda x: precision_at_k(x['pred'], x['actual'], k=k), axis=1).mean()

    return recall_value

In [8]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Пока только warm start
users_lvl_1 = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(users_lvl_1)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))

In [31]:
val_users_lvl_2 = pd.DataFrame(data_val_lvl_2['user_id'].unique())
val_users_lvl_2.columns = ['user_id']

val_users_lvl_2 = val_users_lvl_2[val_users_lvl_2['user_id'].isin(users_lvl_1)]

val_users_lvl_2['candidates'] = val_users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))

In [28]:
lvl_2_recommender = LGBMRecommender(data_train_lvl_2, users_lvl_2, user_features, item_features)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [11]:
lvl_2_recommender.get_recall_at_k(data_val_lvl_2, val_users_lvl_2, k=5)

0.10000000000000002

In [29]:
lvl_1_recommender.get_recall_at_k(data_val_lvl_2, k=5)

0.018999055950204857

In [32]:
lvl_2_recommender.get_precision_at_k(data_val_lvl_2, val_users_lvl_2, k=5)

1.0

In [33]:
lvl_1_recommender.get_precision_at_k(data_val_lvl_2, k=5)

0.14078431372549022