# First version of recommendation model

### Links

Tutorial: [link](https://www.stepbystepdatascience.com/hybrid-recommender-lightfm-python)

Dataset: [instacart-market-basket-analysis](https://www.kaggle.com/datasets/psparks/instacart-market-basket-analysis)

### Prerequisites

Download dataset and place files in folder: ../data/instacart-market-basket-analysis

In [1]:
%pip install --upgrade pip setuptools wheel
%pip install numpy pandas scikit-learn scipy unidecode optuna plotly nbformat
%pip install --no-use-pep517 lightfm # https://github.com/lyst/lightfm/issues/687#issuecomment-1523956355

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import scipy
from os import path
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from unidecode import unidecode # to deal with accents
import pickle
import optuna

import sklearn

sklearn.show_versions()




System:
    python: 3.11.6 (main, Nov  2 2023, 04:39:40) [Clang 14.0.0 (clang-1400.0.29.202)]
executable: /Users/alv.popov/prj/grifon/recommendation/learning/venv/bin/python3.11
   machine: macOS-14.1.2-arm64-arm-64bit

Python dependencies:
      sklearn: 1.4.1.post1
          pip: 24.0
   setuptools: 69.2.0
        numpy: 1.26.4
        scipy: 1.12.0
       Cython: None
       pandas: 2.2.1
   matplotlib: None
       joblib: 1.3.2
threadpoolctl: 3.4.0

Built with OpenMP: True

threadpoolctl info:
       user_api: blas
   internal_api: openblas
    num_threads: 10
         prefix: libopenblas
       filepath: /Users/alv.popov/prj/grifon/recommendation/learning/venv/lib/python3.11/site-packages/numpy/.dylibs/libopenblas64_.0.dylib
        version: 0.3.23.dev
threading_layer: pthreads
   architecture: armv8

       user_api: blas
   internal_api: openblas
    num_threads: 10
         prefix: libopenblas
       filepath: /Users/alv.popov/prj/grifon/recommendation/learning/venv/lib/python

In [3]:
data_path = '../data'
dataset_path = path.join(data_path, 'instacart-market-basket-analysis')

# Маппинг user_id <-> order_id
raw_orders = pd.read_csv(path.join(dataset_path, 'orders.csv'))

# Продукты
raw_products = pd.read_csv(path.join(dataset_path, 'products.csv'))

# Маппинг order_id -> product_id
raw_order_products = pd.concat([pd.read_csv(path.join(dataset_path, 'order_products__prior.csv')),
                            pd.read_csv(path.join(dataset_path, 'order_products__train.csv'))])

# Фичи продуктов
aisles = pd.read_csv(path.join(dataset_path, 'aisles.csv'))
departments = pd.read_csv(path.join(dataset_path, 'departments.csv'))

# Чеки
raw_orders[(raw_orders["user_id"]==1)].sort_values(["order_number"])

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


In [4]:
# Количество юзеров
raw_orders['user_id'].unique().size

206209

In [5]:
# Айтемы
raw_products

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5
49684,49685,En Croute Roast Hazelnut Cranberry,42,1
49685,49686,Artisan Baguette,112,3
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8


In [6]:
# Взаимодействия
raw_order_products

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
...,...,...,...,...
1384612,3421063,14233,3,1
1384613,3421063,35548,4,1
1384614,3421070,35951,1,1
1384615,3421070,16953,2,1


In [7]:
# Подрежем количество продуктов, возьмем пока топ 1000
top_popular_product_counts = raw_order_products.groupby('product_id').size().sort_values()[-100:].rename('count').to_frame().reset_index()
top_popular_product_counts

Unnamed: 0,product_id,count
0,21709,34211
1,20995,34248
2,33000,35384
3,7781,35551
4,41220,36028
...,...,...
95,47209,220877
96,21903,251705
97,21137,275577
98,13176,394930


In [8]:
products = top_popular_product_counts.merge(raw_products, how='left', on='product_id')
products

Unnamed: 0,product_id,count,product_name,aisle_id,department_id
0,21709,34211,Sparkling Lemon Water,115,7
1,20995,34248,Organic Broccoli Florets,116,1
2,33000,35384,Pure Irish Butter,36,16
3,7781,35551,Organic Sticks Low Moisture Part Skim Mozzarel...,21,16
4,41220,36028,Organic Romaine Lettuce,83,4
...,...,...,...,...,...
95,47209,220877,Organic Hass Avocado,24,4
96,21903,251705,Organic Baby Spinach,123,4
97,21137,275577,Organic Strawberries,24,4
98,13176,394930,Bag of Organic Bananas,24,4


In [9]:
order_products = products[['product_id']].merge(raw_order_products, how='left', on='product_id')
order_products

Unnamed: 0,product_id,order_id,add_to_cart_order,reordered
0,21709,225,3,1
1,21709,305,3,1
2,21709,359,9,1
3,21709,371,4,1
4,21709,403,10,0
...,...,...,...,...
7795466,24852,3419531,2,1
7795467,24852,3419542,6,0
7795468,24852,3419629,5,1
7795469,24852,3420088,9,1


In [10]:
orders = raw_orders[raw_orders['order_id'].isin(order_products['order_id'].unique())]
orders

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [11]:
user_ids = orders["user_id"].unique()
product_ids = products["product_id"].unique()

dataset = Dataset()
dataset.fit(user_ids, product_ids)

user_mappings = dataset.mapping()[0]
item_mappings = dataset.mapping()[2]

len(user_mappings), len(item_mappings), list(user_mappings.items())[:5]

(194336, 100, [(1, 0), (2, 1), (3, 2), (4, 3), (5, 4)])

In [12]:
# Обратная матрица взаимодействий
inv_user_mappings = {v:k for k, v in user_mappings.items()}
inv_item_mappings = {v:k for k, v in item_mappings.items()}
list(inv_item_mappings.items())[:5]

[(0, 21709), (1, 20995), (2, 33000), (3, 7781), (4, 41220)]

In [13]:
# Создаем матрицу взаимодействий
data = orders.merge(order_products, how='inner', on='order_id').groupby(['user_id', 'product_id']).size()
data = data.to_frame().reset_index().rename(columns={0: 'weight'})
interactions, weights = dataset.build_interactions(data[['user_id', 'product_id', 'weight']].values)
interactions, weights

(<194336x100 sparse matrix of type '<class 'numpy.int32'>'
 	with 2221302 stored elements in COOrdinate format>,
 <194336x100 sparse matrix of type '<class 'numpy.float32'>'
 	with 2221302 stored elements in COOrdinate format>)

In [14]:
interactions.todense(), weights.todense()

(matrix([[0, 0, 0, ..., 0, 1, 0],
         [1, 0, 0, ..., 0, 1, 1],
         [0, 0, 0, ..., 1, 0, 0],
         ...,
         [0, 1, 0, ..., 1, 1, 0],
         [0, 1, 1, ..., 1, 1, 0],
         [0, 0, 0, ..., 0, 1, 1]], dtype=int32),
 matrix([[ 0.,  0.,  0., ...,  0.,  2.,  0.],
         [ 2.,  0.,  0., ...,  0.,  1.,  8.],
         [ 0.,  0.,  0., ...,  1.,  0.,  0.],
         ...,
         [ 0.,  4.,  0., ...,  1.,  8.,  0.],
         [ 0., 14., 21., ..., 15., 22.,  0.],
         [ 0.,  0.,  0., ...,  0.,  6.,  5.]], dtype=float32))

In [15]:
def evaluate_model(model):
    train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=42)

    for metric in [precision_at_k, recall_at_k]:        
        print(f"{metric.__name__}:")
        print(f"\t- train: %.3f" % metric(model, train, k=5).mean())
        print(f"\t- test: %.3f" % metric(model, test, k=5).mean())
        print(f"\t- test new: %.3f" % metric(model, test, train, k=5).mean())

In [16]:
def objective(trial):

    train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=42)

    param = {
        'no_components': trial.suggest_int("no_components", 5, 64),
        "learning_schedule": trial.suggest_categorical("learning_schedule", ["adagrad", "adadelta"]),
        "loss":  trial.suggest_categorical("loss", ["bpr", "warp", "warp-kos"]),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 1),
        "item_alpha": trial.suggest_float("item_alpha", 1e-10, 1e-06, log=True),
        "user_alpha": trial.suggest_float("user_alpha", 1e-10, 1e-06, log=True), 
        "max_sampled": trial.suggest_int("max_sampled", 5, 15),
    }
    epochs = trial.suggest_int("epochs", 20, 50)

    model = LightFM(**param, random_state=123)
    model.fit(train, epochs=epochs, verbose=True)

    val_precision = precision_at_k(model, test, k=5).mean()

    return val_precision

study = optuna.create_study(direction="maximize")

study.enqueue_trial(params={
    "no_components": 27, 
    "learning_schedule": 'adagrad', 
    "loss": 'warp',
    "learning_rate": 0.15,
    "item_alpha": 1e-10, 
    "user_alpha": 1e-8, 
    "max_sampled": 15,
    "epochs": 30,
})

study.optimize(objective, n_trials=50)

best_params = study.best_params
for k, v in best_params.items():
    print(k, ":", v)

optuna.importance.get_param_importances(study)

[I 2024-04-01 05:20:26,668] A new study created in memory with name: no-name-8dc85b50-e320-473e-8fca-e0fc99b0123b
Epoch: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:50<00:00,  1.68s/it]
[I 2024-04-01 05:21:17,760] Trial 0 finished with value: 0.01078157126903534 and parameters: {'no_components': 27, 'learning_schedule': 'adagrad', 'loss': 'warp', 'learning_rate': 0.15, 'item_alpha': 1e-10, 'user_alpha': 1e-08, 'max_sampled': 15, 'epochs': 30}. Best is trial 0 with value: 0.01078157126903534.
Epoch: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [01:47<00:00,  2.39s/it]
[I 2024-04-01 05:23:05,985] Trial 1 finished with value: 0.004687521606683731 and parameters: {'no_components': 41, 'learning_schedule': 'adagrad', 'loss': 'bpr', 'learning_rate':

no_components : 5
learning_schedule : adagrad
loss : warp
learning_rate : 0.7510216830421954
item_alpha : 6.201256477126263e-09
user_alpha : 9.592173442318563e-09
max_sampled : 12
epochs : 46


{'no_components': 0.7439155645124513,
 'learning_rate': 0.1198586398843038,
 'loss': 0.04274122323049125,
 'epochs': 0.03958569244936504,
 'max_sampled': 0.038661002616642674,
 'learning_schedule': 0.009014772485254851,
 'user_alpha': 0.0032080572477470796,
 'item_alpha': 0.0030150475737440734}

In [17]:
if 'epochs' in best_params:
    num_epochs = best_params['epochs']
    del best_params['epochs']

model = LightFM(**best_params, random_state=123)

model.fit(
    interactions,
    epochs=num_epochs,
    verbose=True
)

evaluate_model(model)

Epoch: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 46/46 [00:51<00:00,  1.11s/it]


precision_at_k:
	- train: 0.189
	- test: 0.062
	- test new: 0.088
recall_at_k:
	- train: 0.139
	- test: 0.121
	- test new: 0.154


In [18]:
# Сравним с топ 5 рекомендацией
# todo сделать подсчет метрики для новых пользователей

k = 5
static_prediction = top_popular_product_counts[-k:]['product_id']
static_prediction_ind = list(map(lambda x: item_mappings[x], static_prediction))

def static_precision_at_k(interactions, suppress_interactions=None):
    interactions = interactions.copy()
    interactions.data = np.isin(interactions.col, static_prediction_ind)
    hit = np.squeeze(np.array(interactions.sum(axis=1)))
    return hit / k

def static_recall_at_k(interactions, suppress_interactions=None):
    retrieved = np.squeeze(interactions.getnnz(axis=1))

    interactions = interactions.copy()
    interactions.data = np.isin(interactions.col, static_prediction_ind)
    hit = np.squeeze(np.array(interactions.sum(axis=1)))

    hit = hit[retrieved > 0]
    retrieved = retrieved[retrieved > 0]

    return hit / retrieved


train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=42)
for metric in [static_precision_at_k, static_recall_at_k]:        
    print(f"{metric.__name__}:")
    print(f"\t- train: %.3f" % metric(train).mean())
    print(f"\t- test: %.3f" % metric(test).mean())
    # print(f"\t- test new: %.3f" % metric(test, train).mean())

static_precision_at_k:
	- train: 0.250
	- test: 0.063
static_recall_at_k:
	- train: 0.165
	- test: 0.155


In [19]:
# Пример инференса

class Model:

    def __init__(
        self,
        model,
        dataset,
    ):
        self.model = model
        user_mappings, _, item_mappings, _ = dataset.mapping()
        self.internal_user_ids = list(user_mappings.values())
        self.internal_product_ids = list(item_mappings.values())
        self.inv_item_mappings = {v: k for k, v in item_mappings.items()}
        self.rng = np.random.default_rng()

    def get_recommendations(self):
        similar_user_id = int(self.rng.choice(self.internal_user_ids))
        prediction = self.model.predict(
            user_ids=similar_user_id,
            item_ids=self.internal_product_ids,
        )
        prediction = np.argsort(prediction)[-1:-6:-1]
        prediction = [self.inv_item_mappings[x] for x in prediction]
        return prediction

recs_model = Model(model, dataset)
recs_product_ids = recs_model.get_recommendations()
recs_product_ids

[5785, 24852, 13176, 21137, 21903]

In [20]:
%%timeit
recs_model.get_recommendations()

5.05 ms ± 137 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [21]:
recs = products[products['product_id'].isin(recs_product_ids)]
recs

Unnamed: 0,product_id,count,product_name,aisle_id,department_id
36,5785,49374,Organic Reduced Fat 2% Milk,84,16
96,21903,251705,Organic Baby Spinach,123,4
97,21137,275577,Organic Strawberries,24,4
98,13176,394930,Bag of Organic Bananas,24,4
99,24852,491291,Banana,24,4


In [22]:
# Сохраним модель
with open('../artifacts/v0.0.1', 'wb') as file:
    pickle.dump(model, file, protocol=pickle.HIGHEST_PROTOCOL)