In [1]:
import pickle
import sys
from os import path

import numpy as np
import pandas as pd
import optuna
import torch
from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split
from spotlight.evaluation import precision_recall_score

sys.path.insert(0, '..')  # Важно чтобы работали импорты из корня репозитория как в инференсе

# Вместо sklearn.show_versions() мы можем проверить версии других библиотек
print(torch.__version__)
print(pd.__version__)
print(np.__version__)

2.2.2+cu121
2.2.2
1.26.4


In [2]:
import pandas as pd
from os import path
import numpy as np
from spotlight.interactions import Interactions

data_path = '../data'
dataset_path = path.join(data_path, 'instacart-market-basket-analysis')

# Загружаем данные
raw_orders = pd.read_csv(path.join(dataset_path, 'orders.csv'))
raw_products = pd.read_csv(path.join(dataset_path, 'products.csv'))
raw_order_products = pd.concat([
    pd.read_csv(path.join(dataset_path, 'order_products__prior.csv')),
    pd.read_csv(path.join(dataset_path, 'order_products__train.csv'))
])

# Смотрим заказы одного пользователя для примера
print(raw_orders[(raw_orders["user_id"] == 1)].sort_values(["order_number"]))

# Получаем топ-20 популярных продуктов
top_popular_product_counts = raw_order_products.groupby('product_id').size().sort_values()[-2500:].rename('count').reset_index()
products = top_popular_product_counts.merge(raw_products, on='product_id')
order_products = products[['product_id']].merge(raw_order_products, on='product_id')
orders = raw_orders[raw_orders['order_id'].isin(order_products['order_id'].unique())]

# Создаем маппинги
user_id_mapping = {user_id: i for i, user_id in enumerate(orders['user_id'].unique())}
product_id_mapping = {product_id: i for i, product_id in enumerate(products['product_id'].unique())}

# Преобразовываем user_id и product_id с использованием маппинга
grouped_data = orders.merge(order_products, on='order_id').groupby(['user_id', 'product_id']).size()
grouped_data = grouped_data.to_frame().reset_index().rename(columns={0: 'weight'})
grouped_data['user_id'] = grouped_data['user_id'].map(user_id_mapping)
grouped_data['product_id'] = grouped_data['product_id'].map(product_id_mapping)

# Создаем объект Interactions с весами
user_ids = np.array(grouped_data['user_id'], dtype=np.int32)
product_ids = np.array(grouped_data['product_id'], dtype=np.int32)
weights = np.array(grouped_data['weight'], dtype=np.float32)
timestamps = np.zeros(len(user_ids), dtype=np.int32)  # Подставляем заглушку для timestamps

interactions = Interactions(user_ids, product_ids, timestamps, weights)

# Проверка
print(f'Number of unique users: {len(np.unique(user_ids))}')
print(f'Number of unique items: {len(np.unique(product_ids))}')


    order_id  user_id eval_set  order_number  order_dow  order_hour_of_day  \
0    2539329        1    prior             1          2                  8   
1    2398795        1    prior             2          3                  7   
2     473747        1    prior             3          3                 12   
3    2254736        1    prior             4          4                  7   
4     431534        1    prior             5          4                 15   
5    3367565        1    prior             6          2                  7   
6     550135        1    prior             7          1                  9   
7    3108588        1    prior             8          1                 14   
8    2295261        1    prior             9          1                 16   
9    2550362        1    prior            10          4                  8   
10   1187899        1    train            11          4                  8   

    days_since_prior_order  
0                      NaN  
1    

In [3]:
import torch
from spotlight.factorization.explicit import ExplicitFactorizationModel
from spotlight.factorization.implicit import ImplicitFactorizationModel
from spotlight.evaluation import precision_recall_score
import numpy as np
from spotlight.cross_validation import random_train_test_split

def evaluate_model(model, interactions, test_percentage=0.2, random_state=None):
    random_state = np.random.RandomState(random_state)
    train, test = random_train_test_split(interactions, test_percentage=test_percentage, random_state=random_state)
    train_mrr, test_mrr = precision_recall_score(model, train, test, k=5)

    print(f"Precision@K:")
    print(f"\t- train: {train_mrr[0].mean():.3f}")
    print(f"\t- test: {test_mrr[0].mean():.3f}")

def objective(trial):
    random_state = np.random.RandomState(42)
    train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=random_state)

    # Define parameters
    param = {
        'embedding_dim': trial.suggest_int("embedding_dim", 5, 64),
        'n_iter': trial.suggest_int("n_iter", 1, 2),
        'learning_rate': trial.suggest_float("learning_rate", 0.001, 1),
        'l2': trial.suggest_float("l2", 1e-10, 1e-06, log=True)
    }

    # Create model
    model = ImplicitFactorizationModel(loss='bpr', **param)
    model.fit(train, verbose=True)

    # Evaluate model
    test_precision, test_recall = precision_recall_score(model, train, test, k=5)
    val_precision = test_precision.mean()

    return val_precision

# Hyperparameter optimization
import optuna
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=1)
# 
# best_params = study.best_params
# for k, v in best_params.items():
#     print(k, ":", v)
# 

best_params = {
    'embedding_dim': 9,
    'n_iter': 1,
    'learning_rate': 0.01,
    'l2': 1e-08,
}

# Train the final model
final_model = ImplicitFactorizationModel(loss='bpr', **best_params)
final_model.fit(interactions, verbose=True)

# Evaluate the final model
evaluate_model(final_model, interactions)


Epoch 0: loss 0.23196373832925865
Precision@K:
	- train: 0.400
	- test: 0.125


In [7]:
import numpy as np
import torch
from spotlight.cross_validation import random_train_test_split


# Класс Model
class Model:
    def __init__(self, model, interactions, user_mappings, product_mappings):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model
        self.user_mappings = user_mappings
        self.product_mappings = product_mappings
        self.internal_user_ids = list(user_mappings.values())
        self.internal_product_ids = list(product_mappings.values())
        self.inv_product_mappings = {v: k for k, v in product_mappings.items()}
        self.rng = np.random.default_rng()

    def get_recommendations(self):
        print(f"Model initialized on device: {self.device}")
        similar_user_id = int(self.rng.choice(self.internal_user_ids))

        # Assuming predict method expects tensors already on the correct device
        with torch.no_grad():
            prediction = self.model.predict(similar_user_id, np.array(self.internal_product_ids))  # Adjust according to actual method signature

        top_indices = np.argsort(prediction)[-1:-6:-1]  # Top 5 recommendations
        top_product_ids = [self.inv_product_mappings[idx] for idx in top_indices]

        return top_product_ids

# Пример использования нового класса Model
user_mappings = {user_id: i for i, user_id in enumerate(np.unique(user_ids))}
product_mappings = {product_id: i for i, product_id in enumerate(np.unique(product_ids))}
recs_model = Model(final_model, interactions, user_mappings, product_mappings)
recs_model.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
recs_product_ids = recs_model.get_recommendations()
recs = products[products['product_id'].isin(recs_product_ids)]
print(recs)

# Сериализация модели
import pickle
with open('../recommendation/0-0-3.pickle', 'wb') as file:
    pickle.dump(recs_model, file, protocol=pickle.HIGHEST_PROTOCOL)


Model initialized on device: cuda
Empty DataFrame
Columns: [product_id, count, product_name, aisle_id, department_id]
Index: []
