In [1]:
# from google.colab import drive
# drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
# !pip install lightfm



In [2]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix
from implicit.nearest_neighbours import bm25_weight
from lightfm import LightFM

import warnings
warnings.filterwarnings('ignore')

In [3]:
# df = pd.read_csv('/gdrive/My Drive/Colab Notebooks/sber/train.csv')
df = pd.read_csv('train.csv')

In [4]:
df.shape

(3123064, 3)

In [5]:
df.head()

Unnamed: 0,user_id,order_completed_at,cart
0,2,2015-03-22 09:25:46,399
1,2,2015-03-22 09:25:46,14
2,2,2015-03-22 09:25:46,198
3,2,2015-03-22 09:25:46,88
4,2,2015-03-22 09:25:46,157


In [6]:
df.tail()

Unnamed: 0,user_id,order_completed_at,cart
3123059,12702,2020-09-03 23:45:45,441
3123060,12702,2020-09-03 23:45:45,92
3123061,12702,2020-09-03 23:45:45,431
3123062,12702,2020-09-03 23:45:45,24
3123063,12702,2020-09-03 23:45:45,430


In [7]:
# Привожу признак "order_completed_at" к формату datetime
df.order_completed_at = pd.to_datetime(df.order_completed_at, format='%Y-%m-%d %H:%M:%S')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3123064 entries, 0 to 3123063
Data columns (total 3 columns):
 #   Column              Dtype         
---  ------              -----         
 0   user_id             int64         
 1   order_completed_at  datetime64[ns]
 2   cart                int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 71.5 MB


In [9]:
print('Количество пропущенных значений:')
for col in df.columns.to_list():
    print(f'{col}: {df[col].isna().sum()}')

Количество пропущенных значений:
user_id: 0
order_completed_at: 0
cart: 0


In [10]:
print(f'Количество пользователей: {df.user_id.nunique()}')
print(f'Количество товаров: {df.cart.nunique()}')

Количество пользователей: 20000
Количество товаров: 881


In [11]:
print(f'Начальная дата: {df.order_completed_at.min()}')
print(f'Конечная дата: {df.order_completed_at.max()}')

Начальная дата: 2015-03-22 09:25:46
Конечная дата: 2020-09-03 23:45:45


In [12]:
df['is_bought'] = 1

Разбиваю датасет на тренировочный и валидационный. Для валидации возьму заказы с датой после 1 августа 2020 года.

In [13]:
train_df = df.loc[df.order_completed_at < datetime.strptime('2020-08-01 00:00:00', '%Y-%m-%d %H:%M:%S'), :]
valid_df = df.loc[df.order_completed_at >= datetime.strptime('2020-08-01 00:00:00', '%Y-%m-%d %H:%M:%S'), :]

In [14]:
train_df.head(2)

Unnamed: 0,user_id,order_completed_at,cart,is_bought
0,2,2015-03-22 09:25:46,399,1
1,2,2015-03-22 09:25:46,14,1


In [15]:
valid_df.head(2)

Unnamed: 0,user_id,order_completed_at,cart,is_bought
2612502,18490,2020-08-01 00:15:22,443,1
2612503,18490,2020-08-01 00:15:22,119,1


In [16]:
print(f'Тренировка.\nКоличество пользователей: {train_df.user_id.nunique()}, количество товаров: {train_df.cart.nunique()}')
print(f'Валидация:\nКоличество пользователей: {valid_df.user_id.nunique()}, количество товаров: {valid_df.cart.nunique()}')

Тренировка.
Количество пользователей: 19270, количество товаров: 834
Валидация:
Количество пользователей: 13799, количество товаров: 744


In [17]:
# Оставляю только общих пользователей и товары
common_users = train_df.user_id.unique()
common_items = train_df.cart.unique()

valid_df = valid_df[valid_df.user_id.isin(common_users)]
valid_df = valid_df[valid_df.cart.isin(common_items)]
print(f'Тренировка.\nКоличество пользователей: {train_df.user_id.nunique()}, количество товаров: {train_df.cart.nunique()}')
print(f'Валидация:\nКоличество пользователей: {valid_df.user_id.nunique()}, количество товаров: {valid_df.cart.nunique()}')

Тренировка.
Количество пользователей: 19270, количество товаров: 834
Валидация:
Количество пользователей: 13068, количество товаров: 690


Подготавливаю датасет

In [18]:
user_item_matrix = pd.pivot_table(train_df, 
                                  index='user_id', columns='cart', 
                                  values='is_bought',
                                  aggfunc='count', 
                                  fill_value=0
                                 )
user_item_matrix.head(2)

cart,0,1,2,3,4,5,6,7,8,9,...,858,859,863,865,866,867,868,869,873,880
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
user_item_matrix.shape

(19270, 834)

In [20]:
# Подготавливаю словари
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [21]:
# Удаляю order_completed_at из датафрейма
train_df.drop('order_completed_at', axis=1, inplace=True)
valid_df.drop('order_completed_at', axis=1, inplace=True)

In [22]:
%%time
new_train_data = []
for user in train_df.user_id.unique():
    bought_items = train_df.loc[train_df.user_id == user, 'cart'].to_list()

    for i in range(len(bought_items)):
        new_item = np.random.choice(train_df.cart.unique(), size=1)[0]
        if new_item not in bought_items:
            new_train_data.append({'user_id': user, 'cart': new_item})

Wall time: 12h 3min 22s


In [23]:
additional_train_df = pd.DataFrame(new_train_data)
additional_train_df['is_bought'] = 0
additional_train_df.head()

Unnamed: 0,user_id,cart,is_bought
0,2,140,0
1,2,292,0
2,2,609,0
3,2,436,0
4,2,338,0


In [24]:
additional_train_df.to_csv('additional_train_df.csv', index=False)

In [25]:
# additional_train_df = pd.read_csv('additional_train_df.csv')

In [26]:
train_df = train_df.append(additional_train_df, ignore_index=True)
train_df.shape

(4948537, 3)

In [23]:
# new_valid_data = []
# for user in valid_df.user_id.unique():
#   bought_items = valid_df.loc[valid_df.user_id == user, 'cart'].to_list()

#   for i in range(len(bought_items)):
#     new_item = np.random.choice(valid_df.cart.unique(), size=1)[0]
#     if new_item not in bought_items:
#       new_valid_data.append({'user_id': user, 'cart': new_item})

In [25]:
# additional_valid_df = pd.DataFrame(new_valid_data)
# additional_valid_df['is_bought'] = 0
# additional_valid_df.head()

NameError: ignored

In [None]:
# additional_valid_df.to_csv('/gdrive/My Drive/Colab Notebooks/sber/additional_valid_df.csv', index=False)

In [28]:
# additional_valid_df = pd.read_csv('/gdrive/My Drive/Colab Notebooks/sber/additional_valid_df.csv')
additional_valid_df = pd.read_csv('additional_valid_df.csv')

In [29]:
valid_df = valid_df.append(additional_valid_df, ignore_index=True)
valid_df.shape

(922500, 3)

In [30]:
train_df = train_df.sample(frac=1).reset_index(drop=True)
valid_df = valid_df.sample(frac=1).reset_index(drop=True)

In [31]:
X_train = train_df.drop('is_bought', axis=1)
y_train = train_df['is_bought']

X_valid = valid_df.drop('is_bought', axis=1)
y_valid = valid_df['is_bought']

In [32]:
# подготавливаю id для юзеров и товаров в порядке пар user-item
users_ids_row_train = X_train['user_id'].apply(lambda x: userid_to_id[x]).values.astype(int)
items_ids_row_train = X_train['cart'].apply(lambda x: itemid_to_id[x]).values.astype(int)

users_ids_row_valid = X_valid['user_id'].apply(lambda x: userid_to_id[x]).values.astype(int)
items_ids_row_valid = X_valid['cart'].apply(lambda x: itemid_to_id[x]).values.astype(int)

Обучаю модель

In [33]:
user_item_matrix = user_item_matrix.astype(float)
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

In [34]:
model = LightFM(no_components=40,
                loss='warp',
                learning_rate=0.01, 
                item_alpha=0.4,
                user_alpha=0.1,
                max_sampled=100)

In [35]:
model.fit((sparse_user_item > 0) * 1,
          sample_weight=coo_matrix(user_item_matrix),
          epochs=20, 
          num_threads=4,
          verbose=True)

Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 20/20 [03:02<00:00,  9.10s/it]


<lightfm.lightfm.LightFM at 0x197d22713a0>

In [36]:
# модель возвращает меру/скор похожести между соответствующим пользователем и товаром
predictions_train = model.predict(user_ids=users_ids_row_train,
                            item_ids=items_ids_row_train,
                            num_threads=4)

predictions_valid = model.predict(user_ids=users_ids_row_valid,
                            item_ids=items_ids_row_valid,
                            num_threads=4)

In [59]:
# from sklearn.preprocessing import normalize, StandardScaler

In [56]:
# norm_preds = normalize(predictions.reshape(-1, 1))

In [60]:
# std = StandardScaler()

In [63]:
# std_preds = std.fit_transform(predictions.reshape(-1, 1))

In [65]:
# min(std_preds)

array([-555.2281], dtype=float32)

In [37]:
threshold = 0

In [38]:
y_train_preds = np.where(predictions_train > threshold, 1, 0)
y_valid_preds = np.where(predictions_valid > threshold, 1, 0)

In [39]:
f1_score(y_train, y_train_preds)

0.7677272257858653

In [40]:
f1_score(y_valid, y_valid_preds)

0.7697306749547065

GRID SEARCH

In [64]:
from itertools import product

In [95]:
no_components_list = [20, 40, 60]
loss_list = ['warp', 'bpr']
learning_rate_list = [0.01]
item_alpha_list = [0.2, 0.4, 0.6]
user_alpha_list = [0.1, 0.3, 0.5]
max_sampled_list = [50, 100, 150]

In [96]:
def valid_model(no_components, loss, learning_rate, item_alpha, user_alpha, max_sampled, thr):
    
    model_ = LightFM(no_components=no_components, loss=loss, learning_rate=learning_rate, 
                     item_alpha=item_alpha, user_alpha=user_alpha, max_sampled=max_sampled)
    model_.fit((sparse_user_item > 0) * 1,
              sample_weight=coo_matrix(user_item_matrix),
              epochs=20, 
              num_threads=4)
    predictions_train_ = model_.predict(user_ids=users_ids_row_train,
                            item_ids=items_ids_row_train,
                            num_threads=4)

    predictions_valid_ = model_.predict(user_ids=users_ids_row_valid,
                            item_ids=items_ids_row_valid,
                            num_threads=4)
    y_train_preds_ = np.where(predictions_train_ > thr, 1, 0)
    y_valid_preds_ = np.where(predictions_valid_ > thr, 1, 0)
    return f1_score(y_train, y_train_preds_), f1_score(y_valid, y_valid_preds_)

In [99]:
# %%time
# gs_result = []
# for no_components, loss, learning_rate, item_alpha, user_alpha, max_sampled in product(
#         no_components_list, loss_list, learning_rate_list, item_alpha_list, 
#         user_alpha_list, max_sampled_list):
#     f1_train, f1_valid = valid_model(no_components, loss, learning_rate, item_alpha, 
#                                      user_alpha, max_sampled, 0)
#     gs_result.append({'f1_train': f1_train, 'f1_valid': f1_valid, 'no_components': no_components,
#                      'loss': loss, 'learning_rate': learning_rate, 'item_alpha': item_alpha,
#                      'user_alpha': user_alpha, 'max_sampled': max_sampled})

Wall time: 6h 34min 14s


In [100]:
import json
with open('gs_result.json', 'w') as f:
    json.dump(gs_result, f)

In [107]:
gs_df = pd.DataFrame(gs_result)

In [108]:
gs_df.head()

Unnamed: 0,f1_train,f1_valid,no_components,loss,learning_rate,item_alpha,user_alpha,max_sampled
0,0.878673,0.869371,20,warp,0.01,0.2,0.1,50
1,0.885682,0.873681,20,warp,0.01,0.2,0.1,100
2,0.888318,0.875106,20,warp,0.01,0.2,0.1,150
3,0.918947,0.886332,20,warp,0.01,0.2,0.3,50
4,0.91894,0.884842,20,warp,0.01,0.2,0.3,100


In [109]:
gs_df.iloc[np.argmax(gs_df.f1_train)]

f1_train         0.918947
f1_valid         0.886332
no_components          20
loss                 warp
learning_rate        0.01
item_alpha            0.2
user_alpha            0.3
max_sampled            50
Name: 3, dtype: object

In [110]:
gs_df.iloc[np.argmax(gs_df.f1_valid)]

f1_train         0.917503
f1_valid         0.887314
no_components          40
loss                 warp
learning_rate        0.01
item_alpha            0.2
user_alpha            0.3
max_sampled           150
Name: 59, dtype: object

In [111]:
new_model = LightFM(no_components=40,
                loss='warp',
                learning_rate=0.01, 
                item_alpha=0.2,
                user_alpha=0.3,
                max_sampled=150)

new_model.fit((sparse_user_item > 0) * 1,
          sample_weight=coo_matrix(user_item_matrix),
          epochs=20, 
          num_threads=4,
          verbose=True)

Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 20/20 [05:29<00:00, 16.48s/it]


<lightfm.lightfm.LightFM at 0x19794573220>

In [112]:
new_predictions_train = new_model.predict(user_ids=users_ids_row_train,
                            item_ids=items_ids_row_train,
                            num_threads=4)

new_predictions_valid = new_model.predict(user_ids=users_ids_row_valid,
                            item_ids=items_ids_row_valid,
                            num_threads=4)

In [113]:
new_y_train_preds = np.where(new_predictions_train > threshold, 1, 0)
new_y_valid_preds = np.where(new_predictions_valid > threshold, 1, 0)

In [114]:
f1_score(y_train, new_y_train_preds)

0.9186041391168712

In [115]:
f1_score(y_valid, new_y_valid_preds)

0.8858706910479256

Подберу оптимальное значение threshhold

In [78]:
thresholds_list = []
for thr in np.arange(-0.5, 0.51, 0.05):
    preds_ = np.where(std_preds > thr, 1, 0)
    # std_preds_ = std.transform(preds_.reshape(-1, 1))
    f_score_ = f1_score(y_val, preds_)
    thresholds_list.append({'threshold': thr, 'F1_score': f_score_})

In [79]:
thresholds_df = pd.DataFrame(thresholds_list)
thresholds_df

Unnamed: 0,threshold,F1_score
0,-0.5,0.679721
1,-0.45,0.679721
2,-0.4,0.679722
3,-0.35,0.679724
4,-0.3,0.679728
5,-0.25,0.679732
6,-0.2,0.679733
7,-0.15,0.679732
8,-0.1,0.679744
9,-0.05,0.679737


In [80]:
thresholds_df.iloc[np.argmax(thresholds_df.F1_score)]

threshold   -1.110223e-16
F1_score     6.798534e-01
Name: 10, dtype: float64

In [76]:
threshold = 0
y_preds = np.where(std_preds > threshold, 1, 0)

In [77]:
f1_score(y_val, y_preds)

0.6798534368037586

## Обучаю модель на всем датасете

In [104]:
test_user_item_matrix = pd.pivot_table(df, 
                                  index='user_id', columns='cart', 
                                  values='is_bought',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

test_user_item_matrix = test_user_item_matrix.astype(float)
test_sparse_user_item = csr_matrix(test_user_item_matrix).tocsr()

In [109]:
# Подготавливаю словари
test_userids = test_user_item_matrix.index.values
test_itemids = test_user_item_matrix.columns.values

test_matrix_userids = np.arange(len(test_userids))
test_matrix_itemids = np.arange(len(test_itemids))

test_id_to_itemid = dict(zip(test_matrix_itemids, itemids))
test_id_to_userid = dict(zip(test_matrix_userids, userids))

test_itemid_to_id = dict(zip(test_itemids, test_matrix_itemids))
test_userid_to_id = dict(zip(test_userids, test_matrix_userids))

In [105]:
final_model = LightFM(no_components=40,
                      loss='warp',
                      learning_rate=0.01, 
                      item_alpha=0.4,
                      user_alpha=0.1, 
                      random_state=42,
                      k=5,
                      n=15,
                      max_sampled=100)

final_model.fit((test_sparse_user_item > 0) * 1,
                sample_weight=coo_matrix(test_user_item_matrix),
                epochs=20, 
                num_threads=4,
                verbose=True)

Epoch: 100%|██████████| 20/20 [00:54<00:00,  2.75s/it]


<lightfm.lightfm.LightFM at 0x7f343ac66d50>

## Прогноз для каждой пары пользователь-категория из примера сабмита

In [106]:
submission_df = pd.read_csv('/gdrive/My Drive/Colab Notebooks/sber/sample_submission.csv')
submission_df.drop('target', axis=1, inplace=True)
submission_df.reset_index(drop=True)
submission_df.head()

Unnamed: 0,id
0,0;133
1,0;5
2,0;10
3,0;396
4,0;14


In [107]:
test_df = submission_df['id'].str.split(';', expand=True)
test_df.columns = ['user_id', 'category_id']
test_df = test_df.astype('int')
test_df.head()

Unnamed: 0,user_id,category_id
0,0,133
1,0,5
2,0,10
3,0,396
4,0,14


In [108]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 790449 entries, 0 to 790448
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   user_id      790449 non-null  int64
 1   category_id  790449 non-null  int64
dtypes: int64(2)
memory usage: 12.1 MB


In [110]:
test_users_ids_row = test_df['user_id'].apply(lambda x: test_userid_to_id[x]).values.astype(int)
test_items_ids_row = test_df['category_id'].apply(lambda x: test_itemid_to_id[x]).values.astype(int)

In [121]:
test_predictions = final_model.predict(user_ids=test_users_ids_row,
                            item_ids=test_items_ids_row,
                            num_threads=4)
test_preds = np.where(test_predictions > -0.25, 1, 0)

In [122]:
submission_df['target'] = test_preds
submission_df.head()

Unnamed: 0,id,target
0,0;133,1
1,0;5,1
2,0;10,1
3,0;396,1
4,0;14,1


In [123]:
submission_df.target.value_counts()

1    764389
0     26060
Name: target, dtype: int64

In [125]:
submission_df.to_csv('/gdrive/My Drive/Colab Notebooks/sber/AVasilev_submission.csv', sep=',', index=False)