# Orders Model
This notebook trains a model that predicts which aids a user is going to order. The same notebook was also used for cross-validating the orders model. Unlike the clicks model, prediction for orders is made in a separate notebook: on the Kaggle platform, notebooks with a GPU have less memory available, and it was hard to fit all the required data into the 13 GB of available RAM, so I had to move prediction to a different notebook without GPU support but with 30 GB of RAM available.
For orders, the prediction is made using two different models, an LGBM model and a CatBoost model, trained on two cross-validation datasets.
This notebook uses input from two "parallel" notebooks that produce w2vec features for orders: one for one of the cross-validation sets and half of the test set, and the other for the other cross-validation set and the other half of the test set.
## Imports and definitions

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc
from humanize import naturalsize
from lightgbm.sklearn import LGBMRanker
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import GroupKFold
import joblib

# functions and classes common for several notebooks of current project
import otto_common

In [2]:
def prepare_df(df):
    """Remove helper columns that must not be passed on to the models.

    This hook was used to try out new features before adding them to the
    pipeline. It now only removes the ``day_of_week`` column, which is used
    to construct some features.

    The frame is modified in place (no copy is made) and the very same object
    is returned. A ``KeyError`` is raised when ``day_of_week`` is not present,
    e.g. when the function is applied twice to the same frame.
    """
    # pop() deletes the column from the frame itself; the returned Series is discarded.
    df.pop('day_of_week')
    return df

## Load and prepare data

In [3]:
# Load the train/cross-validation data.
# The cv1 file loaded here is the one both cross-validation cells run on and,
# when CROSS_VALIDATE is False, the one the LGBM model is fitted on.
# The commented-out line below switches this cell to the cv2 file (the file the
# catboost model is fitted on later in this notebook).

df_train = pd.read_parquet('/kaggle/input/otto-orders-w2vec/train_features_with_w2v_cv1.parquet')
#df_train = pd.read_parquet('/kaggle/input/otto-orders-w2vec-part1/train_features_with_w2v_cv2.parquet')

In [4]:
# A few checks and preparations.
# NOTE: prepare_df() deletes a column in place, so this cell cannot be re-run
# without re-running the loading cell first.
df_train = prepare_df(df_train)

# Every (session, order_predictions) pair must occur at most once.
# duplicated().any() gives the same verdict as counting the rows flagged with
# keep=False, but does not build a filtered copy of the frame.
assert not df_train.duplicated(subset=['session', 'order_predictions']).any()

# `deep` is a boolean flag; the string 'True' only worked because it is truthy.
size = df_train.memory_usage(deep=True).sum()
print(naturalsize(size))

1.0 GB


In [5]:
# Hyper-parameters of the LGBM ranking model, grouped by purpose.
parameters = {
    # learning task
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'boosting_type': 'gbdt',
    'importance_type': 'gain',
    # tree shape
    'num_leaves': 128,
    'max_depth': 8,
    'min_child_samples': 100,
    # boosting schedule
    'n_estimators': 299,
    'learning_rate': 0.07,
    # hardware: train on the first GPU of the first platform
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
}
model = LGBMRanker(**parameters)

print('model_defined')


model_defined


In [6]:
# Hyper-parameters of the catboost ranking model, grouped by purpose.
catboost_parameters = {
    # learning task
    'loss_function': 'QuerySoftMax',
    # tree shape and boosting schedule
    'depth': 8,
    'iterations': 800,
    'learning_rate': 0.12,
    # reproducibility and logging (one progress line every 50 iterations)
    'random_seed': 0,
    'verbose': 50,
    # hardware
    'task_type': 'GPU',
}
model_catboost = CatBoostRanker(**catboost_parameters)
print('model_defined')

model_defined


In [7]:
# Global settings shared by the cross-validation and the submission code paths.

# True  -> run the cross-validation cells only.
# False -> fit and save the models used for the submission.
CROSS_VALIDATE = False

# Fraction of the rows with target == False that is dropped from the training
# data to reduce memory usage.
frac = 0.4

# Feature columns: everything after the first three columns of df_train.
# NOTE(review): the first three columns are presumably session,
# order_predictions and target - confirm against the feature notebooks.
x_cols = [*df_train.columns[3:]]

## Cross-validation

In [8]:
%%time
# Cell to cross-validate the LGBM model.

if CROSS_VALIDATE:
    # Define the splits and prepare a column to save results.
    n_splits = 4
    groups_by_session = df_train['session'].copy().tolist()
    group_kfold = GroupKFold(n_splits=n_splits)    
    df_importances = pd.DataFrame({'columns':x_cols})
    df_train['cv_prediction'] = -1
    df_train['cv_prediction'] = df_train['cv_prediction'].astype(np.float32)
    
    # Fit the model and save the results.
    for i, (train_index, test_index) in enumerate(group_kfold.split(df_train[x_cols], df_train['target'], groups_by_session)):
        train_index = otto_common.remove_frac(train_index, df_train, frac)
        gc.collect()
        print('start_fitting')
        model.fit(
            df_train[x_cols].iloc[train_index],
            df_train.iloc[train_index, 2].astype(np.int8),
            group=df_train.iloc[train_index].groupby('session').size(),
        )
        column_name = 'imp_' + str(i)
        df_importances[column_name] = model.feature_importances_
        df_train['cv_prediction'].iloc[test_index] = model.predict(df_train[x_cols].iloc[test_index])
        gc.collect()
    del groups_by_session, group_kfold, train_index, test_index
    gc.collect()
    df_importances['imp_avg'] = df_importances.mean(axis=1)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.15 µs


In [9]:
# View feature_importances. Two cells were used to print feature importances so that it would be possible to compare the values between two runs.
#df_importances

In [10]:
#df_importances

In [11]:
# Print the LGBM cross-validation results.
# NOTE(review): 311027 is an unexplained magic number handed to
# otto_common.calculate_recall - presumably the total number of ground-truth
# orders used as the recall denominator; confirm against otto_common and
# consider moving it to the global-parameters cell.
if CROSS_VALIDATE:
    otto_common.calculate_recall(df_train, 'cv_prediction', 311027)

In [12]:
%%time
# Cell to cross-validate the catboost model.

if CROSS_VALIDATE:
    n_splits = 4
    groups_by_session = df_train['session'].copy().tolist()
    group_kfold = GroupKFold(n_splits=n_splits)    
    df_importances = pd.DataFrame({'columns':x_cols})
    df_train['catboost_prediction'] = -1
    df_train['catboost_prediction'] = df_train['catboost_prediction'].astype(np.float32)
    
    # Fitting the model and saving the results.
    for i, (train_index, test_index) in enumerate(group_kfold.split(df_train[x_cols], df_train['target'], groups_by_session)):
        train_index = otto_common.remove_frac(train_index, df_train, frac)
        train_pool = Pool(
            data=df_train[x_cols].iloc[train_index],
            label=df_train.iloc[train_index, 2].astype(np.int8),
            group_id=df_train.iloc[train_index, 0]
        )
        cv_pool = Pool(
            data=df_train[x_cols].iloc[test_index],
            label=df_train.iloc[test_index, 2].astype(np.int8),
            group_id=df_train.iloc[test_index, 0]
        )
        gc.collect()
        print('start_fitting')
        model_catboost.fit(train_pool, eval_set=cv_pool)

        df_train['catboost_prediction'].iloc[test_index] = model_catboost.predict(cv_pool)
        del train_pool, cv_pool
        gc.collect()
    del groups_by_session, group_kfold, train_index, test_index
    gc.collect()

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 6.68 µs


In [13]:
# Print the catboost cross-validation results.
# NOTE(review): 311027 is an unexplained magic number handed to
# otto_common.calculate_recall - presumably the total number of ground-truth
# orders used as the recall denominator; confirm against otto_common.
if CROSS_VALIDATE:
    otto_common.calculate_recall(df_train, 'catboost_prediction', 311027)

In [14]:
# Save both LGBM and catboost cross-validation results to file.
# The whole df_train frame is written (features included), together with the
# 'cv_prediction' and 'catboost_prediction' columns added by the two
# cross-validation cells.
if CROSS_VALIDATE:
    df_train.to_parquet('cv_predictions_orders.parquet')

## Fit the test models

In [15]:
# Fit the LGBM model on the first cross-validation dataset.
# NOTE: this cell shrinks and finally deletes df_train, so it cannot be re-run
# without re-running the loading cells first.
if not CROSS_VALIDATE:
    if frac > 0:
        # Drop a reproducible random `frac` share of the negative rows.
        # Sampling the 'target' column alone selects exactly the same rows as
        # sampling the full frame with the same random_state, without
        # materialising a copy of every negative row first.
        remove_index = df_train.loc[df_train['target'] == False, 'target'].sample(frac=frac, random_state=25).index
        df_train = df_train.drop(remove_index)
        del remove_index
        gc.collect()
    model.fit(
        df_train[x_cols],
        df_train.iloc[:, 2].astype(np.int8),
        # Group sizes must follow the order in which sessions appear in the
        # rows; sort=False keeps that order instead of sorting by session id.
        # NOTE(review): assumes rows of one session are contiguous - confirm.
        group=df_train.groupby('session', sort=False).size(),
    )

    # The training frame is no longer needed; free it before the second
    # dataset is loaded.
    del df_train
    gc.collect()

In [16]:
# Save the LGBM model to file.
# The fitted LGBMRanker is pickled with joblib as 'lgb.pkl' (relative path, so
# it lands in the current working directory); prediction happens in a separate
# notebook.
if not CROSS_VALIDATE:
    joblib.dump(model, 'lgb.pkl')

In [17]:
# Load and prepare second cross-validation dataset (used to fit the catboost model).
if not CROSS_VALIDATE:
    df_train = pd.read_parquet('/kaggle/input/otto-orders-w2vec-part1/train_features_with_w2v_cv2.parquet')
    df_train = prepare_df(df_train)

    # Every (session, order_predictions) pair must occur at most once.
    # duplicated().any() gives the same verdict as counting the rows flagged
    # with keep=False, but does not build a filtered copy of the frame.
    assert not df_train.duplicated(subset=['session', 'order_predictions']).any()

    # `deep` is a boolean flag; the string 'True' only worked because it is truthy.
    size = df_train.memory_usage(deep=True).sum()
    print(naturalsize(size))

1.0 GB


In [18]:
# Remove fraction of negative samples and fit the catboost model.
# NOTE: this cell shrinks df_train in place of the loaded frame, so re-running
# it without reloading the data removes a further share of the negatives.
if not CROSS_VALIDATE:
    if frac > 0:
        # Drop a reproducible random `frac` share of the negative rows.
        # Sampling the 'target' column alone selects exactly the same rows as
        # sampling the full frame with the same random_state, without
        # materialising a copy of every negative row first.
        remove_index = df_train.loc[df_train['target'] == False, 'target'].sample(frac=frac, random_state=25).index
        df_train = df_train.drop(remove_index)
        del remove_index
        gc.collect()
    # Column 2 is used as the label and column 0 as the group id.
    # NOTE(review): presumably these are 'target' and 'session' - confirm.
    train_pool = Pool(
        data=df_train[x_cols],
        label=df_train.iloc[:, 2].astype(np.int8),
        group_id=df_train.iloc[:, 0]
    )
    model_catboost.fit(train_pool)

0:	learn: 3.3912960	total: 222ms	remaining: 2m 57s
50:	learn: 1.6223405	total: 11.2s	remaining: 2m 43s
100:	learn: 1.5991825	total: 22.1s	remaining: 2m 33s
150:	learn: 1.5890697	total: 33.8s	remaining: 2m 25s
200:	learn: 1.5819851	total: 44.6s	remaining: 2m 12s
250:	learn: 1.5762063	total: 55.7s	remaining: 2m 1s
300:	learn: 1.5713802	total: 1m 7s	remaining: 1m 52s
350:	learn: 1.5668827	total: 1m 18s	remaining: 1m 40s
400:	learn: 1.5626345	total: 1m 29s	remaining: 1m 29s
450:	learn: 1.5589706	total: 1m 41s	remaining: 1m 18s
500:	learn: 1.5554598	total: 1m 52s	remaining: 1m 7s
550:	learn: 1.5521821	total: 2m 4s	remaining: 56.3s
600:	learn: 1.5489818	total: 2m 15s	remaining: 44.9s
650:	learn: 1.5457157	total: 2m 26s	remaining: 33.6s
700:	learn: 1.5429188	total: 2m 38s	remaining: 22.4s
750:	learn: 1.5401895	total: 2m 49s	remaining: 11s
799:	learn: 1.5375254	total: 2m 59s	remaining: 0us


In [19]:
# Export the catboost model to file.
# The file is written as "model" (relative path, no extension); no format is
# passed, so CatBoost's default model format is used.
if not CROSS_VALIDATE:
    model_catboost.save_model("model")