# Clicks Model and Prediction
This notebook trains a model that predicts which aid a user is going to click next, and then uses that model to make predictions for the test dataset. The same notebook was also used for cross-validating the clicks model. It uses a single input, the output of the "W2vec features for clicks" notebook; all the inputs have already been put together in previous notebooks.
I tried both CatBoost and LGBM models here, and LGBM showed better results, so the LGBM model is used to produce the final results. The code for the CatBoost model is kept in disabled cells (as triple-quoted strings).
## Imports and definitions

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#from catboost import CatBoostRanker, Pool
import gc
from humanize import naturalsize
from lightgbm.sklearn import LGBMRanker
from sklearn.model_selection import KFold

# functions and classes common for several notebooks of current project
import otto_common

In [2]:
# Drop unused columns and shrink float columns to float16.
def prepare_df(df):
    """Strip helper columns from ``df`` and downcast its floats, in place.

    Removes the 'day_of_week', 'first_aid' and 'second_aid' columns, then
    converts every float64/float32 column to float16.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three columns listed above; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    KeyError
        If one of the three columns to remove is missing.
    """
    for unused in ('day_of_week', 'first_aid', 'second_aid'):
        del df[unused]
    wide_float_cols = df.select_dtypes(include=['float64', 'float32']).columns
    for name in wide_float_cols:
        df[name] = df[name].astype(np.float16)
        # Collect after every converted column (presumably to keep peak memory low).
        gc.collect()
    return df

In [3]:
# This function is used instead of GroupKFold to save some memory during cross-validation.
def add_fold_column(df, n_splits):
    """Return ``df`` with an extra int8 'fold' column assigning each session to a CV fold.

    The unique values of ``df['session']`` are split with a shuffled ``KFold``
    (fixed ``random_state=13``), so all rows of one session share a fold number.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'session' column.
    n_splits : int
        Number of folds; fold numbers run from 0 to ``n_splits - 1``.

    Returns
    -------
    pandas.DataFrame
        New frame produced by left-merging the per-session fold table onto ``df``.
    """
    df_session = pd.DataFrame({'session': df['session'].unique()})
    # Use the requested number of folds instead of a hard-coded 4.
    kf = KFold(n_splits=n_splits, random_state=13, shuffle=True)
    df_session['fold'] = -1
    df_session['fold'] = df_session['fold'].astype(np.int8)
    fold_pos = df_session.columns.get_loc('fold')
    # Each session lands in exactly one test split; that split's number is its fold.
    for i, (_, test_index) in enumerate(kf.split(df_session)):
        df_session.iloc[test_index, fold_pos] = i
    return df.merge(df_session, how='left', on='session')

## Load and prepare data

In [4]:
# Read the training feature table (file 'cv1_features_with_w2v.parquet') into memory.
df_train = pd.read_parquet(path='/kaggle/input/otto-time-viewed/cv1_features_with_w2v.parquet')

In [5]:
# Drop the unused columns and downcast floats to float16 (see prepare_df).
df_train = prepare_df(df_train)
# Last expression of the cell, so its return value is shown as the cell output.
gc.collect()

0

In [6]:
# Report the in-memory size of the training frame in human-readable form.
# `deep` is a boolean flag, so pass True rather than the string 'True'.
size = df_train.memory_usage(deep=True).sum()
print(naturalsize(size))

2.9 GB


In [7]:
# Settings shared by the cross-validation and the submission code paths.
CROSS_VALIDATE = False  # Set to True to run the cross-validation cells; should be False to produce the submission.
x_cols = df_train.columns[3:].tolist()  # feature columns: everything after the first three columns
frac = 0.7  # fraction of records with target==False to be dropped from train to reduce memory usage

In [8]:
# Disabled CatBoost configuration, kept for reference as a bare string literal
# (nothing inside it is executed; the cell only echoes the string as its output).
'''
# Set the catboost model
parameters = {
    'iterations': 300,
    'loss_function': 'QuerySoftMax',    
    'learning_rate': 0.15,
    #'custom_metric': 'RecallAt:top=20',
    'depth' : 7,
    'verbose': 5,
    'random_seed': 0, 
    'task_type' : 'GPU'
}
model = CatBoostRanker(**parameters)
print('model_defined')
'''

"\n# Set the catboost model\nparameters = {\n    'iterations': 300,\n    'loss_function': 'QuerySoftMax',    \n    'learning_rate': 0.15,\n    #'custom_metric': 'RecallAt:top=20',\n    'depth' : 7,\n    'verbose': 5,\n    'random_seed': 0, \n    'task_type' : 'GPU'\n}\nmodel = CatBoostRanker(**parameters)\nprint('model_defined')\n"

In [9]:
# LightGBM ranker settings: lambdarank objective, NDCG metric, GPU device 0 on platform 0.
parameters = dict(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="gbdt",
    min_child_samples=200,
    n_estimators=200,
    num_leaves=64,
    importance_type='gain',
    max_depth=7,
    learning_rate=0.1,
    random_state=21,
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
)
# The same estimator object is reused for cross-validation and for the final fit.
model = LGBMRanker(**parameters)

print('model_defined')

model_defined


## Cross-validation

In [10]:
# Disabled CatBoost cross-validation cell, kept for reference as a bare string literal.
# NOTE(review): the code inside the string uses `test_index` but never defines it,
# so it looks like it would not run as-is if re-enabled — confirm before reviving it.
'''
%%time
# A cell that performs cross-validation for the catboost model.

CROSS_VALIDATE = True
x_cols = list(df_train.columns[3:])
frac = 0.7 #fracture of records with target==False to be dropped from train to reduce memory usage
df_train = df_train.reset_index(drop=True)

if CROSS_VALIDATE:
    # Define the cross-validation splits and a column for prediction
    n_splits = 4
    df_train = add_fold_column(df_train, n_splits)
    df_importances = pd.DataFrame({'columns':x_cols})
    df_train['cv_prediction'] = -1
    df_train['cv_prediction'] = df_train['cv_prediction'].astype(np.float16)
    
    # Fit the model and save the predictions.
    for i in range(n_splits):
        train_index = df_train.loc[df_train['fold'] != i].index
        train_index = otto_common.remove_frac(train_index, df_train, frac)
        gc.collect()
        print('start_fitting, fold = ' + str(i))
        train_pool = Pool(
            data=df_train[x_cols].iloc[train_index],
            label=df_train.iloc[train_index, 2].astype(np.int8),
            group_id=df_train.iloc[train_index, 0]
        )
        gc.collect()
        print('start_fitting')
        model.fit(train_pool)
        cv_pool = Pool(
            data=df_train[x_cols].iloc[test_index],
            label=df_train.iloc[test_index, 2].astype(np.int8),
            group_id=df_train.iloc[test_index, 0]
        )
        df_train['cv_prediction'].iloc[test_index] = model.predict(cv_pool)
        del train_pool, cv_pool
        gc.collect()
'''

"\n%%time\n# A cell that performs cross-validation for the catboost model.\n\nCROSS_VALIDATE = True\nx_cols = list(df_train.columns[3:])\nfrac = 0.7 #fracture of records with target==False to be dropped from train to reduce memory usage\ndf_train = df_train.reset_index(drop=True)\n\nif CROSS_VALIDATE:\n    # Define the cross-validation splits and a column for prediction\n    n_splits = 4\n    df_train = add_fold_column(df_train, n_splits)\n    df_importances = pd.DataFrame({'columns':x_cols})\n    df_train['cv_prediction'] = -1\n    df_train['cv_prediction'] = df_train['cv_prediction'].astype(np.float16)\n    \n    # Fit the model and save the predictions.\n    for i in range(n_splits):\n        train_index = df_train.loc[df_train['fold'] != i].index\n        train_index = otto_common.remove_frac(train_index, df_train, frac)\n        gc.collect()\n        print('start_fitting, fold = ' + str(i))\n        train_pool = Pool(\n            data=df_train[x_cols].iloc[train_index],\n        

In [11]:
%%time
# A cell that performs cross-validation for the LGBM model.

if CROSS_VALIDATE:
    # Define the cross-validation splits and a column for prediction
    n_splits = 4
    df_train = add_fold_column(df_train, n_splits)
    df_importances = pd.DataFrame({'columns':x_cols})
    df_train['cv_prediction'] = -1
    df_train['cv_prediction'] = df_train['cv_prediction'].astype(np.float16)
    
    # Fit the model and save the predictions.
    for i in range(n_splits):
        train_index = df_train.loc[df_train['fold'] != i].index
        train_index = otto_common.remove_frac(train_index, df_train, frac)
        gc.collect()
        print('start_fitting, fold = ' + str(i))
        model.fit(
            df_train[x_cols].iloc[train_index],
            df_train.iloc[train_index, 2].astype(np.int8),
            group=df_train.iloc[train_index].groupby('session').size(),
        )
        column_name = 'imp_' + str(i)
        df_importances[column_name] = model.feature_importances_
        test_index = df_train.loc[df_train['fold'] == i].index
        df_train['cv_prediction'].iloc[test_index] = model.predict(df_train[x_cols].iloc[test_index])
        del train_index, test_index
        gc.collect()
    df_importances['imp_avg'] = df_importances.mean(axis=1)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.15 µs


In [12]:
#df_importances

In [13]:
#df_importances

In [14]:
# Print some stats for cross-validation results
# NOTE(review): 1738122 is an unexplained constant passed to calculate_recall —
# presumably the total number of ground-truth clicks; confirm against otto_common.
if CROSS_VALIDATE:
    otto_common.calculate_recall(df_train, 'cv_prediction', 1738122)

## Fit the test model

In [15]:
# Drop a fraction of records with target==False from train to reduce memory usage.
if frac > 0:
    is_negative = df_train['target'] == False
    # Fixed random_state keeps the set of dropped rows reproducible.
    remove_index = df_train.loc[is_negative].sample(frac=frac, random_state=25).index
    df_train = df_train.drop(index=remove_index)
    del is_negative, remove_index
    gc.collect()

In [16]:
# Disabled CatBoost fitting cell, kept for reference as a bare string literal
# (nothing inside it is executed).
'''
# Fit the catboost model
train_pool = Pool(
   data=df_train[x_cols],
   label=df_train.iloc[:,2].astype(np.int8),
   group_id=df_train.iloc[:,0]
)
model.fit(train_pool)

del df_train
gc.collect()
'''

'\n# Fit the catboost model\ntrain_pool = Pool(\n   data=df_train[x_cols],\n   label=df_train.iloc[:,2].astype(np.int8),\n   group_id=df_train.iloc[:,0]\n)\nmodel.fit(train_pool)\n\ndel df_train\ngc.collect()\n'

In [17]:
# Fit the LGBM model.
# NOTE(review): `group` is built with groupby('session').size(), whose result is ordered
# by session value; this presumably relies on df_train rows being sorted by session so
# the sizes line up with the rows — confirm.
model.fit(
    df_train[x_cols],
    # The column at position 2 is used as the label — presumably 'target'; confirm.
    df_train.iloc[:, 2].astype(np.int8),
    group=df_train.groupby('session').size(),
)

# The training frame is not used after fitting; release it before the test data is loaded.
del df_train
gc.collect()

50

In [18]:
# Disabled CatBoost feature-importance cell, kept for reference as a bare string literal.
# It refers to `train_pool`, which is only created inside the disabled CatBoost fitting code.
'''
%%time
# This code was used to view the feature importances for catboost model.

imps = model.get_feature_importance(train_pool)
df_imps = pd.DataFrame({'columns':x_cols, 'importances': imps})
df_imps.to_parquet('importances.parquet')

del train_pool
gc.collect()
'''

"\n%%time\n# This code was used to view the feature importances for catboost model.\n\nimps = model.get_feature_importance(train_pool)\ndf_imps = pd.DataFrame({'columns':x_cols, 'importances': imps})\ndf_imps.to_parquet('importances.parquet')\n\ndel train_pool\ngc.collect()\n"

## Predict and export

In [19]:
# Make predictions using LGBM model for both chunks of test data.
# Each chunk file is scored in j_max slices selected by otto_common.divide_df_by_column
# (presumably to limit peak memory); the three-column results are collected in a list
# and concatenated once at the end instead of growing a frame inside the loop.
test_parts = []
for i in range(2):
    file_path = '/kaggle/input/otto-time-viewed/test_features_with_w2v_part_' + str(i) + '.parquet'
    print('Start predicting '+ str(i))
    j_max = 5
    for j in range(j_max):
        # The file is re-read for every slice; only the current slice is kept afterwards.
        df_test = pd.read_parquet(file_path)
        df_test = otto_common.divide_df_by_column(df_test, j_max, j, 'session')
        df_test = prepare_df(df_test)
        print('Loading finished '+ str(i) + '__' + str(j))
        df_test['gbdt_prediction'] = model.predict(df_test[x_cols])
        # Keep only the columns needed to build the submission.
        test_parts.append(df_test[['session','click_predictions','gbdt_prediction']])
        del df_test
        gc.collect()
    print('Predictions made '+ str(i))
df_test_all = pd.concat(test_parts)
del test_parts
gc.collect()

Start predicting 0
Loading finished 0__0
Loading finished 0__1
Loading finished 0__2
Loading finished 0__3
Loading finished 0__4
Predictions made 0
Start predicting 1
Loading finished 1__0
Loading finished 1__1
Loading finished 1__2
Loading finished 1__3
Loading finished 1__4
Predictions made 1


0

In [20]:
# Disabled CatBoost prediction cell, kept for reference as a bare string literal.
# NOTE(review): inside the string, `test_pool` is deleted in every loop iteration and then
# deleted again after the loop, which looks like it would raise NameError if re-enabled.
'''
# Make predictions using catboost model for both chunks of test data.
for i in range(2):
    file_path = '/kaggle/input/otto-time-viewed/test_features_with_w2v_part_' + str(i) + '.parquet'
    print('Start predicting '+ str(i))
    j_max = 5
    for j in range(j_max):
        df_test = pd.read_parquet(file_path)
        df_test = otto_common.divide_df_by_column(df_test, j_max, j, 'session')
        df_test = prepare_df(df_test)
        print('Loading finished '+ str(i) + '__' + str(j))
        test_pool = Pool(
            data=df_test[x_cols],
            group_id=df_test['session']
        )
        df_test['gbdt_prediction'] = model.predict(test_pool)
        df_test = df_test[['session','click_predictions','gbdt_prediction']]
        del test_pool
        gc.collect()
        if (i == 0) & (j == 0):
            df_test_all = df_test
        else:
            df_test_all = pd.concat([df_test_all, df_test])
    print('Predictions made '+ str(i))
del df_test, test_pool
gc.collect()
'''

"\n# Make predictions using catboost model for both chunks of test data.\nfor i in range(2):\n    file_path = '/kaggle/input/otto-time-viewed/test_features_with_w2v_part_' + str(i) + '.parquet'\n    print('Start predicting '+ str(i))\n    j_max = 5\n    for j in range(j_max):\n        df_test = pd.read_parquet(file_path)\n        df_test = otto_common.divide_df_by_column(df_test, j_max, j, 'session')\n        df_test = prepare_df(df_test)\n        print('Loading finished '+ str(i) + '__' + str(j))\n        test_pool = Pool(\n            data=df_test[x_cols],\n            group_id=df_test['session']\n        )\n        df_test['gbdt_prediction'] = model.predict(test_pool)\n        df_test = df_test[['session','click_predictions','gbdt_prediction']]\n        del test_pool\n        gc.collect()\n        if (i == 0) & (j == 0):\n            df_test_all = df_test\n        else:\n            df_test_all = pd.concat([df_test_all, df_test])\n    print('Predictions made '+ str(i))\ndel df_test,

In [21]:
# Disabled inline top-20 selection and formatting, kept for reference as a bare string
# literal; it appears to be an earlier inline version of what
# otto_common.select_top_20_and_format does — confirm.
'''
# Enumereate the results and select top 20 for each session.
df_test_all = df_test_all.sort_values(['session','gbdt_prediction'],ascending=[True,False])
df_test_all['n'] = df_test_all.groupby('session').cumcount().astype(np.int8)
df_test_all = df_test_all.loc[df_test_all['n'] < 20].drop('n',axis=1)

# Final formatting.
df_test_all['click_predictions'] = df_test_all['click_predictions'].apply(str)
df_test_all = (df_test_all.groupby('session').agg({'click_predictions': lambda x: " ".join(x)}))
'''

'\n# Enumereate the results and select top 20 for each session.\ndf_test_all = df_test_all.sort_values([\'session\',\'gbdt_prediction\'],ascending=[True,False])\ndf_test_all[\'n\'] = df_test_all.groupby(\'session\').cumcount().astype(np.int8)\ndf_test_all = df_test_all.loc[df_test_all[\'n\'] < 20].drop(\'n\',axis=1)\n\n# Final formatting.\ndf_test_all[\'click_predictions\'] = df_test_all[\'click_predictions\'].apply(str)\ndf_test_all = (df_test_all.groupby(\'session\').agg({\'click_predictions\': lambda x: " ".join(x)}))\n'

In [22]:
# Select top 20 candidates and format as required by organizers.
# The helper receives the candidate column ('click_predictions') and the score column
# ('gbdt_prediction'); its result replaces df_test_all.
df_test_all = otto_common.select_top_20_and_format(df_test_all, 'click_predictions','gbdt_prediction')

In [23]:
# Save to file.
# The relative path means the parquet file is written to the notebook's current working directory.
df_test_all.to_parquet('click_predictions.parquet')