In [1]:
import pandas as pd
from scipy import stats
import os
import gc
import numpy as np
import plotly.express as px
import time
import lightgbm as lgb
import pickle
import time
from scipy.stats import pearsonr
from typing import Tuple

from ubq_utilities import (
    get_time_series_cross_val_splits,
    calculate_corr, corr_sharpe_lgb,
    TIME_COL, FOLD_NAME, TARGET_COL,
    STARTING_FEATURE_NAME, N_FOLD, 
    STARTING_CAT_FEAT_NAME, STARTING_NUMERIC_FEAT_NAME,
    RANDOM_STATE, SUBSAMPLE_FOLD
)

# Number of boosting rounds trained for every fold (passed as num_boost_round).
N_ROUND = 1500
# NOTE: this rebinds the N_FOLD imported from ubq_utilities above; from here on the notebook uses 5.
N_FOLD = 5

# Directory that holds the pickled feature list and the pickled training frame.
path_data = '../input/ubq-preprocess-mean-other'

In [2]:
# Load ALL_FEATURE, which is later used to select the model's feature columns from `data`.
# NOTE: pickle.load can execute arbitrary code from the file — only read trusted inputs.
with open(os.path.join(path_data, 'ALL_FEATURE.pkl'), 'rb') as file:
    ALL_FEATURE = pickle.load(file)

In [3]:
# Read the pickled training frame from the input directory.
print('Importing pd df')
data = pd.read_pickle(os.path.join(path_data, 'train_16_fe.pkl'))

# Last expression of the cell: the collector's return value is displayed.
gc.collect()

Importing pd df


64

In [4]:
# Report how many rows of the training frame are available.
print(f'Using: {len(data)} rows')

Using: 3141410 rows


In [5]:
# Build the cross-validation folds. Each element yielded by the helper is a
# (train, test) pair of TIME_COL values (they are matched with `isin` below).
# NOTE(review): `embargo=50` presumably leaves a gap in time between train and
# test — confirm against ubq_utilities.
fold_embargo_zip = get_time_series_cross_val_splits(data, cv=N_FOLD, embargo=50, min_time_to_use=0)

# Translate the time values of every fold into positional row numbers:
# np.where(mask)[0] returns positions, not index labels.
fold_split = [
    [
        np.where(data[TIME_COL].isin(train_index))[0], 
        np.where(data[TIME_COL].isin(test_index))[0]
    ]
    for train_index, test_index in fold_embargo_zip
]

# TIME_COL values of the rows in each test split.
# The indices are positional, so `.iloc` is required; label-based `.loc` only
# gives the same rows when `data` happens to have a default RangeIndex.
time_id_split = [
    data[TIME_COL].iloc[test_index].values
    for _, test_index in fold_split
]

# First TARGET_COL value of each test split (again selected by position).
# NOTE(review): this reads TARGET_COL rather than TIME_COL — confirm that the
# first target value, not the first time id, is what is wanted here.
first_element_test_split = np.array([
    data[TARGET_COL].iloc[test_index].iloc[0]
    for _, test_index in fold_split
])

In [6]:
# Force a garbage-collection pass before training; the cell output is gc.collect()'s return value.
gc.collect()

21

In [7]:
# LightGBM parameters passed unchanged to lgb.train for every fold.
PARAMS_LGB = dict(
    boosting_type='gbdt',
    # tree_learner='voting' is left commented out (disabled).
    metric=None,  # no built-in metric is named; the training log still reports l2
    objective='regression',
    n_jobs=-1,
    num_leaves=2**8,
    learning_rate=0.05,
    feature_fraction=0.75,
    bagging_freq=5,
    bagging_fraction=0.80,
    lambda_l2=1,
)


In [8]:
def corr_sharpe_lgb(
    y_pred: np.ndarray, dtrain: "lgb.Dataset",
) -> Tuple[str, float, bool]:
    """
    Pearson correlation coefficient metric for use as a LightGBM ``feval``.

    Despite the name, no Sharpe ratio is computed: the score is the Pearson
    correlation between the labels stored in ``dtrain`` and ``y_pred`` over
    all rows at once. This definition shadows the ``corr_sharpe_lgb``
    imported from ``ubq_utilities`` at the top of the notebook.

    Parameters
    ----------
    y_pred : predictions for the rows of ``dtrain``.
    dtrain : dataset being evaluated; only ``get_label()`` is used.

    Returns
    -------
    ``('pearson_corr', value, True)`` where the final ``True`` is the
    higher-is-better flag. ``value`` is ``0.0`` when either vector is
    constant, because the correlation is undefined there and ``pearsonr``
    would return NaN, which would then propagate into the recorded history
    and the fold averages computed from it.
    """
    y_true = dtrain.get_label()

    # Zero spread in either vector -> undefined correlation; report 0 instead of NaN.
    if np.ptp(y_true) == 0 or np.ptp(y_pred) == 0:
        return 'pearson_corr', 0.0, True

    corr = float(pearsonr(y_true, y_pred)[0])
    return 'pearson_corr', corr, True

In [9]:
# Per-fold results, appended in fold order: evaluation histories and fitted boosters.
progress_list = []
model_list = []

for i, (train_index, test_index) in enumerate(fold_split):
    print(f'\n\nStarting fold {i}\n\n\n')

    # Features and target of this fold, selected by position and cast to float16 arrays.
    train_x_num = data[ALL_FEATURE].iloc[train_index].to_numpy('float16')
    train_y = data[TARGET_COL].iloc[train_index].to_numpy('float16')
    test_x_num = data[ALL_FEATURE].iloc[test_index].to_numpy('float16')
    test_y = data[TARGET_COL].iloc[test_index].to_numpy('float16')

    train_matrix = lgb.Dataset(train_x_num, train_y)
    test_matrix = lgb.Dataset(test_x_num, test_y)

    # Filled in place by the record_evaluation callback while training runs.
    progress = {}
    callbacks_list = [
        lgb.record_evaluation(progress),
        lgb.log_evaluation(period=50, show_stdv=False),
    ]

    # Train for the full N_ROUND rounds; the fold's test split is the only validation set.
    model = lgb.train(
        params=PARAMS_LGB,
        train_set=train_matrix,
        num_boost_round=N_ROUND,
        valid_sets=[test_matrix],
        valid_names=['valid'],
        callbacks=callbacks_list,
        feval=corr_sharpe_lgb,
    )

    # Drop the large per-fold arrays and datasets before the next iteration.
    del train_x_num, train_y, test_x_num, test_y, train_matrix, test_matrix

    model_list.append(model)

    # Persist the booster so later cells can reload it from disk.
    model.save_model(f'lgb_{i}.txt')
    progress_list.append(progress)



Starting fold 0



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 103254
[LightGBM] [Info] Number of data points in the train set: 2484830, number of used features: 405
[LightGBM] [Info] Start training from score -0.019712
[50]	valid's l2: 0.859158	valid's pearson_corr: 0.103531
[100]	valid's l2: 0.857062	valid's pearson_corr: 0.11575
[150]	valid's l2: 0.85631	valid's pearson_corr: 0.120681
[200]	valid's l2: 0.855812	valid's pearson_corr: 0.124053
[250]	valid's l2: 0.855601	valid's pearson_corr: 0.126013
[300]	valid's l2: 0.855555	valid's pearson_corr: 0.127458
[350]	valid's l2: 0.855505	valid's pearson_corr: 0.128563
[400]	valid's l2: 0.85549	valid's pearson_corr: 0.129366
[450]	valid's l2: 0.855647	valid's pearson_corr: 0.129369
[500]	valid's l2: 0.855599	valid's pearson_corr: 0.13006
[550]	valid's l2: 0.855572	valid's pearson_corr: 0.130588
[600]	valid's l2: 0.855614	valid's pearson_corr: 0.130784
[650]	valid's l2: 0.855774	valid's pearson_c

# Save results and make predictions with the best epoch

In [10]:
# Collect the objects released by the training loop's `del`; the cell output is gc.collect()'s return value.
gc.collect()

393

In [11]:
# One column per fold and metric, taken from the histories recorded during training;
# 'time' is the 0-based boosting round.
progress_dict = {
    'time': range(N_ROUND),
    **{
        f'corr_fold_{i}': progress_list[i]['valid']['pearson_corr']
        for i in range(N_FOLD)
    },
    **{
        f'loss_fold_{i}': progress_list[i]['valid']['l2']
        for i in range(N_FOLD)
    },
}

progress_df = pd.DataFrame(progress_dict)

# Row-wise mean over the columns whose name contains 'corr' / 'loss'.
# Each selection is evaluated before its own average column is added.
progress_df['average_corr'] = progress_df.filter(like='corr').mean(axis=1)
progress_df['average_loss'] = progress_df.filter(like='loss').mean(axis=1)

# Position (0-based round) of the highest fold-averaged correlation.
best_epoch = int(progress_df['average_corr'].argmax())
best_score = progress_df['average_corr'].max()
best_loss = progress_df.at[best_epoch, 'average_loss']

print(f'Best epoch: {best_epoch}, CV-Corr: {best_score:.3f}, CV-Loss: {best_loss:.3f}')

# Stored as a 1-based count of rounds (hence the +1), ready to be used as num_iteration.
best_result = {'best_epoch': best_epoch + 1, 'best_score': best_score}

with open('best_result.pkl', 'wb') as file:
    pickle.dump(best_result, file)

gc.collect()

Best epoch: 336, CV-Corr: 0.135, CV-Loss: 0.832


0

# OOF Prediction

In [12]:
# Reload the boosters written by the training loop, one file per fold.
model_list = [lgb.Booster(model_file=f'lgb_{i}.txt') for i in range(N_FOLD)]

# One slot per row of `data`; a row that is in no test split keeps the value 0.
oof_predictions = np.zeros(len(data))

for i, (_, test_index) in enumerate(fold_split):
    model = model_list[i]

    # Predict the fold's held-out rows, using only the first `best_epoch` rounds.
    test_x_num = data[ALL_FEATURE].iloc[test_index].to_numpy('float16')
    val_pred = model.predict(test_x_num, num_iteration=best_result['best_epoch'])
    oof_predictions[test_index] = val_pred

oof_df = pd.DataFrame(
    {
        'time_id': data['time_id'],
        'y_true': data['target'],
        'y_pred': oof_predictions,
    }
)

# Save out of folds csv for blending
oof_df.to_csv('lgbm.csv', index=False)

score = calculate_corr(oof_df)[0]
print(f'Our out of folds mean pearson correlation coefficient is {score}')

Our out of folds mean pearson correlation coefficient is 0.1312682811445559


# CV score

In [13]:
# Write the per-round fold metrics and their averages to disk, without the index column.
progress_df.to_csv('result.csv', index=False)

In [14]:
# One line chart per fold-averaged metric, plotted against the boosting round.
for col in ('average_loss', 'average_corr'):
    fig = px.line(progress_df, x="time", y=col, title=col, template='plotly_white')
    fig.show()