In [1]:
import pandas as pd
from scipy import stats
import os
import gc
import numpy as np
import plotly.express as px
import time
import lightgbm as lgb
import pickle
import time
from scipy.stats import pearsonr
from typing import Tuple

from ubq_utilities import (
    get_time_series_cross_val_splits,
    calculate_corr, corr_sharpe_lgb,
    TIME_COL, FOLD_NAME, TARGET_COL,
    STARTING_FEATURE_NAME, N_FOLD, 
    STARTING_CAT_FEAT_NAME, STARTING_NUMERIC_FEAT_NAME,
    RANDOM_STATE, SUBSAMPLE_FOLD
)

# Training configuration.
# N_ROUND is the number of boosting rounds passed to lgb.train.
# N_FOLD is the number of cross-validation folds; this assignment rebinds the
# N_FOLD name that is also imported from ubq_utilities above.
N_ROUND, N_FOLD = 749, 5

# Indices of the folds that the training loop actually fits.
FOLD_USED = [0]

# Directory from which the pickled inputs are read.
path_data = '../input/ubq-preprocess-mean-other'

In [2]:
# Load the pickled ALL_FEATURE object; it is used later to select the model
# input columns from `data`.
# NOTE(review): pickle.load executes arbitrary code from the file — only point
# path_data at trusted artefacts.
all_feature_path = os.path.join(path_data, 'ALL_FEATURE.pkl')
with open(all_feature_path, 'rb') as file:
    ALL_FEATURE = pickle.load(file)

In [3]:
# Read the pickled training frame from the input directory.
print('Importing pd df')
train_pickle_path = os.path.join(path_data, 'train_16_fe.pkl')
data = pd.read_pickle(train_pickle_path)

# Run a garbage-collection pass; as the last expression of the cell, its
# return value is what the cell displays.
gc.collect()

Importing pd df


64

In [4]:
# Report how many rows the loaded frame contains.
print(f'Using: {data.shape[0]} rows')

Using: 3141410 rows


In [5]:
# Build the cross-validation folds with the project helper from ubq_utilities.
# Each item it yields is unpacked below as a (train, test) pair of values that
# are matched against the TIME_COL column.
# NOTE(review): `embargo=50` presumably leaves a gap between train and test
# periods — confirm against the helper's implementation.
fold_embargo_zip = get_time_series_cross_val_splits(data, cv=N_FOLD, embargo=50, min_time_to_use=0)

# Translate every fold into positional row indices of `data`:
# [train row positions, test row positions].
fold_split = [
    [
        np.where(data[TIME_COL].isin(train_times))[0],
        np.where(data[TIME_COL].isin(test_times))[0]
    ]
    for train_times, test_times in fold_embargo_zip
]

# `np.where` yields row positions, not index labels, so rows must be selected
# with `.iloc`. Label-based `.loc` only agrees when `data` carries a default
# 0..n-1 index, and would otherwise raise or silently pick the wrong rows.
# This also matches the training loop, which uses `.iloc` on the same indices.
time_id_split = [
    data[TIME_COL].iloc[test_index].values
    for _, test_index in fold_split
]

# Target value of the first row of each test split.
first_element_test_split = np.array([
    data[TARGET_COL].iloc[test_index[0]]
    for _, test_index in fold_split
])

In [6]:
# Run a garbage-collection pass; the value it returns is shown as the cell output.
gc.collect()

21

In [7]:
# Parameters handed to lgb.train in the training loop.
# `metric` is left as None; the custom Pearson metric is supplied separately
# through the `feval` argument of lgb.train.
PARAMS_LGB = dict(
    boosting_type='dart',
    metric=None,
    objective='regression',
    n_jobs=-1,
    num_leaves=2 ** 8,
    learning_rate=0.05,
    feature_fraction=0.75,
    bagging_freq=5,
    bagging_fraction=0.80,
    lambda_l2=1,
)

In [8]:
def corr_sharpe_lgb(
    y_pred: np.array, dtrain: lgb.Dataset,
) -> Tuple[str, float, bool]:
    """
    Custom LightGBM evaluation function: Pearson correlation between the
    dataset labels and the predictions.

    Note: this definition rebinds the `corr_sharpe_lgb` name that is also
    imported from `ubq_utilities` at the top of the notebook.

    Parameters
    ----------
    y_pred:
        Predictions to score.
    dtrain:
        Dataset whose labels (`dtrain.get_label()`) serve as ground truth.

    Returns
    -------
    Tuple[str, float, bool]
        The metric name `'pearson_corr'`, the correlation coefficient, and
        `True` (the flag LightGBM reads as "higher is better").
    """
    # pearsonr returns (coefficient, p-value); only the coefficient is reported.
    correlation, _ = pearsonr(dtrain.get_label(), y_pred)
    return 'pearson_corr', correlation, True

In [9]:
progress_list = []
model_list = []

for i, (train_index, test_index) in enumerate(fold_split):
    # Only the folds listed in FOLD_USED are trained.
    if i not in FOLD_USED:
        continue

    print(f'\n\nStarting fold {i}\n\n\n')

    # Materialise features and target of each split as float16 NumPy arrays.
    x_train = data[ALL_FEATURE].iloc[train_index].to_numpy('float16')
    y_train = data[TARGET_COL].iloc[train_index].to_numpy('float16')
    x_valid = data[ALL_FEATURE].iloc[test_index].to_numpy('float16')
    y_valid = data[TARGET_COL].iloc[test_index].to_numpy('float16')

    dtrain = lgb.Dataset(x_train, y_train)
    dvalid = lgb.Dataset(x_valid, y_valid)

    # `progress` is filled by the record_evaluation callback during training.
    progress = {}

    callbacks_list = [
        lgb.record_evaluation(progress),
        lgb.log_evaluation(period=50, show_stdv=False)
    ]

    # Fixed number of boosting rounds; the held-out split is only evaluated
    # (no early-stopping callback is registered).
    model = lgb.train(
        params=PARAMS_LGB,
        train_set=dtrain,
        num_boost_round=N_ROUND,
        valid_sets=[dvalid],
        valid_names=['valid'],
        callbacks=callbacks_list,
        feval=corr_sharpe_lgb,
    )

    # Drop the references to the large arrays and datasets of this fold.
    del x_train, y_train, x_valid, y_valid, dtrain, dvalid

    model_list.append(model)

    # Persist the fitted booster and the recorded evaluation history.
    model.save_model(f'lgb_{i}.txt')

    with open(f'progress_fold_{i}', 'wb') as file:
        pickle.dump(progress, file)



Starting fold 0



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 103254
[LightGBM] [Info] Number of data points in the train set: 2484830, number of used features: 405
[LightGBM] [Info] Start training from score -0.019712
[50]	valid's l2: 0.859711	valid's pearson_corr: 0.101143
[100]	valid's l2: 0.858606	valid's pearson_corr: 0.107103
[150]	valid's l2: 0.857856	valid's pearson_corr: 0.110747
[200]	valid's l2: 0.857058	valid's pearson_corr: 0.114339
[250]	valid's l2: 0.856615	valid's pearson_corr: 0.116461
[300]	valid's l2: 0.856029	valid's pearson_corr: 0.119336
[350]	valid's l2: 0.855594	valid's pearson_corr: 0.121402
[400]	valid's l2: 0.85516	valid's pearson_corr: 0.123558
[450]	valid's l2: 0.854549	valid's pearson_corr: 0.126498
[500]	valid's l2: 0.854252	valid's pearson_corr: 0.127931
[550]	valid's l2: 0.854018	valid's pearson_corr: 0.129006
[600]	valid's l2: 0.853839	valid's pearson_corr: 0.129599
[650]	valid's l2: 0.853931	valid's pearso