In [1]:
import pandas as pd
from scipy import stats
import os
import gc
import numpy as np
import plotly.express as px
import time
import lightgbm as lgb
import pickle
import time
from scipy.stats import pearsonr
from typing import Tuple

from ubq_utilities import (
    get_time_series_cross_val_splits,
    calculate_corr, corr_sharpe_lgb,
    TIME_COL, FOLD_NAME, TARGET_COL,
    STARTING_FEATURE_NAME, N_FOLD, 
    STARTING_CAT_FEAT_NAME, STARTING_NUMERIC_FEAT_NAME,
    RANDOM_STATE, SUBSAMPLE_FOLD
)

# Length of the `time` axis of the training-progress table built further down
# (presumably the number of boosting rounds each fold was trained for — TODO confirm).
N_ROUND = 749
# NOTE(review): this rebinds the N_FOLD imported from ubq_utilities just above;
# confirm both values agree, since the fold split and the model files depend on it.
N_FOLD = 5

# Input directory holding the pickled feature list and the preprocessed training frame.
path_data = '../input/ubq-preprocess-mean-other'
# Prefix of the per-fold model directories; the fold number is appended to it at use sites.
# NOTE(review): the leading '../input/../input/' looks redundant (it resolves to '../input/').
path_model = '../input/../input/ubq-lgb-f16-feat-other-dart-fold-'

In [2]:
# Restore the pickled ALL_FEATURE object; it is used later to select the
# model input columns from the training frame.
features_path = os.path.join(path_data, 'ALL_FEATURE.pkl')
with open(features_path, 'rb') as feature_file:
    ALL_FEATURE = pickle.load(feature_file)

In [3]:
print('Importing pd df')
# Read the preprocessed training frame from the input directory.
train_path = os.path.join(path_data, 'train_16_fe.pkl')
data = pd.read_pickle(train_path)

# Reclaim memory right after the load; the returned count is the cell output.
gc.collect()

Importing pd df


64

In [4]:
# Report how many rows of the training frame are in play.
print(f'Using: {len(data)} rows')

Using: 3141410 rows


In [5]:
# Build the time-series CV folds. Each item yielded by the helper is a
# (train, test) pair of TIME_COL values (they are matched with `isin` below).
fold_embargo_zip = get_time_series_cross_val_splits(data, cv=N_FOLD, embargo=50, min_time_to_use=0)

# Translate every fold's time values into POSITIONAL row numbers of `data`
# (np.where returns positions, not index labels).
fold_split = [
    [
        np.where(data[TIME_COL].isin(train_index))[0],
        np.where(data[TIME_COL].isin(test_index))[0]
    ]
    for train_index, test_index in fold_embargo_zip
]

# `fold_split` holds positions, so rows must be picked with `.iloc`.
# Label-based `.loc` only gives the same rows when `data` carries a default
# RangeIndex and would silently select wrong rows (or raise) otherwise.
time_id_split = [
    data[TIME_COL].iloc[test_index].values
    for _, test_index in fold_split
]

# Target value of the first test row of every fold.
first_element_test_split = np.array([
    data[TARGET_COL].iloc[test_index].iloc[0]
    for _, test_index in fold_split
])

In [6]:
# Gather the pickled training history of every fold, in fold order.
progress_list = []

for fold in range(N_FOLD):
    # Each fold lives in its own directory: the shared prefix plus the fold number.
    fold_dir = path_model + str(fold)
    progress_path = os.path.join(fold_dir, f'progress_fold_{fold}')
    with open(progress_path, 'rb') as progress_file:
        progress_list.append(pickle.load(progress_file))

# Save dataset and make predictions with the best epoch

In [7]:
# Force a garbage-collection pass; the returned count is displayed as the cell output.
gc.collect()

21

In [8]:
# Column names of the per-fold validation metrics, in fold order.
fold_ids = range(N_FOLD)
corr_columns = [f'corr_fold_{i}' for i in fold_ids]
loss_columns = [f'loss_fold_{i}' for i in fold_ids]

# One row per round: the round index, then every fold's validation
# correlation, then every fold's validation l2 loss.
progress_dict = {
    'time': range(N_ROUND),
    **{
        column: progress_list[i]['valid']['pearson_corr']
        for i, column in enumerate(corr_columns)
    },
    **{
        column: progress_list[i]['valid']['l2']
        for i, column in enumerate(loss_columns)
    },
}

progress_df = pd.DataFrame(progress_dict)

# Row-wise averages across the folds.
progress_df['average_corr'] = progress_df[corr_columns].mean(axis=1)
progress_df['average_loss'] = progress_df[loss_columns].mean(axis=1)

# The best round is the one with the highest fold-averaged correlation.
average_corr = progress_df['average_corr']
best_epoch = int(average_corr.argmax())
best_score = average_corr.max()
best_loss = progress_df.loc[best_epoch, 'average_loss']

# The printed epoch is the 0-based row position; the persisted one below is 1-based.
print(f'Best epoch: {best_epoch}, CV-Corr: {best_score:.3f}, CV-Loss: {best_loss:.3f}')

best_result = {
    'best_epoch': best_epoch+1,
    'best_score': best_score
}

# Persist the chosen round and its score in the working directory.
with open('best_result.pkl', 'wb') as result_file:
    pickle.dump(best_result, result_file)

gc.collect()

Best epoch: 743, CV-Corr: 0.142, CV-Loss: 0.829


0

# OOF Prediction

In [9]:
# Load the trained booster of every fold from its own input directory.
model_list = [
    lgb.Booster(model_file=os.path.join(path_model + str(i),  f'lgb_{i}.txt')) for i in range(N_FOLD)
]
# Rows that never appear in any test fold keep this initial 0.0 prediction.
# NOTE(review): those rows still enter the OOF score below — confirm that is intended.
oof_predictions = np.zeros(data.shape[0])

for i, (_, test_index) in enumerate(fold_split):
    model = model_list[i]
    # Re-export the booster into the working directory under a fold-specific name.
    model.save_model(f'lgb_fold_{i}.txt')

    # Slice the fold's rows FIRST and only then pick the feature columns.
    # `data[ALL_FEATURE].iloc[...]` would copy every feature column for every
    # row of the frame on each fold before discarding most of it.
    test_x_num = data.iloc[test_index][ALL_FEATURE].to_numpy('float16')
    val_pred = model.predict(test_x_num)
    # `test_index` holds positional row numbers, which line up with the array.
    oof_predictions[test_index] = val_pred

oof_df = pd.DataFrame({'time_id': data['time_id'], 'y_true': data['target'], 'y_pred': oof_predictions})

# Save out of folds csv for blending
oof_df.to_csv('lgbm.csv', index = False)

# calculate_corr comes from ubq_utilities; only the first element of its result is used here.
score = calculate_corr(oof_df)[0]
print(f'Our out of folds mean pearson correlation coefficient is {score}')

Our out of folds mean pearson correlation coefficient is 0.13745289027840765


# CV score

In [10]:
# Persist the per-round progress table (fold metrics plus averages) without the index column.
progress_df.to_csv('result.csv', index=False)

In [11]:
# Draw one line chart per fold-averaged metric against the round index.
for metric in ('average_loss', 'average_corr'):
    fig = px.line(
        progress_df,
        x="time",
        y=metric,
        title=metric,
        template='plotly_white',
    )
    fig.show()