In [1]:
import pandas as pd
from scipy import stats
import os
import gc
import numpy as np
import plotly.express as px
import time
import pickle
import time
from scipy.stats import pearsonr
from typing import Tuple
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import Ridge

from joblib import dump, load

from ubq_utilities import (
    get_time_series_cross_val_splits,
    calculate_corr, corr_sharpe_lgb,
    TIME_COL, FOLD_NAME, TARGET_COL,
    STARTING_FEATURE_NAME, N_FOLD, 
    STARTING_CAT_FEAT_NAME, STARTING_NUMERIC_FEAT_NAME,
    RANDOM_STATE, SUBSAMPLE_FOLD
)

N_FOLD = 5

path_data = '../input/ubq-preprocess-mean-other'

In [2]:
with open(os.path.join(path_data, 'ALL_FEATURE.pkl'), 'rb') as file:
    ALL_FEATURE = pickle.load(file)

In [3]:
print('Importing pd df')
data = pd.read_pickle(
    os.path.join(path_data, f'train_16_fe.pkl')
)

gc.collect()

Importing pd df


64

In [4]:
gc.collect()

21

In [5]:
data['all_quantile_10'] = data['all_quantile_10'].astype('float16')
data['all_quantile_50'] = data['all_quantile_10'].astype('float16')
data['all_quantile_90'] = data['all_quantile_10'].astype('float16')

In [6]:
print(f'Using: {data.shape[0]} rows')

Using: 3141410 rows


In [7]:
fold_embargo_zip = get_time_series_cross_val_splits(data, cv=N_FOLD, embargo=50, min_time_to_use=0)

fold_split = [
    [
        np.where(data[TIME_COL].isin(train_index))[0], 
        np.where(data[TIME_COL].isin(test_index))[0]
    ]
    for train_index, test_index in fold_embargo_zip
]

time_id_split = [
    data[TIME_COL].loc[test_index].values
    for _, test_index in fold_split
]

first_element_test_split = np.array([
    data[TARGET_COL].loc[test_index].iloc[0]
    for _, test_index in fold_split
])

In [8]:
gc.collect()

21

In [9]:
progress_list = []
model_list = []



total_score = 0
for i, (train_index, test_index) in enumerate(fold_split):
        
        folder = StratifiedKFold(2, shuffle = True, random_state=i)
        
        print(f'\n\nStarting fold {i}\n\n\n')
        
        train_x_num = data[ALL_FEATURE].iloc[train_index].to_numpy('float16')
        train_y = data[TARGET_COL].iloc[train_index].to_numpy('float16')
        
        time_id = data[TIME_COL].iloc[train_index]
        
        test_x_num = data[ALL_FEATURE].iloc[test_index].to_numpy('float16')
        test_y = data[TARGET_COL].iloc[test_index].to_numpy('float16')

        for j, (train_index, test_index) in enumerate(folder.split(time_id, time_id)):
            if j == 0:
                train_x_num = train_x_num[test_index, :]
                train_y = train_y[test_index]
                    
        model = Ridge()
        model.fit(train_x_num, train_y)
        
        val_pred = model.predict(test_x_num)
        
        score_fold = pearsonr(test_y, val_pred)[0]

        print(f"Fold corr: {score_fold:.4f}\n")
        total_score += score_fold/N_FOLD
        
        model_list.append(model)
        
        dump(model, f'ridge_fold_{i}.model')
        
        del (
            train_x_num, train_y, test_x_num, test_y, model
        )
        
print(f'Final corr: {total_score:.4f}')



Starting fold 0



Fold corr: 0.1185



Starting fold 1



Fold corr: 0.1772



Starting fold 2



Fold corr: 0.1441



Starting fold 3



Fold corr: 0.1043



Starting fold 4



Fold corr: 0.1053

Final corr: 0.1299


# OOF Prediction

In [10]:
model_list = [
    load(f'ridge_fold_{i}.model') for i in range(N_FOLD)
]
oof_predictions = np.zeros(data.shape[0])

for i, (_, test_index) in enumerate(fold_split):
        model = model_list[i]
        
        test_x_num = data[ALL_FEATURE].iloc[test_index].to_numpy('float16')
        val_pred = model.predict(test_x_num)
        oof_predictions[test_index] = val_pred
        
oof_df = pd.DataFrame({'time_id': data['time_id'], 'y_true': data['target'], 'y_pred': oof_predictions})

# Save out of folds csv for blending
oof_df.to_csv('lgbm.csv', index = False)

score = calculate_corr(oof_df)[0]
print(f'Our out of folds mean pearson correlation coefficient is {score}')    

Our out of folds mean pearson correlation coefficient is 0.12154593433621116
