In [1]:
import pandas as pd
import numpy as np
import os
import gc
import pickle
import lightgbm as lgb
import re
from typing import Tuple
from scipy.stats import pearsonr
import multiprocessing as mp
import ubiquant

from ubq_utilities import (
    TIME_COL,
    STARTING_FEATURE_NAME, TARGET_COL, STARTING_NUMERIC_FEAT_NAME,
    RANDOM_STATE, get_time_series_cross_val_splits
)

# Number of features kept by get_top_n_corr (ranked by correlation with the target).
TOP_N=100
# Number of boosting rounds passed to lgb.train; also the length of the progress table.
N_ROUND = 1200
# Number of cross-validation folds (default `cv` of get_fold, loop bound in train_lgb).
N_FOLD = 5

# Directory holding train.csv / supplemental_train.csv read by save_base_dataset.
path_data = '../input/ubiquant-market-prediction'
# Directory holding the pre-converted train.pkl read by save_base_dataset.
path_old_data = '../input/ubq-import-parquet-online'

env = ubiquant.make_env()   # initialize the environment

In [2]:
def get_time_to_corr(data, min_count=31, max_times=1000):
    """Return the most recent time ids that have enough rows to be correlated.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the ``TIME_COL`` column and an ``'f_0'`` column.
    min_count : int, default 31
        A time id is kept only if it has strictly more than ``min_count``
        non-null ``'f_0'`` values.
    max_times : int, default 1000
        Maximum number of time ids returned.

    Returns
    -------
    list
        Time ids sorted from the largest to the smallest, at most
        ``max_times`` of them.
    """
    size_per_time = (
        data.groupby(TIME_COL)[['f_0']].count()
        .reset_index()
        .sort_values(TIME_COL, ascending=False)
    )
    size_per_time = size_per_time[size_per_time['f_0'] > min_count]

    # Positional selection: the previous label-based `.loc[:n]` slice is
    # inclusive of its end label and therefore returned n + 1 time ids.
    return size_per_time[TIME_COL].head(max_times).tolist()

def get_top_n_corr(data):
    """Rank the starting features by their per-time-id correlation with the target.

    Only rows whose time id is returned by ``get_time_to_corr`` are used.
    For each time id the correlation of every column in
    ``STARTING_NUMERIC_FEAT_NAME`` with ``TARGET_COL`` is computed; the
    correlations are averaged over time ids and ranked by absolute value.

    Returns the index labels of the ``TOP_N`` highest-ranked features.
    """
    selected_times = get_time_to_corr(data)
    keep_rows = data[TIME_COL].isin(selected_times)
    subset = data.loc[keep_rows, :].reset_index(drop=True)

    def _corr_with_target(group):
        # Correlation of each feature column with the target inside one time id
        return group[STARTING_NUMERIC_FEAT_NAME].corrwith(group[TARGET_COL])

    per_time_corr = subset.groupby(TIME_COL).apply(_corr_with_target)

    # Mean over time ids first, then absolute value, then strongest first
    ranking = per_time_corr.mean().abs().sort_values(ascending=False)

    return ranking.index[:TOP_N]


In [3]:
def preprocess_train_dataset(data, best_feature):
    """Add, for each selected feature, its mean over all rows sharing a time id.

    ``data`` is modified in place: one float16 column named
    ``time_id_<feature>_mean`` is added per entry of ``best_feature``.

    Returns
    -------
    tuple
        ``(data, time_features, aggregate_features)`` where ``time_features``
        lists the columns added here and ``aggregate_features`` lists the
        names of the row-wise aggregate columns (this function only names
        them, it does not compute them).
    """
    time_features = []

    for position, feature in enumerate(best_feature):
        # Carriage-return progress counter
        print(position, end="\r")

        new_col = f'time_id_{feature}_mean'
        per_time_mean = data.groupby(['time_id'])[feature].mean().to_dict()
        data[new_col] = data['time_id'].map(per_time_mean)
        data[new_col] = data[new_col].astype(np.float16)
        time_features.append(new_col)

    gc.collect()

    aggregate_features = [
        'all_mean', 'all_std',
        'all_quantile_10', 'all_quantile_50', 'all_quantile_90',
    ]

    return data, time_features, aggregate_features

In [4]:
def get_flat_features(data):
    """Add row-wise aggregates of the starting features to ``data`` in place.

    The columns ``all_mean``, ``all_std``, ``all_quantile_10``,
    ``all_quantile_50`` and ``all_quantile_90`` are computed across the
    ``STARTING_NUMERIC_FEAT_NAME`` columns of each row and stored as float16.
    The same (mutated) frame is returned.
    """
    # Selected once; the columns added below are not part of this selection.
    feature_block = data[STARTING_NUMERIC_FEAT_NAME]

    data['all_mean'] = feature_block.mean(axis=1).astype(np.float16)
    data['all_std'] = feature_block.std(axis=1).astype(np.float16)

    quantile_columns = (
        ('all_quantile_10', 0.1),
        ('all_quantile_50', 0.5),
        ('all_quantile_90', 0.9),
    )
    for column_name, level in quantile_columns:
        data[column_name] = feature_block.quantile(q=level, axis=1).astype(np.float16)

    return data

In [5]:
def change_type_mapping(data, mapping):
    """Cast every column of ``data`` in place to the dtype given in ``mapping``.

    ``mapping`` must have an entry for each column of ``data``; a missing
    entry raises ``KeyError``. The same (mutated) frame is returned.
    """
    for column_name in data.columns:
        target_dtype = mapping[column_name]
        data[column_name] = data[column_name].astype(target_dtype)

    return data

def get_fold(data, embargo=50, cv=N_FOLD, min_time_to_use=0, percent_split=True):
    """Translate time-based cross-validation splits into positional row indices.

    The arguments other than ``data`` are forwarded unchanged to
    ``get_time_series_cross_val_splits``, which is iterated as pairs of
    (train time ids, test time ids).

    Returns
    -------
    list
        One ``[train_rows, test_rows]`` pair per fold, each element being a
        numpy array of row positions in ``data`` whose ``TIME_COL`` value
        belongs to the corresponding set of time ids.
    """
    time_splits = get_time_series_cross_val_splits(
        data, cv=cv, embargo=embargo, min_time_to_use=min_time_to_use,
        percent_split=percent_split
    )

    time_values = data[TIME_COL]
    fold_split = []
    for train_times, test_times in time_splits:
        train_rows = np.where(time_values.isin(train_times))[0]
        test_rows = np.where(time_values.isin(test_times))[0]
        fold_split.append([train_rows, test_rows])

    return fold_split

def save_base_dataset(top_n=100):
    """Build the full training frame and persist it together with the feature list.

    Steps:
      1. Read ``supplemental_train.csv`` in chunks, add the row-wise aggregate
         features to each chunk and spill it to a temporary pickle.
      2. Reload the chunks, concatenate them and delete the temporary files.
      3. Append the result to the previously saved ``train.pkl``.
      4. Select the best-correlated features and add their per-time-id means.
      5. Write ``appended.pkl`` (data) and ``used_feature.pkl`` (column names).

    Parameters
    ----------
    top_n : int, default 100
        Currently unused: the number of selected features is driven by the
        module-level ``TOP_N`` read inside ``get_top_n_corr``.

    Notes
    -----
    The aggregate ``all_*`` columns are only computed here for the
    supplemental rows; presumably ``train.pkl`` already contains them --
    TODO confirm.
    """
    # dtype mapping derived from the header of train.csv
    column_training = pd.read_csv(os.path.join(path_data, 'train.csv'), nrows=1).columns
    dtype_mapping = {
        x: 'int32' if x == TIME_COL else 'float16'
        for x in column_training
        if x not in ('investment_id', 'row_id')
    }

    print('Import online\n')
    chunk_reader = pd.read_csv(
        os.path.join(path_data, 'supplemental_train.csv'),
        usecols=dtype_mapping.keys(), dtype=dtype_mapping,
        chunksize=50000
    )

    print('Saving chunks\n')
    chunk_paths = []
    for i, online_df in enumerate(chunk_reader):
        print(i, end="\r")
        online_df = get_flat_features(online_df)
        chunk_path = f'online_df_{i}.pkl'
        online_df.to_pickle(chunk_path)
        chunk_paths.append(chunk_path)
        del online_df
        gc.collect()

    print('Appending chunks\n')
    chunks = []
    for chunk_path in chunk_paths:
        chunks.append(pd.read_pickle(chunk_path))
        # Plain Python removal instead of the IPython-only `!rm` shell escape
        os.remove(chunk_path)
    # Single concatenation instead of re-copying the accumulated frame per chunk
    online_train = pd.concat(chunks)
    del chunks
    gc.collect()

    print('Reading old train\n')
    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    train = pd.read_pickle(os.path.join(path_old_data, 'train.pkl'))

    print('Append all\n')
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent call
    train = pd.concat([train, online_train], ignore_index=True)

    del online_train
    gc.collect()

    print('get top n\n')
    best_feature = get_top_n_corr(train)

    gc.collect()

    print('preprocess\n')
    train, time_features, aggregate_features = preprocess_train_dataset(train, best_feature)
    gc.collect()

    used_feature = STARTING_NUMERIC_FEAT_NAME + time_features + aggregate_features

    train.to_pickle('appended.pkl')

    del train
    gc.collect()

    with open('used_feature.pkl', 'wb') as file:
        pickle.dump(used_feature, file)
        
def get_dataset(top_n=100):
    """Split the saved training frame into folds and dump each fold as .npy files.

    Reads ``used_feature.pkl`` and ``appended.pkl`` (the latter is deleted
    right after loading), computes the folds with ``get_fold`` and, for each
    fold ``i``, writes ``train_x_{i}.npy``, ``train_y_{i}.npy``,
    ``valid_x_{i}.npy`` and ``valid_y_{i}.npy`` as float16 arrays.

    Parameters
    ----------
    top_n : int, default 100
        Unused; kept for interface compatibility.
    """
    with open('used_feature.pkl', 'rb') as file:
        used_feature = pickle.load(file)

    train = pd.read_pickle('appended.pkl')
    # Plain Python removal instead of the IPython-only `!rm` shell escape
    os.remove('appended.pkl')

    fold_split = get_fold(train)

    for i, (train_index, test_index) in enumerate(fold_split):
        print(f'Train shape: {len(train_index)}; Test shape: {len(test_index)}')
        print(f'Saving np dataset {i}', end='\r')

        # Same extract / save / free sequence for the train and the valid part
        for prefix, row_index in (('train', train_index), ('valid', test_index)):
            x_values = train[used_feature].iloc[row_index].to_numpy('float16')
            y_values = train[TARGET_COL].iloc[row_index].to_numpy('float16')

            gc.collect()

            with open(f'{prefix}_x_{i}.npy', 'wb') as file:
                np.save(file, x_values)

            with open(f'{prefix}_y_{i}.npy', 'wb') as file:
                np.save(file, y_values)

            # Free the arrays before extracting the next part
            del x_values, y_values
            gc.collect()

In [6]:
# Spawn a child process so the memory it used is released when it ends
proc = mp.Process(target=save_base_dataset)
proc.start()
proc.join()

# A crashed child would otherwise go unnoticed until a later cell fails on
# the files it was supposed to write.
if proc.exitcode != 0:
    raise RuntimeError(f'save_base_dataset failed with exit code {proc.exitcode}')

gc.collect()

Import online

Saving chunks

Appending chunks

Reading old train

Append all

get top n

preprocess

98

  import sys


99

43

In [7]:
# Spawn a child process so the memory it used is released when it ends
proc = mp.Process(target=get_dataset)
proc.start()
proc.join()

# A crashed child would otherwise go unnoticed until a later cell fails on
# the fold files it was supposed to write.
if proc.exitcode != 0:
    raise RuntimeError(f'get_dataset failed with exit code {proc.exitcode}')

gc.collect()

Train shape: 3173178; Test shape: 817600
Train shape: 2996425; Test shape: 818756
Train shape: 2950713; Test shape: 822160
Train shape: 2918437; Test shape: 820871
Train shape: 3088295; Test shape: 817337
Saving np dataset 4

21

In [8]:
# Reload the ordered list of model input columns written by save_base_dataset
# (that function ran in a child process, so nothing it computed is in this kernel).
with open('used_feature.pkl', 'rb') as file:
    used_feature = pickle.load(file)

In [9]:
# Recover the base feature names from the 'time_id_<feature>_mean' column names,
# i.e. the features whose per-time-id mean must be recomputed at inference time.
time_col_list = [re.search('time_id_(.*)_mean', x).group(1) for x in used_feature if 'time_id_' in x]

In [10]:
# Force a garbage-collection pass; the cell output is the number of objects collected.
gc.collect()

63

In [11]:
def inference_preprocess(data, time_col_list=time_col_list):
    """Rebuild on a test batch the engineered columns used during training.

    ``data`` is modified in place and returned. Added columns:
      * ``time_id`` parsed from ``row_id``;
      * ``time_id_<col>_mean`` (float16) for every ``col`` in
        ``time_col_list``: mean of ``col`` over the rows of the batch that
        share the same ``time_id``;
      * the row-wise aggregates produced by ``get_flat_features``.

    Parameters
    ----------
    data : pandas.DataFrame
        Test batch containing ``row_id`` and the starting feature columns.
    time_col_list : list of str
        Base features for which a per-time-id mean is computed.
    """
    # Take everything before the first '_' rather than a fixed 4-character
    # slice, so time ids with fewer or more than 4 digits parse correctly.
    # Assumes row_id looks like '<time_id>_<investment_id>' -- TODO confirm.
    data['time_id'] = data['row_id'].str.split('_', n=1).str[0].astype(np.int32)

    for col in time_col_list:
        mapper_mean = data.groupby(['time_id'])[col].mean().to_dict()
        data[f'time_id_{col}_mean'] = data['time_id'].map(mapper_mean).astype(np.float16)

    # Same aggregates as at training time, from the single shared implementation
    return get_flat_features(data)


In [12]:
# LightGBM parameters; this dict is the default `params` of train_lgb.
PARAMS_LGB = {
    # NOTE(review): 'voting' is LightGBM's voting-parallel learner, meant for
    # distributed training -- confirm it has any effect in this single-process run.
    'tree_learner': 'voting',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'n_jobs': -1,
    'num_leaves': 2**8,  # 256
    'learning_rate': 0.05,
    # Column / row subsampling; no explicit seed is set, so LightGBM's defaults apply.
    'feature_fraction': 0.75,
    'bagging_freq': 5,
    'bagging_fraction': 0.80,
    'lambda_l2': 1,
    'verbosity': -1
}


In [13]:
def corr_sharpe_lgb(
    y_pred: np.array, dtrain: lgb.Dataset,
) -> Tuple[str, float, bool]:
    """Custom LightGBM evaluation function based on the Pearson correlation.

    Compares ``y_pred`` with the labels stored in ``dtrain`` and returns the
    triple ``('pearson_corr', correlation, True)``; the trailing ``True``
    marks the metric as higher-is-better.
    """
    labels = dtrain.get_label()
    correlation, _p_value = pearsonr(labels, y_pred)

    return 'pearson_corr', correlation, True

def train_lgb(params = PARAMS_LGB):
    """Train one LightGBM model per fold and persist models and histories.

    For each fold ``i`` in ``range(N_FOLD)`` the arrays ``train_x_{i}.npy``,
    ``train_y_{i}.npy``, ``valid_x_{i}.npy`` and ``valid_y_{i}.npy`` are
    loaded, a booster is trained for ``N_ROUND`` rounds while being evaluated
    on the validation part, and the model is saved to ``lgb_{i}.txt``.

    At the end the list of boosters is pickled to ``model_list_lgb.pkl`` and
    the list of recorded evaluation histories to ``progress_list_lgb.pkl``.

    Parameters
    ----------
    params : dict, default PARAMS_LGB
        LightGBM training parameters passed to ``lgb.train``.
    """
    model_list = []
    progress_list = []

    for i in range(N_FOLD):
        print(f'\n\nStarting fold {i}\n\n\n')

        train_x_num, train_y = np.load(f'train_x_{i}.npy'), np.load(f'train_y_{i}.npy')
        test_x_num, test_y = np.load(f'valid_x_{i}.npy'), np.load(f'valid_y_{i}.npy')

        train_matrix = lgb.Dataset(train_x_num, train_y)
        test_matrix = lgb.Dataset(test_x_num, test_y)

        # Drop the local names; only the Dataset objects are needed from here on
        del train_x_num, train_y, test_x_num, test_y

        gc.collect()

        # Filled by record_evaluation with the per-round metrics of this fold
        progress = {}

        callbacks_list = [
            lgb.record_evaluation(progress),
            lgb.log_evaluation(period=50, show_stdv=False)
        ]

        model = lgb.train(
            # Honour the caller's parameters (the global PARAMS_LGB was
            # previously hard-coded here, silently ignoring the argument).
            params=params,
            train_set=train_matrix,
            num_boost_round=N_ROUND,
            valid_sets=[test_matrix],
            valid_names=['valid'],
            callbacks=callbacks_list,
            feval=corr_sharpe_lgb,
        )

        model.save_model(f'lgb_{i}.txt')

        model_list.append(model)
        progress_list.append(progress)

        del (
            train_matrix, test_matrix
        )
        gc.collect()

    with open('model_list_lgb.pkl', 'wb') as file:
        pickle.dump(model_list, file)

    with open('progress_list_lgb.pkl', 'wb') as file:
        pickle.dump(progress_list, file)

In [14]:
# Force a garbage-collection pass before training starts.
gc.collect()

21

In [15]:
#spawn process to realase memory at the end
proc = mp.Process(target=train_lgb)
proc.start()
proc.join()

gc.collect()



Starting fold 0



[50]	valid's l2: 0.763786	valid's pearson_corr: 0.219715
[100]	valid's l2: 0.762636	valid's pearson_corr: 0.223352
[150]	valid's l2: 0.762338	valid's pearson_corr: 0.224723
[200]	valid's l2: 0.762293	valid's pearson_corr: 0.225506
[250]	valid's l2: 0.762593	valid's pearson_corr: 0.225455
[300]	valid's l2: 0.76283	valid's pearson_corr: 0.225284
[350]	valid's l2: 0.762957	valid's pearson_corr: 0.225177
[400]	valid's l2: 0.763232	valid's pearson_corr: 0.224926
[450]	valid's l2: 0.763644	valid's pearson_corr: 0.224203
[500]	valid's l2: 0.763755	valid's pearson_corr: 0.224196
[550]	valid's l2: 0.76405	valid's pearson_corr: 0.223733
[600]	valid's l2: 0.764196	valid's pearson_corr: 0.223619
[650]	valid's l2: 0.764406	valid's pearson_corr: 0.223316
[700]	valid's l2: 0.764531	valid's pearson_corr: 0.223148
[750]	valid's l2: 0.764691	valid's pearson_corr: 0.223038
[800]	valid's l2: 0.764906	valid's pearson_corr: 0.222671
[850]	valid's l2: 0.765048	valid's pearson_corr: 0.222

21

# Find best epoch

In [16]:
# Training ran in a child process, so its results are read back from disk:
# the list of trained boosters (one per fold) ...
with open('model_list_lgb.pkl', 'rb') as file:
    model_list_lgb = pickle.load(file)

# ... and the per-fold evaluation histories recorded during training.
with open('progress_list_lgb.pkl', 'rb') as file:
    progress_list_lgb = pickle.load(file)

In [17]:
# One column per fold holding the validation correlation at every boosting
# round, plus a 'time' column with the 0-based round index.
progress_dict_lgb = {
    'time': range(N_ROUND),
    **{
        f'corr_fold_{i}': progress_list_lgb[i]['valid']['pearson_corr']
        for i in range(N_FOLD)
    },
}

progress_df_lgb = pd.DataFrame(progress_dict_lgb)

# Average of the per-fold correlation columns, round by round
progress_df_lgb['average_corr'] = progress_df_lgb.filter(like='corr').mean(axis=1)

# Position (0-based) and value of the best average correlation
best_epoch_lgb = int(progress_df_lgb['average_corr'].argmax())
best_score_lgb = progress_df_lgb['average_corr'].max()

print(f'Best epoch: {best_epoch_lgb}, CV-Corr: {best_score_lgb:.3f}')

# Stored as a 1-based number of rounds, hence the +1 on the 0-based position
best_result_lgb = dict(
    best_epoch=best_epoch_lgb + 1,
    best_score=best_score_lgb,
)

gc.collect()


Best epoch: 209, CV-Corr: 0.255


0

In [18]:
# Submission loop: for every test batch, rebuild the features, predict with
# each fold model and submit the average of the fold predictions.
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission
for (test_df, sample_prediction_df) in iter_test:
    # Same engineered columns and column order as used for training
    test_values = inference_preprocess(test_df)[used_feature].to_numpy(dtype='float16')
    
    # One flattened prediction vector per fold model, each limited to the
    # number of rounds selected in best_result_lgb['best_epoch']
    pred = [
        np.reshape(model.predict(test_values, num_iteration = best_result_lgb['best_epoch']), (-1)) for model in model_list_lgb
    ]

    # Equal-weight blend across fold models
    pred_blend = np.stack(pred).mean(axis=0)


    sample_prediction_df['target'] = pred_blend  # make your predictions here
    env.predict(sample_prediction_df)   # register your predictions


This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


  """
