In [1]:
import pandas as pd
from scipy import stats
import os
import gc
import numpy as np
import plotly.express as px
import tensorflow as tf
import time
import tensorflow_probability as tfp
import pickle
import time
from tensorflow.keras import mixed_precision

from ubq_nn_utils import (
    PARAMS, tfp_correlation, create_ae_mlp, get_ae_dataset
)

from ubq_utilities import (
    get_time_series_cross_val_splits,
    get_hardware_strategy,
    calculate_corr, corr_sharpe_lgb,
    TIME_COL, FOLD_NAME, TARGET_COL,
    STARTING_FEATURE_NAME, N_FOLD, 
    STARTING_CAT_FEAT_NAME, STARTING_NUMERIC_FEAT_NAME,
    RANDOM_STATE, SUBSAMPLE_FOLD
)

# --- Run configuration ---------------------------------------------------
# Directory the pickled inputs are read from by the loading cells below.
path_data = '../input/ubq-preprocess-mean-other'

# Number of epochs passed to model.fit for every fold.
N_ROUND = 100
# NOTE: this rebinds the N_FOLD imported from ubq_utilities just above.
N_FOLD = 5

# Seed TensorFlow's global random generator with the project-wide seed.
tf.random.set_seed(RANDOM_STATE)

2022-04-09 16:46:02.719550: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-04-09 16:46:02.719678: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


2022-04-09 16:46:09.540135: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-04-09 16:46:09.544375: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-04-09 16:46:09.544427: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-04-09 16:46:09.544459: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (8d1a0ce7d36a): /proc/driver/nvidia/version does not exist
2022-04-09 16:46:09.545873: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operation

In [2]:
# Ask the project helper for the accelerator handle and the matching
# tf.distribute strategy.
tpu, strategy = get_hardware_strategy()

# tf.data autotune sentinel (not referenced by the cells visible here).
AUTO = tf.data.experimental.AUTOTUNE
# Batch size handed to get_ae_dataset: 256 times the strategy's replica count.
BATCH_SIZE = strategy.num_replicas_in_sync * 256

2022-04-09 16:46:09.647220: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-04-09 16:46:09.675330: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.0.0.2:8470}
2022-04-09 16:46:09.675399: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:30020}
2022-04-09 16:46:09.699974: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.0.0.2:8470}
2022-04-09 16:46:09.700049: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:30020}
2022-04-09 16:46:09.700648: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:411] Started server with target: grpc://localhost:30020


Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [3]:
# Load the collection of feature column names used to select model inputs.
# NOTE(review): pickle.load can execute arbitrary code; only point this at
# files produced by a trusted preprocessing run.
feature_path = os.path.join(path_data, 'ALL_FEATURE.pkl')
with open(feature_path, 'rb') as feature_file:
    ALL_FEATURE = pickle.load(feature_file)

# Both feature counts in PARAMS are set to the number of loaded features.
PARAMS['num_total_feature'] = PARAMS['num_original_feature'] = len(ALL_FEATURE)

In [4]:
print('Importing pd df')
# Read the preprocessed training frame from the input directory.
train_path = os.path.join(path_data, 'train_16_fe.pkl')
data = pd.read_pickle(train_path)

# Reclaim memory after the load; the returned count is the cell's output.
gc.collect()

Importing pd df


64

In [5]:
# Sanity check: row count of the loaded frame and the feature count stored in PARAMS.
print(f"Using: {data.shape[0]} rows; {PARAMS['num_total_feature']} columns")

Using: 3141410 rows; 405 columns


In [6]:
# Ask the project helper for the time-series CV splits. Each item is consumed
# below as a (train, test) pair of TIME_COL values.
# NOTE(review): `embargo=50` is presumably a gap, in time ids, kept between
# train and test periods — confirm against get_time_series_cross_val_splits.
fold_embargo_zip = get_time_series_cross_val_splits(data, cv=N_FOLD, embargo=50, min_time_to_use=0)

# Convert every pair of time-id sets into *positional* row numbers of `data`
# (np.where on the boolean mask returns positions, not index labels).
fold_split = [
    [
        np.where(data[TIME_COL].isin(train_index))[0],
        np.where(data[TIME_COL].isin(test_index))[0]
    ]
    for train_index, test_index in fold_embargo_zip
]

# Because fold_split stores positions, rows must be looked up with .iloc.
# Label-based .loc only gives the same rows when `data` carries a default
# RangeIndex and otherwise selects wrong rows or raises KeyError. The training
# and OOF cells already use .iloc with these indices.
time_id_split = [
    data[TIME_COL].iloc[test_index].values
    for _, test_index in fold_split
]

# Target value of the first test row of every fold.
first_element_test_split = np.array([
    data[TARGET_COL].iloc[test_index[0]]
    for _, test_index in fold_split
])

In [7]:
# Run a full garbage-collection pass after building the fold indices; the
# returned count of unreachable objects is echoed as the cell output.
gc.collect()

21

In [8]:
# Directory the per-epoch weight checkpoints are written to during training.
# exist_ok=True keeps the cell re-runnable: without it a second execution
# raises FileExistsError because the directory is already there.
os.makedirs('./checkpoint', exist_ok=True)

In [9]:
# Another explicit garbage-collection pass; the number it returns is the cell output.
gc.collect()

42

In [10]:
# Override the 'lr' entry of the shared PARAMS mapping (forwarded to
# create_ae_mlp via **PARAMS; presumably the optimizer learning rate).
PARAMS['lr'] = 8e-4

In [11]:
# Initialise the TPU system for the handle returned by get_hardware_strategy();
# the returned Topology object is shown as the cell output.
# NOTE(review): `strategy` was already created in an earlier cell — confirm that
# (re)initialising the TPU at this point is intended and does not invalidate it.
tf.tpu.experimental.initialize_tpu_system(tpu)

<tensorflow.python.tpu.topology.Topology at 0x7f8750c7f250>

In [12]:
# Explicit garbage-collection pass after the TPU initialisation; the returned
# count is echoed as the cell output.
gc.collect()

63

In [13]:
# Architecture overrides forwarded to create_ae_mlp through **PARAMS.
# NOTE(review): there are 9 dropout rates for 8 hidden-unit entries —
# confirm this is the layout create_ae_mlp expects.
PARAMS['hidden_units'] = [
    96, 96,
    1024, 512, 256, 256,
    64, 64,
]
PARAMS['dropout_rates'] = [
    0.01, 0.05,
    0.4, 0.4, 0.4, 0.4, 0.4,
    0.1, 0.1,
]

In [14]:
# One DataFrame of Keras history per fold, in fold order; read by the
# epoch-selection cell further down.
progress_list = []

# Extra arguments handed to create_ae_mlp as `compile_other_params` (none here).
compile_params = {}

for i, (train_index, test_index) in enumerate(fold_split):

    print(f'\n\nStarting fold {i}\n\n\n')

    # fold_split holds positional row numbers, hence .iloc. Features and
    # target are materialised as float16 arrays.
    train_x_num, train_y = (
        data[ALL_FEATURE].iloc[train_index].to_numpy('float16'),
        data[TARGET_COL].iloc[train_index].to_numpy('float16')
    )
    test_x_num, test_y = (
        data[ALL_FEATURE].iloc[test_index].to_numpy('float16'),
        data[TARGET_COL].iloc[test_index].to_numpy('float16')
    )
    # NOTE: deliberately left as a notebook-level name. get_model_weights
    # reads it later, so it ends up holding the value of the last fold.
    STEPS_PER_EPOCH = train_x_num.shape[0] // BATCH_SIZE

    train_dataset = get_ae_dataset(
        PARAMS['num_total_feature'], train_x_num, train_y, batch_size=BATCH_SIZE
    )
    test_dataset = get_ae_dataset(
        PARAMS['num_total_feature'], test_x_num, test_y, train=False, batch_size=BATCH_SIZE
    )

    gc.collect()

    # "{epoch:03d}" is left unformatted on purpose: Keras fills it in when it
    # writes each checkpoint, while the fold number is fixed here.
    checkpoint_path = "checkpoint/{epoch:03d}_" + f"model_fold_{i}.hdf5"

    # Build (and compile) the model inside the distribution strategy scope.
    with strategy.scope():
        model = create_ae_mlp(
            **PARAMS, steps=STEPS_PER_EPOCH,
            metrics={'output': tfp_correlation},
            compile_other_params=compile_params
        )

    # Weights-only checkpoints so that any epoch can be reloaded once the
    # best epoch across folds is known.
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        checkpoint_path,
        verbose=0,
        save_weights_only=True
    )

    history = model.fit(
        train_dataset,
        validation_data=test_dataset,
        epochs=N_ROUND,
        callbacks=[model_checkpoint],
        verbose=0,
    )

    # Keep only the metric history; everything else is released below.
    progress = pd.DataFrame(history.history)
    progress_list.append(progress)

    # Drop the large per-fold objects before the next fold allocates its own.
    del (
        train_x_num, train_y, test_x_num, test_y, model_checkpoint,
        history, model, train_dataset,
        test_dataset
    )

    gc.collect()



Starting fold 0





2022-04-09 16:47:19.250013: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 2012712300 exceeds 10% of free system memory.
2022-04-09 16:47:27.366228: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 2012712300 exceeds 10% of free system memory.




Starting fold 1





2022-04-09 17:05:39.046609: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1955192580 exceeds 10% of free system memory.
2022-04-09 17:05:47.008505: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1955192580 exceeds 10% of free system memory.




Starting fold 2





2022-04-09 17:23:46.945338: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1850421510 exceeds 10% of free system memory.




Starting fold 3





Starting fold 4





# Save dataset and make predictions with the best epoch

In [15]:
# Garbage-collection pass once all folds have been trained; the returned count
# is echoed as the cell output.
gc.collect()

21

In [16]:
def _row_mean(frame, marker):
    """Row-wise mean over the columns of `frame` whose name contains `marker`."""
    matching = [name for name in frame.columns if marker in name]
    return frame[matching].mean(axis=1)


# One row per epoch; columns are grouped by metric, then ordered by fold.
progress_dict = {'time': range(N_ROUND)}
for i in range(N_FOLD):
    progress_dict[f'corr_fold_{i}'] = progress_list[i]['val_output_tfp_correlation'].values
for i in range(N_FOLD):
    progress_dict[f'loss_fold_{i}'] = progress_list[i]['val_output_loss'].values
for i in range(N_FOLD):
    progress_dict[f'loss_ae_fold_{i}'] = progress_list[i]['val_ae_output_loss'].values

progress_df = pd.DataFrame(progress_dict)

# Average every metric across folds, epoch by epoch.
progress_df['average_corr'] = _row_mean(progress_df, 'corr_fold_')
progress_df['average_loss'] = _row_mean(progress_df, 'loss_fold_')
progress_df['average_ae_loss'] = _row_mean(progress_df, 'loss_ae_fold_')

# Row position (0-based) of the epoch with the highest mean validation correlation.
average_corr = progress_df['average_corr']
best_epoch = int(average_corr.argmax())
best_score = average_corr.max()
best_loss = progress_df.loc[best_epoch, 'average_loss']
best_ae_loss = progress_df.loc[best_epoch, 'average_ae_loss']

print(f'Best epoch: {best_epoch}, CV-Corr: {best_score:.3f}, CV-Loss: {best_loss:.3f}; CV-ae Loss: {best_ae_loss:.3f}')

# The stored epoch is shifted by one relative to the 0-based row position above.
best_result = dict(best_epoch=best_epoch + 1, best_score=best_score)

with open('best_result.pkl', 'wb') as result_file:
    pickle.dump(best_result, result_file)

gc.collect()

Best epoch: 54, CV-Corr: 0.134, CV-Loss: 0.832; CV-ae Loss: 0.831


0

# OOF Prediction

In [17]:
def get_model_weights(fold, epoch, strategy):
    """Rebuild the AE-MLP and load the weights checkpointed for one fold.

    Args:
        fold: Fold number as it appears in the checkpoint file name.
        epoch: Epoch number exactly as it appears in the checkpoint file
            name (the caller passes ``best_epoch + 1``). It is zero-padded
            to three digits, mirroring the ``{epoch:03d}`` template used
            when the checkpoints were written.
        strategy: Distribution strategy. Accepted for interface
            compatibility but not used: the model is built outside any
            strategy scope, as before.

    Returns:
        The model returned by ``create_ae_mlp`` with the requested weights
        loaded.
    """
    # The previous version ignored `epoch`, read the global `best_epoch`, and
    # sized the padding from `best_epoch` while printing `best_epoch + 1`.
    # That produced names such as "0010" or "0100" whenever the +1 added a
    # digit. Formatting the argument itself avoids both problems.
    epoch_path = f"{int(epoch):03d}"

    # NOTE(review): STEPS_PER_EPOCH is the notebook-level value left behind by
    # the last training fold — confirm that `steps` does not need to be
    # fold-specific when the model is only used for inference.
    model = create_ae_mlp(
        **PARAMS, steps=STEPS_PER_EPOCH,
        metrics={'output': tfp_correlation},
    )
    model.load_weights(f"checkpoint/{epoch_path}_model_fold_{fold}.hdf5")
    return model

# One restored model per fold, in fold order, all at the selected epoch.
model_list = []
for fold in range(N_FOLD):
    model_list.append(
        get_model_weights(fold, best_result['best_epoch'], strategy)
    )

In [18]:
# Out-of-fold buffers with one slot per row of `data`. A row that never
# appears in a test split keeps its initial 0.
n_rows = data.shape[0]
oof_predictions = np.zeros(n_rows)
oof_ae_predictions = np.zeros(n_rows)
oof_add_predictions = np.zeros(n_rows)

for i, (_, test_index) in enumerate(fold_split):
    # Overwrite the same console line with the current fold number.
    print(i, end='\r')
    model = model_list[i]

    test_x_num = data[ALL_FEATURE].iloc[test_index].to_numpy('float16')

    val_pred = model.predict(test_x_num)

    # NOTE(review): assumes predict returns a sequence of arrays in which
    # index 1 is the auxiliary (AE) head and the last item is the main
    # output — confirm against create_ae_mlp.
    val_pred_ae = val_pred[1].reshape(-1)
    val_pred_out = val_pred[-1].reshape(-1)

    oof_predictions[test_index] = val_pred_out
    oof_ae_predictions[test_index] = val_pred_ae
    # Simple average of the two heads.
    oof_add_predictions[test_index] = (val_pred_out + val_pred_ae) / 2

oof_df = pd.DataFrame({
    'time_id': data['time_id'],
    'y_true': data['target'],
    'y_pred': oof_predictions,
    'y_pred_ae': oof_ae_predictions,
    'y_pred_ens': oof_add_predictions,
})

# Save out of folds csv for blending
oof_df.to_csv('ae_mlp.csv', index=False)

# calculate_corr is indexed with [0]; only its first element is reported.
score = calculate_corr(oof_df)[0]
score_ae = calculate_corr(oof_df, pred_col='y_pred_ae')[0]
score_ens = calculate_corr(oof_df, pred_col='y_pred_ens')[0]

print(
    f"""Our out of folds mean pearson correlation coefficient is:
Primary Model: {score:.4f}\nAutoencoder: {score_ae:.4f}\nEnsemble: {score_ens:.4f}"""
)

Our out of folds mean pearson correlation coefficient is:
Primary Model: 0.1328
Autoencoder: 0.1284
Ensemble: 0.1334


# CV score

In [19]:
# Persist the per-epoch validation metrics, including the fold-average columns.
progress_df.to_csv('result.csv', index=False)

In [20]:
# One line chart per fold-averaged metric, plotted against the epoch index.
for col in ('average_corr', 'average_loss', 'average_ae_loss'):
    fig = px.line(progress_df, x="time", y=col, title=col, template='plotly_white')
    fig.show()