- Reference: [https://www.kaggle.com/yamqwe/time-series-modeling-lstm](https://www.kaggle.com/yamqwe/time-series-modeling-lstm)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc
## data and date
# Load the raw training rows keyed by timestamp and attach readable asset names.
dat = pd.read_csv('./data/train.csv')
dat = dat.set_index('timestamp')
info = pd.read_csv('./data/asset_details.csv')
assets_names = dict(zip(info.Asset_ID, info.Asset_Name))
dat['Asset_name'] = dat.Asset_ID.map(assets_names)
# Number of rows sharing each timestamp; it is 14 when the timestamp is
# consistent (one row per asset).
dat['asset_count'] = dat.groupby(level=0)['Asset_ID'].transform('size')
dat.head()
# Keep only the timestamps that carry all 14 rows.
all_same_time = dat[dat['asset_count'] == 14]
all_same_time = all_same_time.drop('asset_count', axis=1)
all_same_time.head()
## Target correlation map
# Pivot only the Target column (timestamps as rows, assets as columns) so the
# other columns are never spread into a wide frame, then correlate the assets.
corr_target = (
    all_same_time.reset_index()
    .pivot(index='timestamp', columns='Asset_name', values='Target')
    .corr()
)
fig, ax = plt.subplots(figsize=(20, 8))
sns.heatmap(np.round(corr_target, 2), annot=True, ax=ax, square=True)
# Release the large frames before the modelling cells run.
del all_same_time
del dat
gc.collect()

## LSTM model

## configure

In [None]:
import os
import gc
import traceback
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import pandas as pd, numpy as np
from tensorflow.keras import layers
import tensorflow.keras.backend as K
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Star import of the run configuration. Presumably this is where the constants
# used below (DEBUG, N_ASSETS, WINDOW_SIZE, BATCH_SIZE, EPOCHS,
# PCT_VALIDATION) come from — TODO confirm against script/LSTM/configure.py.
from script.LSTM.configure import *


### data loading

- choose which years to load

In [None]:
from script.LSTM.dataload import load_data_for_all_assets

Record the order in which the assets appear.

In [None]:
# Load the rows for all assets and index them chronologically by timestamp.
train = load_data_for_all_assets(load_jay=True, includeextra=True)
train = train.sort_values('timestamp')
train = train.set_index("timestamp")
if DEBUG:
    # Debug runs skip the first 10,000,000 rows.
    train = train[10000000:]

test = pd.read_csv('./data/example_test.csv')
#sample_prediction_df = pd.read_csv('./data/' + 'example_sample_submission.csv')

# Attach each row's asset name and weight through Asset_ID lookups.
assets = pd.read_csv('./data/asset_details.csv')
assets_names = dict(zip(assets.Asset_ID, assets.Asset_Name))
assets_weight = dict(zip(assets.Asset_ID, assets.Weight))
train['Asset_Name'] = train.Asset_ID.map(assets_names)
train['Weight'] = train.Asset_ID.map(assets_weight)

# Position of every asset among the first N_ASSETS rows of the supplemental
# data, i.e. the order of the cryptos collected at the same timestamp.
supplemental_ids = pd.read_csv('./data/supplemental_train.csv').Asset_ID[:N_ASSETS]
assets_order = {asset_id: position for position, asset_id in enumerate(supplemental_ids)}
first_and_last = train.index.values.astype('datetime64[s]')[[0, -1]]
print(f"Loaded all data range {first_and_last}")

## feature encoding

- Upper_Shadow, Lower_Shadow, spread, mean_trade, log_price_change

In [None]:
from script.LSTM.encodefeature import get_features

# Missing targets become 0.
train['Target'] = train['Target'].fillna(0)
# Infinite VWAP values are clamped to the largest / smallest finite VWAP
# (np.nan_to_num also turns NaN into 0 by default).
finite_vwap = train.VWAP[np.isfinite(train.VWAP)]
VWAP_max = np.max(finite_vwap)
VWAP_min = np.min(finite_vwap)
train['VWAP'] = np.nan_to_num(train.VWAP, posinf=VWAP_max, neginf=VWAP_min)

# Build one "<timestamp ordinal>_<Asset_ID>" key per loaded row; `times` maps
# each distinct timestamp to its ordinal in order of appearance.
key_frame = train[['Asset_ID']].copy()
times = {stamp: position for position, stamp in enumerate(key_frame.index.unique())}
key_frame['id'] = key_frame.index.map(times)
key_frame['id'] = key_frame['id'].astype(str) + '_' + key_frame['Asset_ID'].astype(str)
ids = key_frame['id'].copy()
del key_frame

train = get_features(train)
# Every column except the bookkeeping ones counts as a training feature.
excluded = ['Target', 'date', 'timestamp', 'Asset_ID', 'groups']
train_features = [column for column in train.columns if column not in excluded]

### missing data

In [None]:
## fill missing data
# Order the rows by timestamp and collect the distinct timestamps; `reindex`
# reads the first and the last of them.
train = train.sort_index()
ind = train.index.drop_duplicates()

def reindex(df, start=None, stop=None, step=60):
    """Put *df* on a regular integer grid and fill the gaps.

    The frame is reindexed onto ``range(start, stop + step, step)``; each grid
    point takes the row of the nearest existing index label. Any values that
    are still missing are forward-filled, then backward-filled.

    Args:
        df: Frame whose index holds the grid values (it must be sorted for the
            ``nearest`` lookup to work).
        start: First grid value. Defaults to ``ind[0]`` of the module-level
            ``ind``.
        stop: Last grid value (inclusive). Defaults to ``ind[-1]``.
        step: Spacing between grid values. Defaults to 60.

    Returns:
        The reindexed, gap-filled frame.
    """
    # Fall back to the module-level timestamp index when no bounds are given,
    # which is how `groupby(...).apply(reindex)` calls this function.
    if start is None:
        start = ind[0]
    if stop is None:
        stop = ind[-1]
    df = df.reindex(range(start, stop + step, step), method='nearest')
    # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
    return df.ffill().bfill()

# Regularise each asset's series separately, drop the Asset_ID level that the
# group-by adds to the index, and restore timestamp order.
train = (
    train.groupby('Asset_ID')
    .apply(reindex)
    .reset_index(level=0, drop=True)
    .sort_index()
)
gc.collect()
train.shape

In [None]:
# Match rows back to the loaded records and flag generated rows as 'non-real'
# so the masking layer can ignore them.
train['group_num'] = train.index.map(times)
# Rows whose timestamp never occurred in the loaded data have no ordinal.
train = train.dropna(subset=['group_num'])
train['group_num'] = train['group_num'].astype('int')
row_keys = train['group_num'].astype(str) + '_' + train['Asset_ID'].astype(str)
train['is_real'] = row_keys.isin(ids) * 1

# Every value of a 'non-real' row is zeroed, apart from the bookkeeping columns.
bookkeeping_columns = ['Asset_ID', 'group_num', 'is_real']
features = train.columns.drop(bookkeeping_columns)
train.loc[train.is_real == 0, features] = 0.

In [None]:
## order rows by timestamp ordinal, then by asset position
from script.LSTM.dataload import reduce_mem_usage

train['asset_order'] = train.Asset_ID.map(assets_order)
sort_keys = ['group_num', 'asset_order']
train = reduce_mem_usage(train.sort_values(by=sort_keys))
gc.collect()

## 3D arrays

In [None]:
# Targets become a 2-D array with N_ASSETS columns; the frame is then cut down
# to the model's input columns.
targets = np.reshape(train['Target'].to_numpy(), (-1, N_ASSETS))
non_input_columns = ['Asset_ID', 'Asset_Name', 'Weight', 'Target', 'group_num', 'is_real', 'date']
features = train.columns.drop(non_input_columns)
train = train.loc[:, features]

In [None]:
# Reshape the feature table into a 3-D array with N_ASSETS rows per slice.
train = train.values
n_features = train.shape[-1]
train = train.reshape(-1, N_ASSETS, n_features)

# The trailing 1/PCT_VALIDATION of the slices is held out for validation.
# The negative length is floor-divided, so the held-out size rounds up.
cut = -len(train) // PCT_VALIDATION
X_train, X_test = train[:cut], train[cut:]
y_train, y_test = targets[:cut], targets[cut:]

### dataset windowing

Samples with a duration of `WINDOW_SIZE` records (minutes) will be formed from the train array. Each sample has a target vector corresponding to the final index of its `WINDOW_SIZE` records.

In [None]:
from script.LSTM.datasetwindow import sample_generator

# Training and validation generators share the window length and batch size.
window_kwargs = dict(length=WINDOW_SIZE, batch_size=BATCH_SIZE)
train_generator = sample_generator(X_train, y_train, **window_kwargs)
val_generator = sample_generator(X_test, y_test, **window_kwargs)
first_samples = train_generator[0][0]
first_targets = train_generator[0][1]
print(f'Sample shape: {first_samples.shape}')#batch_0, window, n_assets, features
print(f'Target shape: {first_targets.shape}')#batch_0, N_ASSETS

## Compiling model

Our model will be trained for the number of `FOLDS` and `EPOCHS` you chose in the configuration above. For each fold, the model with the lowest validation loss will be saved and used to predict `OOF` (out-of-fold) and test data. Adjust the variable `VERBOSE` as needed: `VERBOSE=1` or `2` displays the training and validation loss for each epoch as text.

In [None]:
from script.LSTM.trainmodel import get_model

# Shape of the first training batch; read once and reused below.
# Presumably (batch, window, n_assets, features), per the print labels in the
# windowing cell — TODO confirm.
sample_shape = train_generator[0][0].shape
# Use N_ASSETS rather than a literal 14 so the model always matches the asset
# dimension used to shape the training arrays.
model = get_model(
    n_assets=N_ASSETS,
    trainshape=(sample_shape[1], N_ASSETS, sample_shape[-1]),
)
model.summary()

In [None]:
# Write a diagram (with tensor shapes) of a 3-asset model to ./pic/lstm-kaggle.png.
tf.keras.utils.plot_model(
    get_model(n_assets=3),
    to_file='./pic/lstm-kaggle.png',
    show_shapes=True,
)

## Fitting model

In [None]:
print(features)

tf.random.set_seed(0)
# Stop after 7 epochs without a better val_loss and roll back to the best weights.
estop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=7,
    verbose=0,
    restore_best_weights=True,
)
# NOTE(review): ExponentialDecay is defined in terms of optimizer steps, while
# LearningRateScheduler calls its schedule once per epoch — confirm that the
# resulting decay speed is the intended one.
decay_steps = 0.5 * len(X_train) / BATCH_SIZE
scheduler = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-3,
    decay_steps=decay_steps,
    decay_rate=1e-3,
)
lr = keras.callbacks.LearningRateScheduler(scheduler, verbose=1)

history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=EPOCHS,
    callbacks=[lr, estop],
)
model.save("kaggleLSTM")

In [None]:
# Disabled snippet, kept as a bare string literal so it does not execute: it
# reloads the saved "kaggleLSTM" model, passing `masked_cosine` and
# `Correlation` as custom objects.
'''from script.LSTM.trainmodel import *
model = keras.models.load_model("kaggleLSTM",custom_objects={"masked_cosine": masked_cosine, 
                                                    'Correlation':Correlation})'''

## Evaluation

In [None]:

# Per-epoch training history: losses on the left panel, the Correlation metric
# on the right.
histories = pd.DataFrame(history.history)
epochs = list(range(1, len(histories) + 1))
loss, val_loss = histories['loss'], histories['val_loss']
Correlation, val_Correlation = histories['Correlation'], histories['val_Correlation']

fig, ax = plt.subplots(1, 2, figsize=(16, 8))
panels = (
    ('Losses', ((loss, 'Train Loss'), (val_loss, 'Val Loss'))),
    ('Correlations', ((Correlation, 'Train Correlation'), (val_Correlation, 'Val Correlation'))),
)
for axis, (title, curves) in zip(ax, panels):
    for series, label in curves:
        axis.plot(epochs, series, label=label)
    axis.set_title(title)
    axis.set_xlabel('Epoch')
    axis.legend(loc='upper right')
fig.show()
gc.collect()

In [None]:
# Correlation between predictions and targets on the validation data, one
# line per asset.
predictions = model.predict(val_generator)

print('Asset:    Corr. coef.')
print('---------------------')
ordered_asset_ids = list(assets_order.keys())
for i in range(N_ASSETS):
    # The first WINDOW_SIZE - 1 targets have no counterpart in the generator's
    # labels, so they are skipped.
    y_true = np.squeeze(y_test[WINDOW_SIZE - 1:, i])
    y_pred = np.squeeze(predictions[:, i])
    # Only positions with a non-zero target enter the correlation.
    real_target_ind = np.flatnonzero(y_true)
    asset_id = ordered_asset_ids[i]
    asset_name = assets.loc[assets.Asset_ID == asset_id, 'Asset_Name'].item()
    corr = np.corrcoef(y_pred[real_target_ind], y_true[real_target_ind])[0, 1]
    print(f"{asset_name}: {corr:.4f}")

# Submission

[https://www.kaggle.com/c/g-research-crypto-forecasting/overview/evaluation](https://www.kaggle.com/c/g-research-crypto-forecasting/overview/evaluation)

In [None]:
# Asset order within one timestamp, taken from the first N_ASSETS rows of the
# supplemental training data.
data_path = './data'
sup_train = pd.read_csv(f'{data_path}/supplemental_train.csv')
assets_order = {
    asset_id: position
    for position, asset_id in enumerate(sup_train.Asset_ID[:N_ASSETS])
}
# Model input columns, in the order they are fed to the network.
features = [
    'Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP',
    'Upper_Shadow', 'Lower_Shadow', 'spread', 'mean_trade',
    'log_price_change', 'asset_order',
]

In [None]:
# Placeholder history for the first prediction: the first WINDOW_SIZE * N_ASSETS
# rows of supplemental_train.csv.
# NOTE(review): rows are not sorted by asset_order here; the reshape relies on
# the file already listing N_ASSETS rows per timestamp in a fixed order — confirm.
sup = sup_train[:WINDOW_SIZE * N_ASSETS]
placeholder = get_features(sup)
placeholder['asset_order'] = placeholder.Asset_ID.map(assets_order)

# Shape into the model's input layout, with a leading axis of length 1.
test_sample = placeholder[features].to_numpy()
n_columns = test_sample.shape[-1]
test_sample = test_sample.reshape(-1, N_ASSETS, n_columns)[np.newaxis, ...]

In [None]:
# Asset_ID -> asset_order lookup table used to order each test minute.
# It needs one row per asset, so the row count is N_ASSETS; the previous
# WINDOW_SIZE - 1 only matched that number by coincidence. `nrows` avoids
# reading the whole file just to keep its head.
example = pd.read_csv(data_path + '/example_test.csv', nrows=N_ASSETS)
example['asset_order'] = example.Asset_ID.map(assets_order)
# Keep only the two columns needed for the merge.
example = example[['Asset_ID', 'asset_order']]

In [None]:
for i in range(1):
    # One minute of test data plus its submission template.
    test_csv = f'./data/temp/{i}test_df.csv'
    sample_prediction_df_csv = f'./data/temp/{i}sample_prediction_df.csv'
    test_df = pd.read_csv(test_csv, usecols=range(1, 11))
    sample_prediction_df = pd.read_csv(sample_prediction_df_csv, usecols=range(1, 3))
    # Add the engineered feature columns.
    test_df = get_features(test_df)
    # Outer-join with the asset-order table, then arrange rows by asset_order.
    test_data = test_df.merge(example, how='outer', on='Asset_ID')
    test_data = test_data.sort_values('asset_order')
    # print(test_data.timestamp)
    # print(test_data[features].shape)
    # Feature matrix with missing values replaced by 0.
    test = test_data[features].fillna(0).to_numpy()
    print(test.shape)
    # Add two leading axes so the minute lines up with test_sample.
    test = test.reshape(-1, 1, N_ASSETS, test.shape[-1])
    print(test.shape)
    # Append the new minute to the history and keep the last WINDOW_SIZE steps.
    test_input = np.concatenate([test_sample, test], axis=1)[:, -WINDOW_SIZE:]
    print(test_input.shape)

    # y_pred = model.predict(test_input).squeeze().reshape(-1, 1).squeeze()
    # test_data['Target'] = y_pred
    # for _, row in test_df.iterrows():
    #     try: sample_prediction_df.loc[sample_prediction_df['row_id'] == row['row_id'], 'Target'] = test_data.loc[test_data['row_id'] == row['row_id'], 'Target'].item()
    #     except: sample_prediction_df.loc[sample_prediction_df['row_id'] == row['row_id'], 'Target'] = 0
    # env.predict(sample_prediction_df)

## fake score

In [None]:
# Reload the supplemental training rows; their Target column feeds the fake score.
data_path = './data'
sup_train = pd.read_csv(f'{data_path}/supplemental_train.csv')  # fake score source

In [None]:

for i in range(4):
    print(i)
    # One minute of test data plus its submission template.
    test_csv = f'./data/temp/{i}test_df.csv'
    sample_prediction_df_csv = f'./data/temp/{i}sample_prediction_df.csv'
    test_df = pd.read_csv(test_csv, usecols=range(1, 11))
    sample_prediction_df = pd.read_csv(sample_prediction_df_csv, usecols=range(1, 3))
    # Stand-in predictions: the i-th block of N_ASSETS targets from the
    # supplemental data, taken by position instead of a hard-coded 14.
    pred = sup_train['Target'].iloc[i * N_ASSETS:(i + 1) * N_ASSETS].tolist()
    # Three of the stand-in values are zeroed out.
    pred[3:6] = [0] * 3
    test_df['Target'] = [round(p, 5) for p in pred]
    # Copy each target into the submission row with the same row_id. The row's
    # own Target is used directly; the old code looked it up again in test_df.
    for row_id, target in zip(test_df['row_id'], test_df['Target']):
        sample_prediction_df.loc[sample_prediction_df['row_id'] == row_id, 'Target'] = target