In [1]:
import sys
sys.path.append("/home/cyprien/Documents/github/pytorch-forecasting")

import hashlib

from data_factory.preprocessing import *
from utilities.config import load_config

from tqdm import tqdm

# Configure logging so that DEBUG records from this notebook are emitted.
# NOTE(review): `logging` and `DEBUG` are not imported explicitly in this cell; they
# presumably arrive through the star import from data_factory.preprocessing —
# confirm, or import them directly so the cell does not depend on that module's namespace.
logging.basicConfig(level=DEBUG)
logger = logging.getLogger(__name__)
logger.setLevel(DEBUG)

In [2]:
# Notebook-level settings.
# config_file: YAML file handed to load_config() in the next cell.
# use_previous_files / export: boolean switches, both off.
# NOTE(review): neither switch is read anywhere in the visible cells — confirm they are still needed.
use_previous_files, export = False, False
config_file = '../config/config.yml'

In [3]:
# Load the YAML configuration and unpack the values used in this notebook.
config = load_config(config_file)

# The 'model' entry names the sub-section holding the active model's settings.
model = config['model']
model_config = config[model]

# Create variables from config
#  data loading
save_folder = config['data']['save']
train_file = config['data']['train_path'] + config['data']['stock_prices']
test_file = config['data']['test_path'] + config['data']['stock_prices']
#  TimeSeries settings
sliding_window = model_config['sliding_window']
max_prediction_length = sliding_window['max_prediction_length']
min_prediction_length = sliding_window['min_prediction_length']
max_encoder_length = sliding_window['max_encoder_length']
min_encoder_length = sliding_window['min_encoder_length']
batch_size = sliding_window['batch_size']

related_stocks = model_config['related_stock']
train_val_split = model_config['train_val_split']
scale = model_config['manual_scale']

# Define the file name for saving StockPricesLoader with this specific config.
# The name embeds an md5 of the model section's string form, so a different
# model configuration maps to a different export file (fingerprint only, not security).
hash_ = hashlib.md5(str(model_config).encode('utf-8')).hexdigest()
# The configured folder may already end with '/', which previously produced
# '.../save//export_<hash>.p'; strip trailing slashes before joining.
export_file_name = f"{save_folder.rstrip('/')}/export_{hash_}.p"
logger.debug('Export file %s', export_file_name)

logger.debug('Use config %s', config)


DEBUG:__main__:Export file ../data/save//export_c812f38e2002410441e38cf0c23469e2.p
DEBUG:__main__:Use config {'device': 'cpu', 'seed': False, 'model': 'temporal_fusion_transformer', 'data': {'save': '../data/save/', 'suppl': '../data/suppl/', 'train_path': '../data/jpx-tokyo-stock-exchange-prediction/train_files/', 'test_path': '../data/jpx-tokyo-stock-exchange-prediction/supplemental_files/', 'financials': 'financials.csv', 'stock_prices': 'stock_prices.csv', 'options': 'options.csv', 'secondary_stock_price': 'secondary_stock_price.csv', 'trades': 'trades.csv', 'cosine': 'cosine_df.csv'}, 'rnn': {'sliding_window': {'max_prediction_length': 10, 'min_prediction_length': 10, 'max_encoder_length': 150, 'min_encoder_length': 150, 'batch_size': 64}, 'train_val_split': 1, 'related_stock': 2, 'manual_scale': True, 'hidden_size': 20, 'layers': 3, 'dropout': 0}, 'temporal_fusion_transformer': {'sliding_window': {'max_prediction_length': 10, 'min_prediction_length': 10, 'max_encoder_length': 150

### Load

In [4]:
from data_factory.prepared_data import PreparedData

# Read the preprocessed dataset back from disk and expose its three splits
# under the names used by the rest of the notebook.
data: PreparedData = PreparedData.from_file("../data/save/preprocessed_data.pkl")
df_train, df_val, df_test = data.train, data.val, data.test

## Data Augmentation

#### Add related stocks

In [11]:
# Number of related stocks to attach to every row (read by add_stocks below).
n = related_stocks

# Similarity matrix: the first (unnamed) CSV column holds the ticker and becomes
# the index; every remaining column is one ticker.
cosine = (
    pd.read_csv(config['data']['suppl'] + config['data']['cosine'], low_memory=False)
    .rename(columns={'Unnamed: 0': 'ticker'})
    .set_index('ticker')
)

# For every ticker column keep the index labels of its n largest similarities.
# After the transpose, rows are tickers and columns '0'..'n-1' are the ranks;
# everything is cast to str so labels compare equal to the CSV header strings.
# NOTE(review): if the matrix has its maximum on the diagonal, rank 0 is the
# ticker itself — confirm whether self-matches are intended.
top = cosine.apply(lambda s: pd.Series(s.nlargest(n).index)).T.astype(str).rename(columns=str)

# Tickers absent from the similarity matrix fall back to themselves for every rank.
# Collect them from ALL splits: add_stocks is applied to train, val and test, and a
# code that only occurs in val/test would otherwise have no row in `top`.
known_codes = set().union(
    *(frame.SecuritiesCode.unique() for frame in (df_train, df_val, df_test))
)
missing_keys = list(known_codes - set(cosine.columns))
logger.info(f"Len missing Securities Code in {config['data']['cosine']}: {len(missing_keys)}")

if missing_keys:
    top = pd.concat([top, pd.DataFrame({str(i): missing_keys for i in range(n)}, index=missing_keys)])

def add_stocks(df: pd.DataFrame, top_stocks=None, n_related=None) -> pd.DataFrame:
    """Attach the prices of each row's most related stocks as extra columns.

    For every rank ``i`` in ``range(n_related)`` the result gains:

    * ``top_i`` - the related ticker looked up in ``top_stocks`` for the row's
      ``SecuritiesCode``;
    * ``Close_top_i`` and ``Close_scaled_top_i`` - that ticker's ``Close`` and
      ``Close_scaled`` taken from ``df`` itself at the same ``Timestamp``.

    Args:
        df: Frame with ``SecuritiesCode``, ``Timestamp``, ``Close`` and
            ``Close_scaled`` columns. It is left unmodified.
        top_stocks: Lookup table indexed by ticker with one column per rank,
            named ``'0'``, ``'1'``, ... Defaults to the notebook-level ``top``.
        n_related: Number of ranks to attach. Defaults to the notebook-level ``n``.

    Returns:
        A new frame with the added columns. Every NaN left in the frame
        (in any column, not only the added ones) is replaced by 0.

    Raises:
        KeyError: If a ``SecuritiesCode`` in ``df`` has no row in ``top_stocks``.
    """
    if top_stocks is None:
        top_stocks = top
    if n_related is None:
        n_related = n

    # Work on a copy: assigning the first 'top_i' column would otherwise write
    # into the caller's frame.
    df = df.copy()

    for t, col in tqdm([(f'top_{i}', str(i)) for i in range(n_related)]):
        # Vectorised lookup of the rank-`col` related ticker for every row.
        related = df['SecuritiesCode'].map(top_stocks[col])
        unknown = related.isna()
        if unknown.any():
            # Keep the failure mode of a direct .loc lookup instead of silently
            # turning unknown tickers into zeros further down.
            raise KeyError(
                f"SecuritiesCode not found in related-stock table: "
                f"{df.loc[unknown, 'SecuritiesCode'].unique().tolist()}"
            )
        df[t] = related

        # Self-join: fetch the related ticker's prices at the same timestamp.
        # The right-hand key column comes back as 'SecuritiesCode_<t>' and only
        # duplicates `t`, so it is dropped.
        df = df.merge(
            df.loc[:, ['SecuritiesCode', 'Timestamp', 'Close', 'Close_scaled']],
            how='left', left_on=[t, 'Timestamp'], right_on=['SecuritiesCode', 'Timestamp'],
            suffixes=('', f'_{t}')
        ).drop(columns=f'SecuritiesCode_{t}')

    # Rows whose related ticker has no quote at that timestamp come out of the
    # left join as NaN; they (and any other NaN in the frame) become 0.
    return df.fillna(value=0)

# Augment the three splits with the related-stock columns
# (processed in the order train, test, val).
df_train, df_test, df_val = [
    add_stocks(split) for split in (df_train, df_test, df_val)
]

INFO:__main__:Len missing Securities Code in cosine_df.csv: 0
100%|██████████| 5/5 [01:46<00:00, 21.32s/it]
100%|██████████| 5/5 [00:15<00:00,  3.15s/it]
100%|██████████| 5/5 [00:15<00:00,  3.09s/it]


### Export

In [12]:
# Bundle the augmented splits together with the scalers carried over from the
# PreparedData object loaded earlier. This rebinds `data`, so from here on
# `data.train` / `data.val` / `data.test` are the augmented frames.
data = PreparedData(df_train, df_val, df_test, data.scalers)

In [13]:
# Persist the augmented dataset.
# NOTE(review): the `export` flag defined in the settings cell is not consulted
# here, so this runs unconditionally — confirm that is intended.
data.export('../data/save/augmented_data.pkl')