# Setup

### Import relevant packages

In [1]:
# general packages
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import os

In [2]:
# packages for importing the config variables from the YAML file
import yaml

In [3]:
# packages for playing audio file
from IPython.display import Audio

### Configure max rows and max columns

In [4]:
# Let pandas display up to 100 columns and 100 rows before truncating the output
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 100)

### Load in the '.py' file containing the functions used in this notebook

In [5]:
# Load IPython's autoreload extension so edits to the helper module are picked up
# without restarting the kernel
%load_ext autoreload
# Mode 1 is paired with the %aimport registration on the next line
# (see the IPython autoreload documentation for the exact semantics of each mode)
%autoreload 1
%aimport rf_vif_and_if_feat_select

# The helper functions are accessed through the short alias `fs` in the cells below
import rf_vif_and_if_feat_select as fs

# ---------------------------------------------------------------------------------------------------------------

# Read in the variables from the YAML config file

In [6]:
# Read the YAML configuration file into a dictionary of settings.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
# NOTE(review): FullLoader can build more than plain YAML types; if the config only
# holds scalars, lists and dicts, yaml.safe_load would be the stricter choice —
# confirm the config uses no Python-specific tags before switching.
with open("../../Config_files/config.yaml", "r", encoding="utf-8") as variables:
    config_variables = yaml.load(variables, Loader=yaml.FullLoader)

#### Read in the data directory filepath

In [7]:
# Base folder for the data files: the processed CSV is read from it and the
# feature-selection CSVs are written beneath it
data_directory = config_variables["data_directory"]

#### Read in the path to the audio file

In [8]:
# Audio clip handed to IPython's Audio display once the feature-selection loop has finished
audio_file_path = config_variables["audio_file_path"]

#### Read in the list of analysis types

In [9]:
# Analysis types to loop over; the loop below applies the regression threshold to 'reg'
# and the classification threshold to every other value
types_of_analysis = config_variables["types_of_analysis"]

#### Read in the data shift periods

In [10]:
# Shift periods (in days, going by the config key); features are selected separately for each one
shift_periods_in_days_list = config_variables["shift_periods_in_days_list"]

#### Read the time intervals

In [11]:
# One (start date, end date) pair per training interval; each pair is unpacked in the loop below
train_intervals_list = config_variables["train_intervals_list"]

#### Read in the random seed

In [12]:
# Seed forwarded to the feature-importance step of the feature selection
random_seed = config_variables["random_seed"]

# ---------------------------------------------------------------------------------------------------------------

# Read in the processed data

In [13]:
# Load the processed dataset from the data directory.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks like a row index
# that was saved with the CSV — confirm it is excluded before feature selection
# (or read the file with index_col=0 once the helper module is known to cope with that)
processed_data = pd.read_csv(os.path.join(data_directory, 'processed_data.csv'))

In [14]:
# Sanity check: (number of rows, number of columns) of the loaded data
processed_data.shape

(4470, 1214)

In [15]:
# Inspect the first rows of the loaded data
processed_data.head()

Unnamed: 0,date,trading_volume,rank,trades_per_minute,volatility,bid_and_ask_spread,bid_and_ask_sum_asks,bid_and_ask_sum_bids,time_between_blocks,block_size_votes,marketcap3sma,marketcap7sma,marketcap14sma,marketcap30sma,marketcap90sma,marketcap3ema,marketcap7ema,marketcap14ema,marketcap30ema,marketcap90ema,marketcap3wma,marketcap7wma,marketcap14wma,marketcap30wma,marketcap90wma,marketcap3trx,marketcap7trx,marketcap14trx,marketcap30trx,marketcap90trx,marketcap3mom,marketcap7mom,marketcap14mom,marketcap30mom,marketcap90mom,marketcap3std,marketcap7std,marketcap14std,marketcap30std,marketcap90std,marketcap3var,marketcap7var,marketcap14var,marketcap30var,marketcap90var,marketcap3rsi,marketcap7rsi,marketcap14rsi,marketcap30rsi,marketcap90rsi,...,eur_jpy_exchange_open,eur_jpy_exchange_high,eur_jpy_exchange_low,eur_jpy_exchange_close,gbp_jyp_exchange_open,gbp_jyp_exchange_high,gbp_jyp_exchange_low,gbp_jyp_exchange_close,usd_gbp_exchange_open,usd_gbp_exchange_high,usd_gbp_exchange_low,usd_gbp_exchange_close,usd_eur_exchange_open,usd_eur_exchange_high,usd_eur_exchange_low,usd_eur_exchange_close,usd_cad_exchange_open,usd_cad_exchange_high,usd_cad_exchange_low,usd_cad_exchange_close,usd_aud_exchange_open,usd_aud_exchange_high,usd_aud_exchange_low,usd_aud_exchange_close,usd_mxn_exchange_open,usd_mxn_exchange_high,usd_mxn_exchange_low,usd_mxn_exchange_close,usd_hkd_exchange_open,usd_hkd_exchange_high,usd_hkd_exchange_low,usd_hkd_exchange_close,usd_jpy_exchange_open,usd_jpy_exchange_high,usd_jpy_exchange_low,usd_jpy_exchange_close,shifted_0,binary_price_change_0,shifted_1,binary_price_change_1,shifted_7,binary_price_change_7,shifted_14,binary_price_change_14,shifted_30,binary_price_change_30,shifted_60,binary_price_change_60,shifted_90,binary_price_change_90
0,2009-01-03,14378.975391,1188.702997,0.0125,787.825957,0.31782,76087140.0,68660470.0,8.141667,20.0,1037046000000.0,1065113000000.0,1067361000000.0,989044500000.0,797089900000.0,1037697000000.0,1050868000000.0,1044357000000.0,998452900000.0,813375600000.0,1032082000000.0,1052522000000.0,1062149000000.0,1027730000000.0,896280900000.0,-0.897,-0.064,-0.19,-0.075,0.295,-39094220000.0,-16877040000.0,-2127435000.0,10353220000.0,597221000000.0,33440580000.0,55524290000.0,58527540000.0,166743700000.0,377287900000.0,2.795682e+20,7.707367e+20,8.563682e+20,6.950866e+21,3.558654e+22,29.393,44.274,53.277,58.277,61.742,...,113.379997,122.25,115.949997,126.349998,130.100006,146.966003,119.239998,128.350006,0.6328,0.6241,0.7622,0.61812,0.7485,0.7585,0.891266,0.69575,1.0192,0.9815,0.9799,0.9926,1.0778,1.0805,1.0997,1.0778,13.0,13.0,11.591,13.0,7.7499,7.75,7.7491,7.7498,76.779999,77.129997,101.720001,76.860001,52184.0,1,52184.0,1,52184.0,1,52184.0,1,52184.0,1,52184.0,1,52184.0,1
1,2009-01-04,14378.975391,1188.702997,0.0125,787.825957,0.31782,76087140.0,68660470.0,8.141667,20.0,1037046000000.0,1065113000000.0,1067361000000.0,989044500000.0,797089900000.0,1037697000000.0,1050868000000.0,1044357000000.0,998452900000.0,813375600000.0,1032082000000.0,1052522000000.0,1062149000000.0,1027730000000.0,896280900000.0,-0.897,-0.064,-0.19,-0.075,0.295,-39094220000.0,-16877040000.0,-2127435000.0,10353220000.0,597221000000.0,33440580000.0,55524290000.0,58527540000.0,166743700000.0,377287900000.0,2.795682e+20,7.707367e+20,8.563682e+20,6.950866e+21,3.558654e+22,29.393,44.274,53.277,58.277,61.742,...,113.379997,122.25,115.949997,126.349998,130.100006,146.966003,119.239998,128.350006,0.6328,0.6241,0.7622,0.61812,0.7485,0.7585,0.891266,0.69575,1.0192,0.9815,0.9799,0.9926,1.0778,1.0805,1.0997,1.0778,13.0,13.0,11.591,13.0,7.7499,7.75,7.7491,7.7498,76.779999,77.129997,101.720001,76.860001,52184.0,1,52184.0,1,52184.0,1,52184.0,1,52184.0,1,52184.0,1,52184.0,1
2,2009-01-05,14378.975391,1188.702997,0.0125,787.825957,0.31782,76087140.0,68660470.0,8.141667,20.0,1037046000000.0,1065113000000.0,1067361000000.0,989044500000.0,797089900000.0,1037697000000.0,1050868000000.0,1044357000000.0,998452900000.0,813375600000.0,1032082000000.0,1052522000000.0,1062149000000.0,1027730000000.0,896280900000.0,-0.897,-0.064,-0.19,-0.075,0.295,-39094220000.0,-16877040000.0,-2127435000.0,10353220000.0,597221000000.0,33440580000.0,55524290000.0,58527540000.0,166743700000.0,377287900000.0,2.795682e+20,7.707367e+20,8.563682e+20,6.950866e+21,3.558654e+22,29.393,44.274,53.277,58.277,61.742,...,128.460007,128.460007,126.417999,127.110001,133.639999,137.529999,132.919998,137.139999,0.68874,0.69252,0.67849,0.67953,0.71644,0.73768,0.71644,0.73282,1.2081,1.2213,1.1866,1.188,1.3974,1.4151,1.39,1.4,13.7726,13.7742,13.42,13.4275,7.7499,7.7557,7.743,7.7509,92.050003,93.529999,91.75,93.217003,52184.0,1,52184.0,1,52184.0,1,52184.0,1,52184.0,1,52184.0,1,52184.0,1
3,2009-01-06,14378.975391,1188.702997,0.0125,787.825957,0.31782,76087140.0,68660470.0,8.141667,20.0,1037046000000.0,1065113000000.0,1067361000000.0,989044500000.0,797089900000.0,1037697000000.0,1050868000000.0,1044357000000.0,998452900000.0,813375600000.0,1032082000000.0,1052522000000.0,1062149000000.0,1027730000000.0,896280900000.0,-0.897,-0.064,-0.19,-0.075,0.295,-39094220000.0,-16877040000.0,-2127435000.0,10353220000.0,597221000000.0,33440580000.0,55524290000.0,58527540000.0,166743700000.0,377287900000.0,2.795682e+20,7.707367e+20,8.563682e+20,6.950866e+21,3.558654e+22,29.393,44.274,53.277,58.277,61.742,...,127.099998,127.120003,125.191002,126.058998,137.082001,140.75,135.539993,139.190994,0.67981,0.68896,0.66712,0.6706,0.73358,0.75086,0.73358,0.74019,1.1876,1.1988,1.176,1.1841,1.4006,1.4205,1.3767,1.3882,13.4406,13.4979,13.269,13.355,7.7514,7.7547,7.7433,7.7526,93.220001,94.566002,92.870003,93.379997,52184.0,1,52184.0,1,52184.0,1,52184.0,1,52184.0,1,52184.0,1,52184.0,1
4,2009-01-07,14378.975391,1188.702997,0.0125,787.825957,0.31782,76087140.0,68660470.0,8.141667,20.0,1037046000000.0,1065113000000.0,1067361000000.0,989044500000.0,797089900000.0,1037697000000.0,1050868000000.0,1044357000000.0,998452900000.0,813375600000.0,1032082000000.0,1052522000000.0,1062149000000.0,1027730000000.0,896280900000.0,-0.897,-0.064,-0.19,-0.075,0.295,-39094220000.0,-16877040000.0,-2127435000.0,10353220000.0,597221000000.0,33440580000.0,55524290000.0,58527540000.0,166743700000.0,377287900000.0,2.795682e+20,7.707367e+20,8.563682e+20,6.950866e+21,3.558654e+22,29.393,44.274,53.277,58.277,61.742,...,126.029999,127.559998,125.899002,126.18,139.194,141.380005,138.009995,139.718994,0.67056,0.67478,0.65552,0.66317,0.74077,0.7442,0.7277,0.7347,1.1841,1.1989,1.1776,1.1962,1.3889,1.4087,1.3774,1.4079,13.349,13.4855,13.318,13.435,7.7527,7.7582,7.7418,7.7506,93.365997,94.059998,92.419998,92.68,52184.0,1,52184.0,1,52184.0,1,52184.0,1,52184.0,1,52184.0,1,52184.0,1


In [16]:
# Inspect the last rows of the loaded data
processed_data.tail()

Unnamed: 0,date,trading_volume,rank,trades_per_minute,volatility,bid_and_ask_spread,bid_and_ask_sum_asks,bid_and_ask_sum_bids,time_between_blocks,block_size_votes,marketcap3sma,marketcap7sma,marketcap14sma,marketcap30sma,marketcap90sma,marketcap3ema,marketcap7ema,marketcap14ema,marketcap30ema,marketcap90ema,marketcap3wma,marketcap7wma,marketcap14wma,marketcap30wma,marketcap90wma,marketcap3trx,marketcap7trx,marketcap14trx,marketcap30trx,marketcap90trx,marketcap3mom,marketcap7mom,marketcap14mom,marketcap30mom,marketcap90mom,marketcap3std,marketcap7std,marketcap14std,marketcap30std,marketcap90std,marketcap3var,marketcap7var,marketcap14var,marketcap30var,marketcap90var,marketcap3rsi,marketcap7rsi,marketcap14rsi,marketcap30rsi,marketcap90rsi,...,eur_jpy_exchange_open,eur_jpy_exchange_high,eur_jpy_exchange_low,eur_jpy_exchange_close,gbp_jyp_exchange_open,gbp_jyp_exchange_high,gbp_jyp_exchange_low,gbp_jyp_exchange_close,usd_gbp_exchange_open,usd_gbp_exchange_high,usd_gbp_exchange_low,usd_gbp_exchange_close,usd_eur_exchange_open,usd_eur_exchange_high,usd_eur_exchange_low,usd_eur_exchange_close,usd_cad_exchange_open,usd_cad_exchange_high,usd_cad_exchange_low,usd_cad_exchange_close,usd_aud_exchange_open,usd_aud_exchange_high,usd_aud_exchange_low,usd_aud_exchange_close,usd_mxn_exchange_open,usd_mxn_exchange_high,usd_mxn_exchange_low,usd_mxn_exchange_close,usd_hkd_exchange_open,usd_hkd_exchange_high,usd_hkd_exchange_low,usd_hkd_exchange_close,usd_jpy_exchange_open,usd_jpy_exchange_high,usd_jpy_exchange_low,usd_jpy_exchange_close,shifted_0,binary_price_change_0,shifted_1,binary_price_change_1,shifted_7,binary_price_change_7,shifted_14,binary_price_change_14,shifted_30,binary_price_change_30,shifted_60,binary_price_change_60,shifted_90,binary_price_change_90
4465,2021-03-26,14378.975391,1188.702997,167.461944,787.825957,0.31782,76087140.0,68660470.0,8.141667,20.0,1037046000000.0,1065113000000.0,1067361000000.0,989044500000.0,797089900000.0,1037697000000.0,1050868000000.0,1044357000000.0,998452900000.0,813375600000.0,1032082000000.0,1052522000000.0,1062149000000.0,1027730000000.0,896280900000.0,-0.897,0.308,0.699,0.877,0.948,-39094220000.0,-16877040000.0,-2127435000.0,10353220000.0,597221000000.0,33440580000.0,55524290000.0,58527540000.0,166743700000.0,377287900000.0,2.795682e+20,7.707367e+20,8.563682e+20,6.950866e+21,3.558654e+22,29.393,44.274,53.277,58.277,61.742,...,128.563995,129.445007,128.529007,128.570999,150.085007,151.526001,150.035995,150.089996,0.72739,0.72762,0.72412,0.72732,0.84918,0.84943,0.84717,0.84914,1.26031,1.26046,1.25611,1.26018,1.316656,1.3178,1.309415,1.316829,20.6721,20.727301,20.532801,20.683029,7.76852,7.7699,7.76822,7.76838,109.181999,109.834,109.139,109.179001,52184.0,1,52184.0,1,1.0,0,1.0,0,1.0,0,1.0,0,1.0,0
4466,2021-03-27,14378.975391,1188.702997,167.461944,787.825957,0.31782,76087140.0,68660470.0,8.141667,20.0,1037046000000.0,1065113000000.0,1067361000000.0,989044500000.0,797089900000.0,1037697000000.0,1050868000000.0,1044357000000.0,998452900000.0,813375600000.0,1032082000000.0,1052522000000.0,1062149000000.0,1027730000000.0,896280900000.0,-0.897,0.308,0.699,0.877,0.948,-39094220000.0,-16877040000.0,-2127435000.0,10353220000.0,597221000000.0,33440580000.0,55524290000.0,58527540000.0,166743700000.0,377287900000.0,2.795682e+20,7.707367e+20,8.563682e+20,6.950866e+21,3.558654e+22,29.393,44.274,53.277,58.277,61.742,...,128.83933,129.426671,128.630335,128.851334,150.500005,151.615402,150.213999,150.525996,0.72666,0.727377,0.72348,0.726583,0.848903,0.84969,0.84742,0.848867,1.259687,1.26112,1.256673,1.259593,1.314185,1.316208,1.308377,1.314336,20.658696,20.746587,20.5487,20.666196,7.768647,7.771633,7.768443,7.768537,109.380666,109.815666,109.220668,109.383001,52184.0,1,52184.0,1,1.0,0,1.0,0,1.0,0,1.0,0,1.0,0
4467,2021-03-28,14378.975391,1188.702997,167.461944,787.825957,0.31782,76087140.0,68660470.0,8.141667,20.0,1037046000000.0,1065113000000.0,1067361000000.0,989044500000.0,797089900000.0,1037697000000.0,1050868000000.0,1044357000000.0,998452900000.0,813375600000.0,1032082000000.0,1052522000000.0,1062149000000.0,1027730000000.0,896280900000.0,-0.897,0.308,0.699,0.877,0.948,-39094220000.0,-16877040000.0,-2127435000.0,10353220000.0,597221000000.0,33440580000.0,55524290000.0,58527540000.0,166743700000.0,377287900000.0,2.795682e+20,7.707367e+20,8.563682e+20,6.950866e+21,3.558654e+22,29.393,44.274,53.277,58.277,61.742,...,129.114665,129.408335,128.731664,129.131668,150.915003,151.704803,150.392003,150.961995,0.72593,0.727133,0.72284,0.725847,0.848626,0.84995,0.84767,0.848593,1.259063,1.26178,1.257237,1.259007,1.311714,1.314617,1.307338,1.311843,20.645293,20.765874,20.5646,20.649363,7.768773,7.773367,7.768667,7.768693,109.579333,109.797333,109.302335,109.587001,52184.0,1,52184.0,1,1.0,0,1.0,0,1.0,0,1.0,0,1.0,0
4468,2021-03-29,14378.975391,1188.702997,167.461944,787.825957,0.31782,76087140.0,68660470.0,8.141667,20.0,1037046000000.0,1065113000000.0,1067361000000.0,989044500000.0,797089900000.0,1037697000000.0,1050868000000.0,1044357000000.0,998452900000.0,813375600000.0,1032082000000.0,1052522000000.0,1062149000000.0,1027730000000.0,896280900000.0,-0.897,0.308,0.699,0.877,0.948,-39094220000.0,-16877040000.0,-2127435000.0,10353220000.0,597221000000.0,33440580000.0,55524290000.0,58527540000.0,166743700000.0,377287900000.0,2.795682e+20,7.707367e+20,8.563682e+20,6.950866e+21,3.558654e+22,29.393,44.274,53.277,58.277,61.742,...,129.389999,129.389999,128.832993,129.412003,151.330002,151.794205,150.570007,151.397995,0.7252,0.72689,0.7222,0.72511,0.848349,0.85021,0.84792,0.84832,1.25844,1.26244,1.2578,1.25842,1.309243,1.313025,1.3063,1.30935,20.631889,20.78516,20.5805,20.63253,7.7689,7.7751,7.76889,7.76885,109.778,109.778999,109.384003,109.791,52184.0,1,52184.0,1,1.0,0,1.0,0,1.0,0,1.0,0,1.0,0
4469,2021-03-30,14378.975391,1188.702997,167.461944,787.825957,0.31782,76087140.0,68660470.0,8.141667,20.0,1037046000000.0,1065113000000.0,1067361000000.0,989044500000.0,797089900000.0,1037697000000.0,1050868000000.0,1044357000000.0,998452900000.0,813375600000.0,1032082000000.0,1052522000000.0,1062149000000.0,1027730000000.0,896280900000.0,-0.897,0.308,0.699,0.877,0.948,-39094220000.0,-16877040000.0,-2127435000.0,10353220000.0,597221000000.0,33440580000.0,55524290000.0,58527540000.0,166743700000.0,377287900000.0,2.795682e+20,7.707367e+20,8.563682e+20,6.950866e+21,3.558654e+22,29.393,44.274,53.277,58.277,61.742,...,129.199005,129.548004,129.143997,129.248993,151.117996,151.942993,151.029999,151.298004,0.72658,0.72944,0.72548,0.72891,0.8495,0.8536,0.8489,0.853,1.2589,1.26474,1.2576,1.26346,1.3096,1.3175,1.3045,1.31725,20.604349,20.706421,20.541,20.61586,7.7726,7.7763,7.77193,7.77466,109.814003,110.426003,109.730003,110.278,52184.0,1,1.0,0,1.0,0,1.0,0,1.0,0,1.0,0,1.0,0


# ---------------------------------------------------------------------------------------------------------------

# Iterate through the intervals, # features and shift periods & select the most important features from the data

In [17]:
# Flags forwarded to the helper functions in the feature-selection loop
plot_data = True  # passed to the feature-importance subsetting step
print_vif_outputs = True  # passed to the random forest + VIF step
print_outliers = True  # passed to the outlier-removal step
# Share of the total feature importance to retain
# (presumably a fraction between 0 and 1 — confirm against the helper module)
reg_perc_to_retain = 0.71  # used when the analysis type is 'reg'
clas_perc_to_retain = 0.77  # used for every other analysis type

In [20]:
# Run the feature selection for every analysis type, training interval and shift
# period, writing one CSV of selected features per combination.

# Make sure the output folder exists up front, so a missing directory cannot make
# `to_csv` fail only after the expensive feature selection has already been run
feat_select_directory = os.path.join(data_directory, 'Feat_select')
os.makedirs(feat_select_directory, exist_ok=True)

for reg_or_clas in tqdm(types_of_analysis):
    print("==========\n  ", reg_or_clas, "  \n==========")

    # set the percentage of feature importance to retain
    # (any analysis type other than 'reg' uses the classification threshold)
    percent_importance_to_retain = reg_perc_to_retain if reg_or_clas == 'reg' else clas_perc_to_retain

    # intervals are numbered from 1 in the printed log and in the output file names
    for interval_num, (train_start_date, train_end_date) in enumerate(train_intervals_list, start=1):
        print("-------------")
        print("Interval:", interval_num)

        # define the train data for this date interval
        train_df = fs.get_subset_of_data(processed_data, train_start_date, train_end_date)
        # prints a summary of the date range covered by the subset
        fs.get_date_range_of_data(train_df)

        # iterate over the shift periods and select the features based on this period
        for shift_period in shift_periods_in_days_list:

            # get a dataframe of the data's features and the name of the target column
            train_features, train_cols_to_drop, train_colname = fs.get_features(train_df, shift_period, reg_or_clas)

            # get a flat numpy array of the feature we are trying to predict
            y = np.ravel(train_features[train_colname])

            # use Random Forest model and Variance Inflation Factor (VIF) to get the most important columns
            dropped_merged_df = fs.use_rand_forests_and_vif_to_subset_features(reg_or_clas, train_features, y, config_variables, train_colname, print_vif_outputs)

            # make sure the raw price survives the VIF-based subsetting
            if 'price' not in dropped_merged_df.columns:
                dropped_merged_df['price'] = train_features['price']

            # Use random forest to score feature importance - then only keep specified % importance
            selected_features_df = fs.further_subset_features_using_feat_importance(reg_or_clas, dropped_merged_df, y, percent_importance_to_retain, plot_data, random_seed)

            # add the target back into the data under its name without the trailing
            # shift-period token (e.g. 'shifted_7' -> 'shifted')
            selected_features_df["_".join(train_colname.split('_')[:-1])] = y

            # Remove the outliers in the data using an Isolation forest
            final_df = fs.remove_outliers(selected_features_df, y, reg_or_clas, print_outliers)

            # Output the data to a CSV
            train_path = os.path.join(feat_select_directory, '{}_rf_vif_and_if_interval_{}_shift_{}.csv'.format(reg_or_clas, interval_num, shift_period))
            final_df.to_csv(train_path, index=False)

# play an audio file to tell me that this feature selection has finished
Audio(audio_file_path, rate=5, autoplay=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

   clas   
-------------
Interval: 1
 - This data conatins 2160 days from '2015-02-01' to '2020-12-31'


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9.0), HTML(value='')))


1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 
---Indicators---
Dropped:
 ['confirmationtime14ema', 'confirmationtime3ema', 'confirmationtime30wma', 'size90rsi', 'confirmationtime7ema', 'confirmationtime3wma', 'confirmationtime90sma', 'tweets14rsi', 'size14sma', 'confirmationtime30sma', 'tweets30ema', 'hashrate30rsi', 'sentbyaddress30std', 'size7wma', 'tweets90ema', 'confirmationtime3sma', 'activeaddresses90wma', 'transactions14std', 'size3std', 'transactionvalue7std', 'size3mom', 'transactions90std', 'size90mom', 'sentbyaddress30var', 'tweets7rsi', 'top100cap30roc', 'tweets7sma', 'sentbyaddress14var']
Remaining variables:
 ['confirmationtime7mom', 'google_trends7trx', 'hashrate3trx', 'marketcap3rsi', 'median_transaction_fee90trx', 'size3roc', 'size3var', 'size90roc', 'size90var', 'top100cap14roc', 'transactionvalue14trx', 'transactionvalue14wma', 'transactionvalue30trx', 'transactionvalue7var', 'tweets14mom', 'tweets30mom', 'tweets7roc']


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 


KeyboardInterrupt: 