In [1]:
from FeatureEngineeringUtils.btc_feature_engineering_utils import preprocess_m1_df, BtcPreprocessor, TargetExtractor, DataMixer, FeatureEngineer, FeatureExtractor, Exogenous, Target
from FeatureEngineeringUtils.exo_feature_engineering_utils import ExoPreprocessor, recreate_the_h4_candles_from_hourly, recreate_the_spx_h1_candles_from_quarterly, remove_the_volume_column, make_sure_no_row_has_volume_equal_to_zero

# Part1) Exogenous data preprocessing

## A) Recreation of higher timeframes from lower ones:

In [2]:
# Calls the project helper with its default arguments. Judging by its name it rebuilds
# the SPX hourly (H1) candles from 15-minute data — TODO confirm in exo_feature_engineering_utils.
# Keep this cell ahead of the H4 reconstruction loop that follows: that loop asks for
# reconstructed data for SPX only.
recreate_the_spx_h1_candles_from_quarterly()

In [3]:
# Rebuild the 4-hour candles from hourly data for each traditional-market symbol.
# Only SPX is read from the reconstructed files; the other symbols use their regular hourly data.
for name in ('DXY', 'GOLD', 'SPX', 'UKOIL'):
    from_reconstructed_data = (name == 'SPX')
    recreate_the_h4_candles_from_hourly(
        name,
        first_day='2018-11-01 00:00:00',
        from_reconstructed_data=from_reconstructed_data,
    )

## B) Drop the volume column from DXY and replace the zero volumes (volume == 0) in the others

In [4]:
names = ['BTCD', 'TOTAL', 'TOTAL2', 'TOTAL3', 'USDTD', 'DXY', 'GOLD', 'SPX', 'UKOIL']
timeframes = [240, 60, 15]
for name in names:
    for timeframe in timeframes:
        # A file counts as reconstructed when it is the 240-minute series of one of the four
        # traditional-market symbols, or the 60-minute SPX series.
        is_rebuilt_h4 = timeframe == 240 and name in ['DXY', 'GOLD', 'SPX', 'UKOIL']
        is_rebuilt_spx_h1 = timeframe == 60 and name == 'SPX'
        reconstructed = is_rebuilt_h4 or is_rebuilt_spx_h1

        # DXY loses its volume column; every other symbol goes through the zero-volume fix.
        volume_handler = (
            remove_the_volume_column
            if name == 'DXY'
            else make_sure_no_row_has_volume_equal_to_zero
        )
        volume_handler(name, timeframe, reconstructed)

volume column modified for Data//BTCD_240.csv
volume column modified for Data//BTCD_60.csv
volume column modified for Data//BTCD_15.csv
volume column modified for Data//TOTAL_240.csv
volume column modified for Data//TOTAL_60.csv
volume column modified for Data//TOTAL_15.csv
volume column modified for Data//TOTAL2_240.csv
volume column modified for Data//TOTAL2_60.csv
volume column modified for Data//TOTAL2_15.csv
volume column modified for Data//TOTAL3_240.csv
volume column modified for Data//TOTAL3_60.csv
volume column modified for Data//TOTAL3_15.csv
volume column modified for Data//USDTD_240.csv
volume column modified for Data//USDTD_60.csv
volume column modified for Data//USDTD_15.csv
volume column modified for RepairedExoData//GOLD_240.csv
volume column modified for Data//GOLD_60.csv
volume column modified for Data//GOLD_15.csv
volume column modified for RepairedExoData//SPX_240.csv
volume column modified for RepairedExoData//SPX_60.csv
volume column modified for Data//SPX_15.csv


## C) Calculation of the indicators and interpolation of the missing values:

In [5]:
names = ['BTCD', 'TOTAL', 'TOTAL2', 'TOTAL3', 'USDTD', 'DXY', 'GOLD', 'SPX', 'UKOIL']
timeframes = [240, 60, 15]
# First timestamp handed to the preprocessor, keyed by the timeframe written as a string.
first_days = {
    '240': '2019-01-01 00:00:00',
    '60': '2021-05-01 00:00:00',
    '15': '2021-09-20 00:00:00',
}
for name in names:
    for timeframe in timeframes:
        # Same "reconstructed" rule as the volume clean-up cell: the 240-minute series of
        # DXY/GOLD/SPX/UKOIL and the 60-minute series of SPX.
        is_rebuilt_h4 = timeframe == 240 and name in ['DXY', 'GOLD', 'SPX', 'UKOIL']
        is_rebuilt_spx_h1 = timeframe == 60 and name == 'SPX'
        from_reconstructed_data = is_rebuilt_h4 or is_rebuilt_spx_h1

        exo_preprocessor = ExoPreprocessor(
            name,
            timeframe,
            from_reconstructed_data=from_reconstructed_data,
            first_day=first_days.get(str(timeframe)),
        )
        exo_preprocessor.run()

# Part2) Endogenous data preprocessing and target definition

## A) Interpolation of the m1 missing values:

In [6]:
# Run the project's m1 preprocessing on the BTCUSDT file; the path is relative to the
# notebook's working directory.
# NOTE(review): whether the helper overwrites this CSV or writes a new file is not visible here — confirm.
preprocess_m1_df('Data//BTCUSDT_1.csv')

## B) Interpolation of the data and calculation of the indicators:

In [7]:
# One preprocessing pass per BTC timeframe; every pass is constructed with original_timeframe=240.
for timeframe in (240, 60, 15):
    btc_preprocessor = BtcPreprocessor(timeframe=timeframe, original_timeframe=240)
    btc_preprocessor.run()

## C) Defining the targets:

In [8]:
# One target-extraction pass per timeframe; every pass is constructed with original_timeframe=240.
for timeframe in (240, 60, 15):
    target_extractor = TargetExtractor(timeframe=timeframe, original_timeframe=240)
    target_extractor.run()

# Part3) Mixing the endogenous and the exogenous features and cropping the common parts

In [9]:
# Dataset name passed to DataMixer for each target -> exogenous mode -> timeframe (as a string).
names = {
    'target1': {
        'regular': {'240': 'mix_4h_for_target1', '60': 'mix_1h_for_target1', '15': 'mix_15m_for_target1'},
        'none': {'240': 'btc_4h_for_target1', '60': 'btc_1h_for_target1', '15': 'btc_15m_for_target1'},
    },
}
# first_day argument per exogenous mode and timeframe.
first_days = {
    'regular': {'240': '2019-01-01 00:00:00', '60': '2021-05-01 00:00:00', '15': '2021-09-20 00:00:00'},
    'none': {'240': '2017-08-28 16:00:00', '60': '2017-08-28 19:00:00', '15': '2017-08-28 19:45:00'},
}
# last_day argument per target.
last_days = {'target1': '2021-12-31 20:00:00'}

for target in ['target1']:
    last_day = last_days.get(target)
    for use_exo_data in ['regular', 'none']:
        # The mode labels are already strings, so they index the dicts directly.
        names_by_timeframe = names.get(target).get(use_exo_data)
        first_day_by_timeframe = first_days.get(use_exo_data)
        for timeframe in [240, 60, 15]:
            timeframe_key = str(timeframe)
            data_mixer = DataMixer(
                df_name=names_by_timeframe.get(timeframe_key),
                timeframe=timeframe,
                target=target,
                use_exogenous_data=use_exo_data,
                first_day=first_day_by_timeframe.get(timeframe_key),
                last_day=last_day,
            )
            data_mixer.run()

# Part4) Preparation of the data for train, test and validation

In [10]:
# Dataset name for each target -> exogenous mode -> timeframe (as a string).
df_names = {
    'target1': {
        'regular': {'240': 'mix_4h_for_target1', '60': 'mix_1h_for_target1', '15': 'mix_15m_for_target1'},
        'none': {'240': 'btc_4h_for_target1', '60': 'btc_1h_for_target1', '15': 'btc_15m_for_target1'},
    },
}
model_name_suffix = {'target1': '1_1'}

for target in ['target1']:
    print('target= ', target)
    suffix = model_name_suffix.get(target)
    for use_exo_data in ['regular', 'none']:
        for timeframe in [240, 60, 15]:
            df_name = df_names.get(target).get(use_exo_data).get(str(timeframe))
            # The model name is the dataset name minus its last character, plus the suffix
            # (e.g. 'mix_4h_for_target1' -> 'mix_4h_for_target1_1').
            engineer = FeatureEngineer(
                df_name=df_name,
                model_name=df_name[:-1] + suffix,
                target=target,
                maximum_allowed_correlation_between_features=0.95,
            )
            engineer.run()

            # Report the feature counts exposed by the engineer after the run.
            model_name = engineer.model_name
            initial_num_features = engineer.initial_number_of_features
            final_num_features = engineer.number_of_features
            print(f'for {model_name}: initial number of features: {initial_num_features}        final number of features : {final_num_features}')

target=  target1
for mix_4h_for_target1_1: initial number of features: 956        final number of features : 618
for mix_1h_for_target1_1: initial number of features: 956        final number of features : 594
for mix_15m_for_target1_1: initial number of features: 956        final number of features : 652
for btc_4h_for_target1_1: initial number of features: 98        final number of features : 74
for btc_1h_for_target1_1: initial number of features: 98        final number of features : 74
for btc_15m_for_target1_1: initial number of features: 98        final number of features : 74


In [11]:
# Dataset name for each target -> exogenous mode -> timeframe (as a string).
names = {
    'target1': {
        'regular': {'240': 'mix_4h_for_target1', '60': 'mix_1h_for_target1', '15': 'mix_15m_for_target1'},
        'none': {'240': 'btc_4h_for_target1', '60': 'btc_1h_for_target1', '15': 'btc_15m_for_target1'},
    },
}

model_name_suffix = {'target1': '1_1'}

for target in ['target1']:
    for use_exo_data in ['regular', 'none']:
        for timeframe in [240, 60, 15]:
            # Both the dataset name and the model name come from this cell's own `names`
            # mapping, so the cell does not rely on a variable left behind by another cell.
            # Plain indexing raises a KeyError naming the missing key instead of failing
            # later on a None value.
            df_name = names[target][use_exo_data][str(timeframe)]
            # Model name = dataset name minus its last character, plus the suffix
            # (e.g. 'btc_1h_for_target1' -> 'btc_1h_for_target1_1').
            model_name = df_name[:-1] + model_name_suffix[target]

            feature_extractor = FeatureExtractor(df_name=df_name)
            feature_extractor.extract_features(model_name=model_name, save=True)