# Setup

### Import relevant packages

In [21]:
# general packages
import os
import pandas as pd
from tqdm.auto import tqdm
import numpy as np

In [2]:
# packages for the YAML file
import yaml

In [17]:
# packages for feature importance
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

### Set the maximum output number of columns and rows

In [3]:
# show up to 100 columns and 100 rows when a dataframe is displayed in the notebook
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

# ---------------------------------------------------------------------------------------------------------------

# Read in the variables from the YAML file

In [4]:
# read in YAML configuration file
# NOTE(review): the path is relative, so it only resolves when the notebook's working
# directory sits two levels below the folder that contains "Config_files"
# NOTE(review): yaml.FullLoader accepts more than plain scalars/lists/dicts; if the config
# only holds plain data, yaml.safe_load is the usual choice -- confirm before switching
with open("../../Config_files/config.yaml", "r") as variables:
    config_variables = yaml.load(variables, Loader=yaml.FullLoader)

In [5]:
# directory that the processed data CSV is read from
data_directory = config_variables["data_directory"]

In [8]:
# list of (start date, end date) pairs; the analysis loop runs once per pair
train_intervals_list = config_variables["train_intervals_list"]

In [15]:
# shift periods used to pick the target column for each fit (presumably in days, per the config key name)
shift_periods_in_days_list = config_variables["shift_periods_in_days_list"]

In [18]:
# seed handed to the random forest's random_state so repeated runs give the same importances
random_seed = config_variables["random_seed"]

# ---------------------------------------------------------------------------------------------------------------

# Read in the processed data

In [6]:
# load the processed dataset; no date parsing is requested here, so the "date" column keeps
# whatever dtype read_csv infers (presumably ISO-formatted strings -- TODO confirm, since the
# date-range filter further down compares this column against the configured dates)
processed_data = pd.read_csv(os.path.join(data_directory, 'processed_data.csv'))

In [7]:
# sanity check: (number of rows, number of columns) of the loaded data
processed_data.shape

(4510, 1464)

# ---------------------------------------------------------------------------------------------------------------

# Ascertain the most important features in our data

In [26]:
def get_cols_with_nan(df):

    """
    List the columns of a dataframe that contain at least one missing value

    Parameters:
        df (dataframe) : A pandas dataframe to inspect

    Returns:
        (list) : The column labels, in the dataframe's column order, whose data holds one or more null values
    """

    # check each column on its own so the labels come back in the dataframe's own order
    return [label for label in df.columns if df[label].isnull().values.any()]


def drop_nan_cols(df):

    """
    Return a new dataframe without the columns that contain missing values

    The names of the removed columns are printed so the reader can see what was excluded.

    Parameters:
        df (dataframe) : A pandas dataframe; it is not modified

    Returns:
        (dataframe) : A new dataframe holding only the columns of the input that have no null values
    """

    nan_cols = get_cols_with_nan(df)
    print("Dropping the following 'Nan' columns:\n", nan_cols)

    # DataFrame.drop already returns a new dataframe (even for an empty list), so copying
    # the whole input first only doubles the memory used
    return df.drop(columns=nan_cols)

In [9]:
def get_subset_of_data(df, start_date, end_date):

    """
    Select the rows of a dataframe whose "date" value lies inside an inclusive date range

    Parameters:
        df (dataframe) : A pandas dataframe that has a column called "date"
        start_date (string) : The earliest date to keep in the subset
        end_date (string) : The latest date to keep in the subset

    Returns:
        (dataframe) : The rows of the input whose "date" is on or after start_date and on or before end_date, with the index reset to start at 0
    """

    dates = df['date']
    on_or_after_start = dates >= start_date
    on_or_before_end = dates <= end_date

    # both bounds are inclusive
    rows_in_range = df.loc[on_or_after_start & on_or_before_end]
    return rows_in_range.reset_index(drop=True)

In [12]:
def get_features(df, shift_period, reg_or_clas):

    """
    Keep the predictor columns plus the single target column for one shift period

    Target columns are recognised by name: any column whose name contains "shifted" is a
    regression target and any column whose name contains "binary_price_change" is a
    classification target. Every target column except the one selected is dropped, together
    with the "date" column.

    Parameters:
        df (dataframe) : A pandas dataframe that has a column called "date"
        shift_period : The value formatted into the target column's name
        reg_or_clas (string) : 'reg' keeps "shifted_<shift_period>", 'clas' keeps "binary_price_change_<shift_period>"

    Returns:
        (dataframe) : The input without the dropped columns
        (list) : The names of the columns that were dropped
        (string) : The name of the target column that was kept

    Raises:
        ValueError : If reg_or_clas is neither 'reg' nor 'clas'
    """

    # fail early with a clear message; previously an unknown mode fell through both branches
    # and surfaced as an UnboundLocalError further down
    if reg_or_clas not in ('reg', 'clas'):
        raise ValueError("reg_or_clas must be 'reg' or 'clas', got {!r}".format(reg_or_clas))

    # filter(like=...) matches on substrings of the column names
    shifted_cols = list(df.filter(like="shifted", axis=1).columns)
    binary_price_cols = list(df.filter(like="binary_price_change", axis=1).columns)

    if reg_or_clas == 'reg':
        # regression: drop every binary target and every shifted target except the chosen one
        train_colname = "shifted_{}".format(shift_period)
        binary_price_cols_to_drop = binary_price_cols
        shifted_price_cols_to_drop = [col for col in shifted_cols if col != train_colname]
    else:
        # classification: drop every shifted target and every binary target except the chosen one
        train_colname = "binary_price_change_{}".format(shift_period)
        shifted_price_cols_to_drop = shifted_cols
        binary_price_cols_to_drop = [col for col in binary_price_cols if col != train_colname]

    cols_to_drop = ["date"] + binary_price_cols_to_drop + shifted_price_cols_to_drop

    # drop the columns that are not to be used in the feature importance analysis
    features = df.drop(columns=cols_to_drop)

    return features, cols_to_drop, train_colname

In [23]:
def use_rand_forest_to_get_feature_importance(reg_or_clas, feature_df, y, random_seed):

    """
    Fit a random forest on the features and report the importance it assigns to each one

    Parameters:
        reg_or_clas (string) : 'reg' fits a RandomForestRegressor, 'clas' fits a RandomForestClassifier
        feature_df (dataframe) : A pandas dataframe of the predictor columns (target column excluded)
        y (array) : The target values, one per row of feature_df
        random_seed : Passed to the forest as random_state

    Returns:
        (dataframe) : One row per feature with the columns "feature" and "importance", sorted by ascending importance (least important first)

    Raises:
        ValueError : If reg_or_clas is neither 'reg' nor 'clas'
    """

    # pick the model for the requested analysis; the branches are mutually exclusive and an
    # unknown mode is rejected here instead of failing later with an UnboundLocalError
    if reg_or_clas == 'reg':
        rf_final = RandomForestRegressor(random_state=random_seed, n_jobs=-1)
    elif reg_or_clas == 'clas':
        rf_final = RandomForestClassifier(random_state=random_seed, n_jobs=-1)
    else:
        raise ValueError("reg_or_clas must be 'reg' or 'clas', got {!r}".format(reg_or_clas))
    rf_final.fit(feature_df, y)

    # make a dataframe of the feature importance
    feature_importance_dict = {"feature": feature_df.columns, "importance": rf_final.feature_importances_}
    feature_importance_df = pd.DataFrame(feature_importance_dict).sort_values(["importance"]).reset_index(drop=True)

    return feature_importance_df

In [24]:
# analysis mode: "clas" fits a RandomForestClassifier on a binary_price_change target,
# "reg" fits a RandomForestRegressor on a shifted target
reg_or_clas = "clas"

In [34]:
# iterate through the intervals
for train_start_date, train_end_date in tqdm(train_intervals_list):
    print("-------------")
    print("Time Interval:", train_start_date, "-->", train_end_date)
    time_interval_str = '{}_to_{}'.format(train_start_date, train_end_date)

    # define the train data for this date interval
    train_df = get_subset_of_data(processed_data, train_start_date, train_end_date)

    # drop any features that have Nan's in them
    # (this is applied to every column, so a target column containing NaN would be removed too)
    subset_train_df = drop_nan_cols(train_df)

    # iterate over the shift periods and select the features based on this period
    for shift_period in shift_periods_in_days_list:

        # get a dataframe of the data's features
        train_features, train_cols_to_drop, train_colname = get_features(subset_train_df, shift_period, reg_or_clas)

        # label the table that follows; without this the tables printed for the different
        # shift periods of one interval cannot be told apart
        print("Shift period:", shift_period, "| target:", train_colname)

        # get a numpy array of the feature we are trying to predict
        y = np.ravel(train_features[train_colname])

        # Use random forest to score feature importance (the target column is removed from the predictors)
        feature_importance_df = use_rand_forest_to_get_feature_importance(reg_or_clas, train_features.drop(columns=[train_colname]), y, random_seed)

        print(feature_importance_df.to_markdown())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

-------------
Time Interval: 2015-08-17 --> 2020-12-31
Dropping the following 'Nan' columns:
 ['crypto_index_open', 'crypto_index_high', 'crypto_index_low', 'crypto_index_close', 'johannesburg_exchange_open', 'johannesburg_exchange_high', 'johannesburg_exchange_low', 'johannesburg_exchange_close', 'denmark_gov_debt_all', 'sweden_gov_debt_all']
|      | feature                                        |   importance |
|-----:|:-----------------------------------------------|-------------:|
|    0 | italy_gdp_at_market_prices                     |  0           |
|    1 | denmark_trust_in_legal_system                  |  0           |
|    2 | denmark_secondary_house_purchases              |  0           |
|    3 | euro_19_gdp_at_market_prices                   |  0           |
|    4 | euro_area_percent_afford_unexpected_expenses   |  0           |
|    5 | france_gdp_at_market_prices                    |  0           |
|    6 | denmark_gdp_at_market_prices                   |  0          

!! 1449 - tweets90roc
!! 1441 - tweets3rsi
^^ 1438 - marketcap7rsi
!! 1433 - tweets7mom
-- 1430 - google_trends3std
-- 1421 - google_trends3trx
^^ 1418 - marketcap14rsi
^^ 1415 - marketcap3rsi
!! 1410 - tweets3roc
!! 1401 - tweets7rsi
!! 1399 - tweets30roc
!! 1396 - tweets14trx
!! 1386 - tweets7roc
!! 1382 - tweets14roc
-- 1379 - google_trends3rsi
^^ 1368 - rank_coinbase 
!! 1365 - tweets3trx
^^ 1364 - marketcap14mom
^^ 1363 - marketcap3mom
^^ 1344 - rank_bit_x
++ 1333 - coffee_futures_close
-- 1329 - google_trends90roc
^^ 1326 - marketcap7roc
!! 1324 - tweets14mom
!! 1319 - tweets14rsi
-- 1318 - google_trends14mom
** 1316 - french_cac_40_volume
!! 1315 - tweets90mom
** 1309 - euro_stoxx_50_volume
-- 1305 - google_trends7trx
-- 1297 - google_trends14roc
++ 1292 - lean_hogs_futures_close
-- 1284 - google_trends7rsi
-- 1285 - google_trends7mom
++ 1273 - coffee_futures_low
++ 1270 - lumber_futures_volume
!! 1269 - tweets3mom
-- 1266 - google_trends7roc
++ 1254 - rough_rice_futures_volume
^^ 1252 - marketcap7trx
!! 1251 - tweets30mom

My features = 20.5 % of Top 200
-------
 - 1%   = stock indices
 - 2.5% = commodity futures
 - 4.5% = other internal features I added
 - 5%   = features on google trend of "bitcoin"
 - 7.5% = features on tweets containing "bitcoin"
