In [1]:
EXE_ENV_LOCAL = "LOC"
EXE_ENV_GCP = "GCP"

EXE_ENV = EXE_ENV_LOCAL # Exe Environment

# 1. Install and Import Libraries

## 1.1 Install Libararies

## 1.2 Import Libraries

In [25]:

from google.cloud import storage
import itertools
import ast
from joblib import dump,load

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

from sklearn.feature_selection import SelectFromModel, VarianceThreshold, RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor


In [20]:
import pandas as pd
import numpy as np
import os, sys
from datetime import datetime

from sklearn.ensemble import RandomForestRegressor

# Visulization
import seaborn as sns

# Owned

estherx_utils_dir = os.path.abspath('/Users/estherx/Documents/EstherxScripts/python/utils')
sys.path.append(estherx_utils_dir)  



# Owend
if EXE_ENV==EXE_ENV_GCP:

    reddit_dir = "data/Reddit"
    train_test_lstm_file_path = f"{reddit_dir}/train_test_lstm_reddit_data.csv"
    
    sys.path.append('/home/jupyter/utils')
    from esx_GCPStorageManager import GCPSManager

if EXE_ENV==EXE_ENV_LOCAL:
    reddit_dir = "/Users/estherx/Documents/EstherxScripts/python/stonkgo/stonkgo_v1.0.0/data_processing/Reddit"
    
    train_test_lstm_file_path = f"{reddit_dir}/train_test_lstm_reddit_data.csv"
    sys.path.append('/Users/estherx/Documents/EstherxScripts/python/stonkgo/stonkgo_v1.0.0/utils')
from esx_DataSplitter import split_data_by_cutoff_point
    
# Constants
HORIZON_DAYS = 5 # Forcast next 5 days
HORIZON_HOURS = HORIZON_DAYS * 24

# 1.Load the Data

In [11]:
def load_merged_data(train_test_lstm_file_path, is_gcp=EXE_ENV):
    if is_gcp == EXE_ENV_GCP:
            
        gcps = GCPSManager(bucket_name="adsp-capstone-enique-data")

        merged_df = gcps.read_csv(train_test_lstm_file_path)

    else:
        # Load the data from local
        merged_df = pd.read_csv(train_test_lstm_file_path)
        
    merged_df['ds'] = pd.to_datetime(merged_df['ds'])
    
    return  merged_df

merged_df = load_merged_data(train_test_lstm_file_path)

In [12]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48639 entries, 0 to 48638
Data columns (total 60 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   ds                             48639 non-null  datetime64[ns]
 1   open_price                     48639 non-null  float64       
 2   y_upper                        48639 non-null  float64       
 3   y_lower                        48639 non-null  float64       
 4   y                              48639 non-null  float64       
 5   volume                         48639 non-null  float64       
 6   quote_asset_volume             48639 non-null  float64       
 7   num_of_trades                  48639 non-null  int64         
 8   taker_buy_base_asset_volume    48639 non-null  float64       
 9   taker_buy_quote_asset_volume   48639 non-null  float64       
 10  RSI                            48639 non-null  float64       
 11  MACD           

In [14]:
train_lstm, test_lstm = split_data_by_cutoff_point(merged_df,test_rows = 120)

In [15]:
train_lstm.set_index('ds',inplace=True)

# 2.Feature Selection - Merged Data

## 2.1 Correlation Analysis

In [16]:
corrs = train_lstm.corr()

# Get absolute values for specific columns and sort each individually
target_corrs = corrs['y'].abs().sort_values(ascending=False)

In [17]:
# Set a correlation threshold
corr_threshold = 0.1

# Filter features based on the threshold
corr_features = target_corrs[target_corrs > corr_threshold].index.tolist()

# Exclude the target column itself from the list if it's present
corr_features = [feature for feature in corr_features if feature != 'y']

In [None]:
# Plot the correlation heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(train_lstm[['y'] + corr_features].corr(), annot=True, cmap='coolwarm')
plt.show()

## 2.2 Feature Importance using RandomForest

In [18]:
X_feature_selection = train_lstm.drop(columns=['y'])  
y_feature_selection = train_lstm[['y']]

In [21]:
# Fit RandomForest to estimate feature importance
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_feature_selection, y_feature_selection)

  return fit_method(estimator, *args, **kwargs)


In [22]:
feature_importances = pd.Series(rf.feature_importances_, index=X_feature_selection.columns)
feature_importances = feature_importances.sort_values(ascending=False)

In [23]:
# Select top 20 features
rf_features = feature_importances.index[:20] 
rf_features

Index(['typical_price', 'y_lower', 'y_upper', 'open_price', 'prophet_forecast',
       'bollinger_upper', 'positive_money_flow', 'negative_money_flow',
       'bollinger_lower', 'RSI', 'num_of_trades', 'MACD_diff',
       'taker_buy_base_asset_volume', 'MACD_signal',
       'taker_buy_quote_asset_volume', 'MACD', 'prophet_weekly',
       'quote_asset_volume', 'volume', 'raw_money_flow'],
      dtype='object')

## 2.3 Variance Threshold

Remove features with low variance

In [26]:
# Set a variance threshold
selector = VarianceThreshold(threshold=0.1)  
selector.fit(X_feature_selection)

In [27]:
vt_features = X_feature_selection.columns[selector.get_support()]
vt_features

Index(['open_price', 'y_upper', 'y_lower', 'volume', 'quote_asset_volume',
       'num_of_trades', 'taker_buy_base_asset_volume',
       'taker_buy_quote_asset_volume', 'RSI', 'MACD', 'MACD_signal',
       'MACD_diff', 'typical_price', 'raw_money_flow', 'positive_money_flow',
       'negative_money_flow', 'MFI', 'bollinger_upper', 'bollinger_lower',
       'prophet_forecast', 'prophet_trend', 'prophet_yhat_low',
       'prophet_yhat_high'],
      dtype='object')

## 2.4 Recursive Feature Elimination (RFE)

To recursively eliminate less important features.

In [29]:
# Function to remove zero variance columns
def remove_zero_variance(df):
    return df.loc[:, (df != df.iloc[0]).any()]

X_feature_selection_no_zero = remove_zero_variance(X_feature_selection)

In [30]:
# Use RFE with a LinearRegression model
lr = LinearRegression()
rfe = RFE(estimator=lr, n_features_to_select=20)  
rfe.fit(X_feature_selection_no_zero, y_feature_selection)

In [31]:
# Extract selected features
rfe_features= X_feature_selection_no_zero.columns[rfe.support_]
rfe_features

Index(['y_upper', 'y_lower', 'RSI', 'typical_price', 'raw_money_flow',
       'positive_money_flow', 'negative_money_flow', 'sentiment_positive',
       'sentiment_negative', 'sentiment_neutral',
       'sentiment_positive_momentum', 'sentiment_negative_momentum',
       'sentiment_neutral_momentum', 'sentiment_positive_lag_1hr',
       'sentiment_negative_lag_1hr', 'sentiment_neutral_lag_1hr',
       'sentiment_positive_lag_6hr', 'sentiment_positive_lag_7hr',
       'sentiment_negative_lag_7hr', 'sentiment_neutral_lag_7hr'],
      dtype='object')

In [33]:
rfe_results = {}

# Loop through each target column
for target_col in ['y']:  # You can add more target columns if needed
    y_target = y_feature_selection[target_col]

    # Step 1: Fit RFE with Linear Regression
    rfe = RFE(estimator=LinearRegression(), n_features_to_select=20)
    rfe.fit(X_feature_selection_no_zero, y_target)

    # Step 2: Extract selected features
    X_selected_rfe = X_feature_selection_no_zero[rfe_features]

    # Step 3: Fit a linear regression model with statsmodels
    X_selected_rfe = sm.add_constant(X_selected_rfe)  # Add a constant term for the intercept
    model_rfe = sm.OLS(y_target, X_selected_rfe).fit()

    # Step 4: Check the p-values
    p_values = model_rfe.pvalues

    # Display results
    print(f"\nSelected Features and their p-values for {target_col} target:")
    for feature, p_value in zip(['const'] + rfe_features.tolist(), p_values):  # Include the intercept
        print(f"{feature}: {p_value:.4f}")

    # Save the results
    rfe_results[target_col] = model_rfe.summary()

    # Filter features based on p-value threshold
    p_value_threshold = 0.05
    selected_features = p_values[p_values < p_value_threshold].index.tolist()

    # Remove the intercept if it's in the selected features
    if 'const' in selected_features:
        selected_features.remove('const')

    print("Selected features based on p-values:", selected_features)

    # Check for multicollinearity
    # LSTM Considerations: While LSTMs are generally robust to multicollinearity, 
    # ensuring data quality and removing highly collinear features can improve model stability and performance.
    
    vif_data = pd.DataFrame()
    vif_data['feature'] = X_selected_rfe.columns
    vif_data['VIF'] = [variance_inflation_factor(X_selected_rfe.values, i) for i in range(X_selected_rfe.shape[1])]

    print("Variance Inflation Factor (VIF) for each feature:")
    print(vif_data)

    # Filter features based on VIF threshold
    vif_threshold = 10  # Common threshold, but can vary
    features_to_keep = vif_data[vif_data['VIF'] < vif_threshold]['feature'].tolist()

    # Remove 'const' from the list if present
    if 'const' in features_to_keep:
        features_to_keep.remove('const')

    print("Selected features after VIF filtering:", features_to_keep)

    # Update the results
    rfe_results[target_col] = {
        "model_summary": model_rfe.summary(),
        "selected_features": selected_features,
        "vif_filtered_features": features_to_keep
    }


Selected Features and their p-values for y target:
const: 1.0000
y_upper: 0.0000
y_lower: 0.0000
RSI: 1.0000
typical_price: 0.0000
raw_money_flow: 1.0000
positive_money_flow: 1.0000
negative_money_flow: 1.0000
sentiment_positive: 1.0000
sentiment_negative: 1.0000
sentiment_neutral: 1.0000
sentiment_positive_momentum: 1.0000
sentiment_negative_momentum: 1.0000
sentiment_neutral_momentum: 1.0000
sentiment_positive_lag_1hr: 1.0000
sentiment_negative_lag_1hr: 1.0000
sentiment_neutral_lag_1hr: 1.0000
sentiment_positive_lag_6hr: 1.0000
sentiment_positive_lag_7hr: 1.0000
sentiment_negative_lag_7hr: 1.0000
sentiment_neutral_lag_7hr: 1.0000
Selected features based on p-values: ['y_upper', 'y_lower', 'typical_price']


  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


Variance Inflation Factor (VIF) for each feature:
                        feature           VIF
0                         const  0.000000e+00
1                       y_upper  6.969908e+04
2                       y_lower  6.063394e+04
3                           RSI  1.119654e+00
4                 typical_price  1.839175e+05
5                raw_money_flow           inf
6           positive_money_flow           inf
7           negative_money_flow           inf
8            sentiment_positive  2.262843e+05
9            sentiment_negative  1.668660e+05
10            sentiment_neutral  2.317199e+05
11  sentiment_positive_momentum  1.057904e+06
12  sentiment_negative_momentum  6.914247e+06
13   sentiment_neutral_momentum  4.569300e+05
14   sentiment_positive_lag_1hr  4.191033e+06
15   sentiment_negative_lag_1hr  1.126874e+05
16    sentiment_neutral_lag_1hr  1.625002e+04
17   sentiment_positive_lag_6hr  1.174302e+00
18   sentiment_positive_lag_7hr  7.713631e+06
19   sentiment_negative_lag_7h

In [34]:
print(rfe_results['y']['model_summary'])


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.738e+10
Date:                Tue, 30 Jul 2024   Prob (F-statistic):               0.00
Time:                        11:23:00   Log-Likelihood:            -1.7178e+05
No. Observations:               48519   AIC:                         3.436e+05
Df Residuals:                   48504   BIC:                         3.437e+05
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

In [35]:
rfe_features_final = rfe_results['y']['vif_filtered_features']
rfe_features_final

['RSI', 'sentiment_positive_lag_6hr']

## 2.5 Final Selected Features

Combine the results from different methods to get the final set of features for the LSTM model.

In [37]:
# Combine features from different methods
selected_features_vif_filtered = list(set(corr_features) | set(rf_features) | set(vt_features) | set(rfe_features_final))

print("Final selected features for LSTM model:\n", selected_features_vif_filtered)

Final selected features for LSTM model:
 ['sentiment_neutral_lag_2hr', 'bollinger_lower', 'prophet_yhat_low', 'sentiment_positive_lag_6hr', 'sentiment_neutral', 'sentiment_negative_lag_7hr', 'open_price', 'raw_money_flow', 'sentiment_neutral_lag_7hr', 'RSI', 'sentiment_neutral_volatility', 'sentiment_neutral_lag_3hr', 'MACD_diff', 'num_of_trades', 'prophet_yhat_high', 'y_lower', 'bollinger_upper', 'tw_avg_negative', 'sentiment_positive_volatility', 'quote_asset_volume', 'sentiment_negative_lag_6hr', 'sentiment_neutral_lag_1hr', 'MACD', 'MFI', 'sentiment_negative_lag_3hr', 'sentiment_neutral_lag_5hr', 'volume', 'tw_avg_positive', 'sentiment_negative', 'prophet_weekly', 'taker_buy_base_asset_volume', 'sentiment_negative_volatility', 'prophet_yearly', 'tw_avg_neutral', 'typical_price', 'MACD_signal', 'sentiment_neutral_lag_4hr', 'positive_money_flow', 'sentiment_neutral_lag_6hr', 'sentiment_negative_lag_5hr', 'sentiment_negative_lag_2hr', 'taker_buy_quote_asset_volume', 'y_upper', 'negati

In [41]:
len(selected_features_vif_filtered)

48

In [42]:
len(selected_features_top)

54

In [38]:
# Combine features from different methods
selected_features_top = list(set(corr_features) | set(rf_features) | set(vt_features) | set(rfe_features))

print("Final selected features for LSTM model:\n", selected_features_top)

Final selected features for LSTM model:
 ['sentiment_neutral_lag_2hr', 'bollinger_lower', 'prophet_yhat_low', 'sentiment_positive_lag_6hr', 'sentiment_positive_lag_1hr', 'sentiment_neutral', 'sentiment_negative_lag_7hr', 'open_price', 'raw_money_flow', 'sentiment_neutral_lag_7hr', 'RSI', 'sentiment_neutral_volatility', 'sentiment_neutral_lag_3hr', 'MACD_diff', 'num_of_trades', 'prophet_yhat_high', 'y_lower', 'bollinger_upper', 'tw_avg_negative', 'sentiment_positive_volatility', 'quote_asset_volume', 'sentiment_negative_lag_6hr', 'sentiment_neutral_lag_1hr', 'MACD', 'MFI', 'sentiment_positive_momentum', 'sentiment_negative_momentum', 'sentiment_negative_lag_3hr', 'sentiment_neutral_lag_5hr', 'volume', 'tw_avg_positive', 'sentiment_negative', 'prophet_weekly', 'taker_buy_base_asset_volume', 'sentiment_negative_volatility', 'prophet_yearly', 'tw_avg_neutral', 'typical_price', 'MACD_signal', 'sentiment_neutral_lag_4hr', 'positive_money_flow', 'sentiment_neutral_lag_6hr', 'sentiment_negativ

In [43]:
# Convert lists to sets
set_vif_filtered = set(selected_features_vif_filtered)
set_top = set(selected_features_top)

# Find differences
only_in_vif_filtered = set_vif_filtered - set_top
only_in_top = set_top - set_vif_filtered

# Print the differences
print("Features only in VIF filtered list:", only_in_vif_filtered)
print("Features only in Top selected list:", only_in_top)

Features only in VIF filtered list: set()
Features only in Top selected list: {'sentiment_positive_lag_7hr', 'sentiment_positive_momentum', 'sentiment_positive_lag_1hr', 'sentiment_neutral_momentum', 'sentiment_positive', 'sentiment_negative_momentum'}


In [44]:
len(set_vif_filtered)

48

In [45]:
len(set_top)

54

In [39]:
rfe_features_final

['RSI', 'sentiment_positive_lag_6hr']

In [40]:
rfe_features

Index(['y_upper', 'y_lower', 'RSI', 'typical_price', 'raw_money_flow',
       'positive_money_flow', 'negative_money_flow', 'sentiment_positive',
       'sentiment_negative', 'sentiment_neutral',
       'sentiment_positive_momentum', 'sentiment_negative_momentum',
       'sentiment_neutral_momentum', 'sentiment_positive_lag_1hr',
       'sentiment_negative_lag_1hr', 'sentiment_neutral_lag_1hr',
       'sentiment_positive_lag_6hr', 'sentiment_positive_lag_7hr',
       'sentiment_negative_lag_7hr', 'sentiment_neutral_lag_7hr'],
      dtype='object')