# Random Forest Training

---

### Import Libraries and Dependencies

In [18]:
# Import libraries and dependencies
import pandas as pd
import numpy as np
from pathlib import Path
%matplotlib inline

import warnings
# Suppress all warnings from this point on.
# NOTE(review): this blanket filter also hides pandas / scikit-learn deprecation
# warnings raised by later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

### Read in CSV as Pandas DataFrame

In [19]:
# Load the trading-signals CSV (path is relative to the working directory)
# into a DataFrame and preview its first five rows.
csv_path = Path('trading_signals.csv')
trading_signals_df = pd.read_csv(filepath_or_buffer=csv_path)
trading_signals_df.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,daily_return,fast_close,slow_close,...,vol_trend_long,vol_trend_short,vol_trend_signal,bollinger_mid_band,bollinger_std,bollinger_upper_band,bollinger_lower_band,bollinger_long,bollinger_short,bollinger_signal
0,2016-01-06,25.139999,25.592501,24.967501,25.174999,23.323915,273829600,,25.174999,25.174999,...,0.0,0.0,0.0,,,,,0.0,0.0,0.0
1,2016-01-07,24.67,25.032499,24.1075,24.112499,22.339539,324377600,-0.042205,24.606984,24.640067,...,0.0,0.0,0.0,,,,,0.0,0.0,0.0
2,2016-01-08,24.637501,24.7775,24.190001,24.24,22.457672,283192000,0.005288,24.467362,24.504858,...,0.0,0.0,0.0,,,,,0.0,0.0,0.0
3,2016-01-11,24.7425,24.764999,24.334999,24.6325,22.821304,198957600,0.016192,24.517584,24.537435,...,1.0,0.0,1.0,,,,,0.0,0.0,0.0
4,2016-01-12,25.137501,25.172501,24.709999,24.99,23.152521,196616800,0.014513,24.639892,24.630475,...,1.0,0.0,1.0,,,,,0.0,0.0,0.0


### Set Index, Infer DateTimeFormat, and Drop Extraneous Columns

In [20]:
# Promote the 'Date' column to a datetime index and remove it from the columns.
# `infer_datetime_format` is no longer passed: it is deprecated since pandas 2.0,
# where strict format inference became the default behaviour of `to_datetime`.
# The frame is rebuilt by chaining rather than mutated with `inplace=True`.
date_index = pd.to_datetime(trading_signals_df['Date'])
# Passing a Series to set_index does not remove the source column, hence the explicit drop.
trading_signals_df = trading_signals_df.set_index(date_index).drop(columns=['Date'])
trading_signals_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,daily_return,fast_close,slow_close,crossover_long,...,vol_trend_long,vol_trend_short,vol_trend_signal,bollinger_mid_band,bollinger_std,bollinger_upper_band,bollinger_lower_band,bollinger_long,bollinger_short,bollinger_signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-06,25.139999,25.592501,24.967501,25.174999,23.323915,273829600,,25.174999,25.174999,0.0,...,0.0,0.0,0.0,,,,,0.0,0.0,0.0
2016-01-07,24.67,25.032499,24.1075,24.112499,22.339539,324377600,-0.042205,24.606984,24.640067,0.0,...,0.0,0.0,0.0,,,,,0.0,0.0,0.0
2016-01-08,24.637501,24.7775,24.190001,24.24,22.457672,283192000,0.005288,24.467362,24.504858,0.0,...,0.0,0.0,0.0,,,,,0.0,0.0,0.0
2016-01-11,24.7425,24.764999,24.334999,24.6325,22.821304,198957600,0.016192,24.517584,24.537435,0.0,...,1.0,0.0,1.0,,,,,0.0,0.0,0.0
2016-01-12,25.137501,25.172501,24.709999,24.99,23.152521,196616800,0.014513,24.639892,24.630475,1.0,...,1.0,0.0,1.0,,,,,0.0,0.0,0.0


### Set X-Variable List and Filter to Obtain Associated Values

In [21]:
# Names of the signal columns used as model features
x_var_list = [
    'crossover_signal',
    'vol_trend_signal',
    'bollinger_signal',
]

# Preview the last five rows of just those feature columns
trading_signals_df.loc[:, x_var_list].tail()

Unnamed: 0_level_0,crossover_signal,vol_trend_signal,bollinger_signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-12-29,1.0,1.0,-1.0
2020-12-30,1.0,1.0,-1.0
2020-12-31,1.0,1.0,-1.0
2021-01-04,1.0,1.0,-1.0
2021-01-05,1.0,1.0,-1.0


### Shift the Feature Columns by 1 Row (Lag the Signals)

In [22]:
# Lag the feature columns by one row so that each row carries the previous row's
# signals; the index and every other column stay where they are. This lets the
# prior signal act as the predictor for the current row's buy/sell outcome.
# NOTE(review): re-running this cell shifts the features again — run it once per kernel.
lagged_signals = trading_signals_df[x_var_list].shift(periods=1)
trading_signals_df[x_var_list] = lagged_signals
trading_signals_df[x_var_list].tail()

Unnamed: 0_level_0,crossover_signal,vol_trend_signal,bollinger_signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-12-29,1.0,1.0,-1.0
2020-12-30,1.0,1.0,-1.0
2020-12-31,1.0,1.0,-1.0
2021-01-04,1.0,1.0,-1.0
2021-01-05,1.0,1.0,-1.0


### Drop NAs and Replace Infs (Positive/Negative Infinity) 

In [23]:
# Clean the frame before modelling.
# Step 1: turn +/- infinity into NaN across the whole frame.
# Step 2: drop every row that has a NaN in any feature column or in daily_return.
# The replacement must happen first: doing it after the dropna (as before) left rows
# whose feature/return was infinite in the frame as NaN rows. NaNs in columns outside
# the subset (e.g. the early bollinger_* values) are intentionally kept.
required_columns = x_var_list + ['daily_return']
trading_signals_df = (
    trading_signals_df
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=required_columns)
)
trading_signals_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,daily_return,fast_close,slow_close,crossover_long,...,vol_trend_long,vol_trend_short,vol_trend_signal,bollinger_mid_band,bollinger_std,bollinger_upper_band,bollinger_lower_band,bollinger_long,bollinger_short,bollinger_signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-07,24.67,25.032499,24.1075,24.112499,22.339539,324377600,-0.042205,24.606984,24.640067,0.0,...,0.0,0.0,0.0,,,,,0.0,0.0,0.0
2016-01-08,24.637501,24.7775,24.190001,24.24,22.457672,283192000,0.005288,24.467362,24.504858,0.0,...,0.0,0.0,0.0,,,,,0.0,0.0,0.0
2016-01-11,24.7425,24.764999,24.334999,24.6325,22.821304,198957600,0.016192,24.517584,24.537435,0.0,...,1.0,0.0,0.0,,,,,0.0,0.0,0.0
2016-01-12,25.137501,25.172501,24.709999,24.99,23.152521,196616800,0.014513,24.639892,24.630475,1.0,...,1.0,0.0,1.0,,,,,0.0,0.0,0.0
2016-01-13,25.08,25.297501,24.325001,24.3475,22.557264,249758400,-0.02571,24.572868,24.581663,0.0,...,1.0,0.0,1.0,,,,,0.0,0.0,0.0


### Construct the Dependent Variable

In [24]:
# Binary target: 1.0 when the row's daily return is strictly positive, otherwise 0.0
# (zero and negative returns both map to 0.0).
is_up_day = trading_signals_df['daily_return'] > 0
trading_signals_df['Positive Return'] = is_up_day.astype(float)
trading_signals_df.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,daily_return,fast_close,slow_close,crossover_long,...,vol_trend_short,vol_trend_signal,bollinger_mid_band,bollinger_std,bollinger_upper_band,bollinger_lower_band,bollinger_long,bollinger_short,bollinger_signal,Positive Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-07,24.67,25.032499,24.1075,24.112499,22.339539,324377600,-0.042205,24.606984,24.640067,0.0,...,0.0,0.0,,,,,0.0,0.0,0.0,0.0
2016-01-08,24.637501,24.7775,24.190001,24.24,22.457672,283192000,0.005288,24.467362,24.504858,0.0,...,0.0,0.0,,,,,0.0,0.0,0.0,1.0
2016-01-11,24.7425,24.764999,24.334999,24.6325,22.821304,198957600,0.016192,24.517584,24.537435,0.0,...,0.0,0.0,,,,,0.0,0.0,0.0,1.0
2016-01-12,25.137501,25.172501,24.709999,24.99,23.152521,196616800,0.014513,24.639892,24.630475,1.0,...,0.0,1.0,,,,,0.0,0.0,0.0,1.0
2016-01-13,25.08,25.297501,24.325001,24.3475,22.557264,249758400,-0.02571,24.572868,24.581663,0.0,...,0.0,1.0,,,,,0.0,0.0,0.0,0.0
2016-01-14,24.49,25.120001,23.934999,24.879999,23.050606,252680400,0.021871,24.636883,24.626075,1.0,...,0.0,1.0,,,,,0.0,0.0,0.0,1.0
2016-01-15,24.049999,24.4275,23.84,24.282499,22.49704,319335600,-0.024015,24.568426,24.581016,0.0,...,0.0,1.0,,,,,0.0,0.0,0.0,0.0
2016-01-19,24.602501,24.6625,23.875,24.165001,22.388184,212350800,-0.004839,24.495164,24.532188,0.0,...,0.0,1.0,,,,,0.0,0.0,0.0,0.0
2016-01-20,23.775,24.547501,23.355,24.1975,22.418295,289337600,0.001345,24.443787,24.496593,0.0,...,0.0,1.0,,,,,0.0,0.0,0.0,1.0
2016-01-21,24.264999,24.469999,23.735001,24.075001,22.3048,208646000,-0.005062,24.382768,24.455555,0.0,...,0.0,1.0,,,,,0.0,0.0,0.0,0.0


### Assign Training and Testing Windows

In [25]:
# Chronological split boundaries, all expressed as 'YYYY-MM-DD' strings.
date_format = '%Y-%m-%d'

# Training window: first date in the index through the fixed cut-off date
training_start = trading_signals_df.index.min().strftime(date_format)
training_end = '2019-05-07'

# Testing window: the day after the cut-off through the last date in the index
testing_start = '2019-05-08'
testing_end = trading_signals_df.index.max().strftime(date_format)

# Report the four boundaries
print(f"Training Start: {training_start}")
print(f"Training End: {training_end}")
print(f"Testing Start: {testing_start}")
print(f"Testing End: {testing_end}")

Training Start: 2016-01-07
Training End: 2019-05-07
Testing Start: 2019-05-08
Testing End: 2021-01-05


### Separate X and y Training Datasets

In [26]:
# Training slice: rows labelled training_start..training_end on the date index
# (label-based slicing, both endpoints included), features in X and target in y.
X_train = trading_signals_df.loc[training_start:training_end, x_var_list]
y_train = trading_signals_df.loc[training_start:training_end, 'Positive Return']

X_train.tail()

Unnamed: 0_level_0,crossover_signal,vol_trend_signal,bollinger_signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-05-01,1.0,1.0,-1.0
2019-05-02,1.0,-1.0,-1.0
2019-05-03,1.0,-1.0,-1.0
2019-05-06,1.0,1.0,-1.0
2019-05-07,1.0,1.0,-1.0


In [27]:
# Last five training labels, for a visual check against X_train above
y_train.tail(5)

Date
2019-05-01    1.0
2019-05-02    0.0
2019-05-03    1.0
2019-05-06    0.0
2019-05-07    0.0
Name: Positive Return, dtype: float64

### Separate X and y Testing Datasets

In [28]:
# Testing slice: rows labelled testing_start..testing_end on the date index
# (label-based slicing, both endpoints included), features in X and target in y.
X_test = trading_signals_df.loc[testing_start:testing_end, x_var_list]
y_test = trading_signals_df.loc[testing_start:testing_end, 'Positive Return']

X_test.tail()

Unnamed: 0_level_0,crossover_signal,vol_trend_signal,bollinger_signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-12-29,1.0,1.0,-1.0
2020-12-30,1.0,1.0,-1.0
2020-12-31,1.0,1.0,-1.0
2021-01-04,1.0,1.0,-1.0
2021-01-05,1.0,1.0,-1.0


In [29]:
# Last five testing labels, for a visual check against X_test above
y_test.tail(5)

Date
2020-12-29    0.0
2020-12-30    0.0
2020-12-31    0.0
2021-01-04    0.0
2021-01-05    1.0
Name: Positive Return, dtype: float64

### Import SKLearn Library and Classes

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

### Train Random Forest Model 

In [31]:
# Fit a scikit-learn random forest classifier (not a regression) using only the
# training window. max_depth is 5 here; it was previously 3.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0).fit(X_train, y_train)

# Predict the class label for every row of the test window
predictions = model.predict(X_test)

# Put the observed labels and the model output side by side in one DataFrame.
# NOTE(review): "Positive Return" holds the observed labels from y_test, while the
# column named "Actual Value" holds the model's *predictions*. The name is kept
# unchanged because results.csv is written with this header — confirm with its
# consumers before renaming it.
Results = y_test.to_frame().assign(**{"Actual Value": predictions})
Results.head(15)

Unnamed: 0_level_0,Positive Return,Actual Value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-05-08,1.0,1.0
2019-05-09,0.0,0.0
2019-05-10,0.0,0.0
2019-05-13,0.0,1.0
2019-05-14,1.0,1.0
2019-05-15,1.0,1.0
2019-05-16,0.0,1.0
2019-05-17,0.0,1.0
2019-05-20,0.0,1.0
2019-05-21,1.0,1.0


### Save Pre-Trained Model Using Joblib

In [32]:
# Save the pre-trained model
# Persist the fitted classifier to the working directory; the cell output is the
# list of file names that dump() wrote.
# NOTE(review): joblib files are pickle-based — only load() them from trusted sources.
from joblib import dump, load
dump(model, 'random_forest_model-AAPL.joblib')

['random_forest_model-AAPL.joblib']

### Save X_test and Results dataframes as CSV files

In [33]:
# Write the test-window features (with their date index) to the working directory
X_test.to_csv(path_or_buf=r'X_test.csv')

In [34]:
# Write the observed-vs-predicted table (with its date index) to the working directory
Results.to_csv(path_or_buf=r'results.csv')