In [17]:
# Import libraries and dependencies
import pandas as pd
import numpy as np
from pathlib import Path
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [29]:
csv_path = Path('SMA_Analysis/Stocks/ABBV.csv')
abbv_df=pd.read_csv(csv_path)
abbv_df = abbv_df.set_index(pd.DatetimeIndex(abbv_df['Date'].values))
abbv_df['daily_return'] = abbv_df['Close'].dropna().pct_change()
abbv_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,daily_return
2020-01-16,2020-01-16,84.776988,85.242123,84.406784,84.72953,7073200,0.0,0,
2020-01-17,2020-01-17,84.606121,84.739015,83.334137,83.533478,10122300,0.0,0,-0.014116
2020-01-21,2020-01-21,83.229723,84.03658,83.125306,83.523987,13552000,0.0,0,-0.000114
2020-01-22,2020-01-22,83.561949,83.694843,82.726617,82.783569,6438900,0.0,0,-0.008865
2020-01-23,2020-01-23,82.6317,82.650682,80.676252,80.942039,12266800,0.0,0,-0.022245


In [30]:
#Calculate MACD and Signal line indicators
#Calcualte short term exponential moving average (EMA)
ShortEMA = abbv_df.Close.ewm(span=12, adjust=False).mean()
#Calculate the long term EMA
LongEMA = abbv_df.Close.ewm(span=26, adjust=False).mean()
#Calculate MACD
MACD = ShortEMA - LongEMA
#calculate signal line
signal = MACD.ewm(span=9, adjust=False).mean()
#create new columns for data
abbv_df['ShortEMA'] = ShortEMA
abbv_df['LongEMA'] = LongEMA
abbv_df['MACD'] = MACD
abbv_df['Signal Line'] = signal
#show the data
abbv_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,daily_return,ShortEMA,LongEMA,MACD,Signal Line
2020-01-16,2020-01-16,84.776988,85.242123,84.406784,84.72953,7073200,0.0,0,,84.72953,84.72953,0.0,0.0
2020-01-17,2020-01-17,84.606121,84.739015,83.334137,83.533478,10122300,0.0,0,-0.014116,84.545522,84.640934,-0.095412,-0.019082
2020-01-21,2020-01-21,83.229723,84.03658,83.125306,83.523987,13552000,0.0,0,-0.000114,84.388363,84.558197,-0.169834,-0.049233
2020-01-22,2020-01-22,83.561949,83.694843,82.726617,82.783569,6438900,0.0,0,-0.008865,84.141472,84.426743,-0.285272,-0.09644
2020-01-23,2020-01-23,82.6317,82.650682,80.676252,80.942039,12266800,0.0,0,-0.022245,83.649251,84.168617,-0.519366,-0.181025


In [31]:
# Construct a crossover trading signal
abbv_df['crossover_long'] = np.where(abbv_df['MACD'] > abbv_df['Signal Line'], 1.0, 0.0)
abbv_df['crossover_short'] = np.where(abbv_df['MACD'] < abbv_df['Signal Line'], -1.0, 0.0)
abbv_df['MACD_Signal'] = abbv_df['crossover_long'] + abbv_df['crossover_short']
abbv_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,daily_return,ShortEMA,LongEMA,MACD,Signal Line,crossover_long,crossover_short,MACD_Signal
2020-01-16,2020-01-16,84.776988,85.242123,84.406784,84.72953,7073200,0.0,0,,84.72953,84.72953,0.0,0.0,0.0,0.0,0.0
2020-01-17,2020-01-17,84.606121,84.739015,83.334137,83.533478,10122300,0.0,0,-0.014116,84.545522,84.640934,-0.095412,-0.019082,0.0,-1.0,-1.0
2020-01-21,2020-01-21,83.229723,84.03658,83.125306,83.523987,13552000,0.0,0,-0.000114,84.388363,84.558197,-0.169834,-0.049233,0.0,-1.0,-1.0
2020-01-22,2020-01-22,83.561949,83.694843,82.726617,82.783569,6438900,0.0,0,-0.008865,84.141472,84.426743,-0.285272,-0.09644,0.0,-1.0,-1.0
2020-01-23,2020-01-23,82.6317,82.650682,80.676252,80.942039,12266800,0.0,0,-0.022245,83.649251,84.168617,-0.519366,-0.181025,0.0,-1.0,-1.0


In [21]:
 # Set x variable list of features
x_var_list = ['MACD_Signal']
abbv_df[x_var_list].head()

Unnamed: 0,MACD_Signal
2020-01-16,0.0
2020-01-17,-1.0
2020-01-21,-1.0
2020-01-22,-1.0
2020-01-23,-1.0


In [23]:
abbv_df[x_var_list] = abbv_df[x_var_list].shift(1)
abbv_df[x_var_list].head()

Unnamed: 0,MACD_Signal
2020-01-16,
2020-01-17,
2020-01-21,0.0
2020-01-22,-1.0
2020-01-23,-1.0


In [32]:
# Drop NAs and replace positive/negative infinity values
abbv_df.dropna(subset=x_var_list, inplace=True)
abbv_df.dropna(subset=['daily_return'], inplace=True)
abbv_df = abbv_df.replace([np.inf, -np.inf], np.nan)
abbv_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,daily_return,ShortEMA,LongEMA,MACD,Signal Line,crossover_long,crossover_short,MACD_Signal
2020-01-17,2020-01-17,84.606121,84.739015,83.334137,83.533478,10122300,0.0,0,-0.014116,84.545522,84.640934,-0.095412,-0.019082,0.0,-1.0,-1.0
2020-01-21,2020-01-21,83.229723,84.03658,83.125306,83.523987,13552000,0.0,0,-0.000114,84.388363,84.558197,-0.169834,-0.049233,0.0,-1.0,-1.0
2020-01-22,2020-01-22,83.561949,83.694843,82.726617,82.783569,6438900,0.0,0,-0.008865,84.141472,84.426743,-0.285272,-0.09644,0.0,-1.0,-1.0
2020-01-23,2020-01-23,82.6317,82.650682,80.676252,80.942039,12266800,0.0,0,-0.022245,83.649251,84.168617,-0.519366,-0.181025,0.0,-1.0,-1.0
2020-01-24,2020-01-24,81.093921,81.141386,78.559441,79.29985,13215400,0.0,0,-0.020288,82.980113,83.807968,-0.827855,-0.310391,0.0,-1.0,-1.0


In [34]:
# Construct the dependent variable where if daily return is greater than 0, then 1, else, 0.
abbv_df['Positive Return'] = np.where(abbv_df['daily_return'] > 0, 1.0, 0.0)
abbv_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,daily_return,ShortEMA,LongEMA,MACD,Signal Line,crossover_long,crossover_short,MACD_Signal,Positive Return
2020-01-17,2020-01-17,84.606121,84.739015,83.334137,83.533478,10122300,0.0,0,-0.014116,84.545522,84.640934,-0.095412,-0.019082,0.0,-1.0,-1.0,0.0
2020-01-21,2020-01-21,83.229723,84.03658,83.125306,83.523987,13552000,0.0,0,-0.000114,84.388363,84.558197,-0.169834,-0.049233,0.0,-1.0,-1.0,0.0
2020-01-22,2020-01-22,83.561949,83.694843,82.726617,82.783569,6438900,0.0,0,-0.008865,84.141472,84.426743,-0.285272,-0.09644,0.0,-1.0,-1.0,0.0
2020-01-23,2020-01-23,82.6317,82.650682,80.676252,80.942039,12266800,0.0,0,-0.022245,83.649251,84.168617,-0.519366,-0.181025,0.0,-1.0,-1.0,0.0
2020-01-24,2020-01-24,81.093921,81.141386,78.559441,79.29985,13215400,0.0,0,-0.020288,82.980113,83.807968,-0.827855,-0.310391,0.0,-1.0,-1.0,0.0


In [35]:
# Construct training start and end dates
training_start = abbv_df.index.min().strftime(format= '%Y-%m-%d')
training_end = '2021-01-15'

# Construct testing start and end dates
testing_start =  '2020-01-16'
testing_end = abbv_df.index.max().strftime(format= '%Y-%m-%d')

# Print training and testing start/end dates
print(f"Training Start: {training_start}")
print(f"Training End: {training_end}")
print(f"Testing Start: {testing_start}")
print(f"Testing End: {testing_end}")

Training Start: 2020-01-17
Training End: 2021-01-15
Testing Start: 2020-01-16
Testing End: 2021-01-15


In [37]:
X_train = abbv_df[x_var_list][training_start:training_end]
y_train = abbv_df['Positive Return'][training_start:training_end]

X_train.head()

Unnamed: 0,MACD_Signal
2020-01-17,-1.0
2020-01-21,-1.0
2020-01-22,-1.0
2020-01-23,-1.0
2020-01-24,-1.0


In [38]:
y_train.head()

2020-01-17    0.0
2020-01-21    0.0
2020-01-22    0.0
2020-01-23    0.0
2020-01-24    0.0
Name: Positive Return, dtype: float64

In [42]:
# Construct the X test and y test datasets
X_test = abbv_df[x_var_list][testing_start:testing_end]
y_test = abbv_df['Positive Return'][testing_start:testing_end]
X_test.head()

Unnamed: 0,MACD_Signal
2020-01-17,-1.0
2020-01-21,-1.0
2020-01-22,-1.0
2020-01-23,-1.0
2020-01-24,-1.0


In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [46]:
 # Fit a SKLearn linear regression using just the training set (X_train, Y_train):
model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)
model.fit(X_train, y_train)

# Make a prediction of "y" values from the X_test dataset
predictions = model.predict(X_test)

# Assemble actual y data (Y_test) with predicted y data (from just above) into two columns in a dataframe:
Results = y_test.to_frame()
Results["Predicted Value"] = predictions
Results.head()

Unnamed: 0,Positive Return,Predicted Value
2020-01-17,0.0,0.0
2020-01-21,0.0,0.0
2020-01-22,0.0,0.0
2020-01-23,0.0,0.0
2020-01-24,0.0,0.0


In [47]:
# Save the pre-trained model
from joblib import dump, load
dump(model, 'random_forest_model.joblib')

['random_forest_model.joblib']