In [1]:
import numpy as np
import pandas as pd
import indicators as ind
import matplotlib.pyplot as plt
import pandas_ta as ta
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

### Function that returns technical indicators value

In [24]:
def add_features(df:pd.DataFrame):
    df['sma_16'] = ta.sma(df.close, length=16)
    df['ema_16'] = ta.ema(df.close, length=16)
    a = ta.macd(df.close)
    df.join(a)
    a = ta.adx(df.high,df.low,df.close)
    df.join(a)
    df['rsi_16'] = ta.rsi(df.close, length=16)
    a = ta.bbands(df.close)
    df.join(a)
    df["cci_16"] = ta.cci(df.high, df.low, df.close, length=16)
    df["atr"] = ta.atr(df.high, df.low, df.close, length=16)
    a = ta.stoch(df.high, df.low, df.close)
    df = df.join(a)
    a = ta.stochrsi(df.close, length=16)
    df = df.join(a)
    df["wpr"] = ta.willr(df.high, df.low, df.close, length=16)
    df['curr_rets'] = df.close.pct_change(periods=2).mul(100)
    # df['fwd_rets'] = df.close.shift(-1).pct_change().mul(100)
    df['target_signal'] = np.where(df['curr_rets'] > 1.5, 1, 0)
    df['target_signal'] = np.where(df['curr_rets'] < -1.5, -1, df['target_signal'])
    return df

In [25]:
stock_data = pd.read_csv('../data/S & P 50/AAPL.csv')
stock_data.head()

Unnamed: 0,date,open,high,low,close,volume
0,2018-01-02 00:00:00.000000,42.540001,43.075001,42.314999,43.064999,102223600.0
1,2018-01-03 00:00:00.000000,43.1325,43.637501,42.990002,43.057499,118071600.0
2,2018-01-04 00:00:00.000000,43.134998,43.3675,43.02,43.2575,89738400.0
3,2018-01-05 00:00:00.000000,43.360001,43.842499,43.262501,43.75,94640000.0
4,2018-01-08 00:00:00.000000,43.587502,43.9025,43.482498,43.587502,82271200.0


In [26]:
new_df = add_features(stock_data)

In [27]:
new_df.head(20)

Unnamed: 0,date,open,high,low,close,volume,sma_16,ema_16,rsi_16,cci_16,atr,STOCHk_14_3_3,STOCHd_14_3_3,STOCHRSIk_16_14_3_3,STOCHRSId_16_14_3_3,wpr,curr_rets,fwd_rets,target_signal
0,2018-01-02 00:00:00.000000,42.540001,43.075001,42.314999,43.064999,102223600.0,,,,,,,,,,,,,0
1,2018-01-03 00:00:00.000000,43.1325,43.637501,42.990002,43.057499,118071600.0,,,,,,,,,,,-0.017415,0.464497,1
2,2018-01-04 00:00:00.000000,43.134998,43.3675,43.02,43.2575,89738400.0,,,,,,,,,,,0.464497,1.138532,0
3,2018-01-05 00:00:00.000000,43.360001,43.842499,43.262501,43.75,94640000.0,,,,,,,,,,,1.138532,-0.371425,-1
4,2018-01-08 00:00:00.000000,43.587502,43.9025,43.482498,43.587502,82271200.0,,,,,,,,,,,-0.371425,-0.011474,0
5,2018-01-09 00:00:00.000000,43.637501,43.764999,43.352501,43.5825,86336000.0,,,,,,,,,,,-0.011474,-0.02295,0
6,2018-01-10 00:00:00.000000,43.290001,43.575001,43.25,43.572498,95839600.0,,,,,,,,,,,-0.02295,0.568022,1
7,2018-01-11 00:00:00.000000,43.647499,43.872501,43.622501,43.82,74670800.0,,,,,,,,,,,0.568022,1.032632,0
8,2018-01-12 00:00:00.000000,44.044998,44.34,43.912498,44.272499,101672400.0,,,,,,,,,,,1.032632,-0.508213,-1
9,2018-01-16 00:00:00.000000,44.474998,44.8475,44.035,44.047501,118263600.0,,,,,,,,,,,-0.508213,1.651628,1


In [28]:
new_df['target_signal'].value_counts()

 0    622
 1    319
-1    318
Name: target_signal, dtype: int64

In [29]:
new_df.dropna(inplace=True)

In [30]:
new_df.index = pd.to_datetime(new_df['date'])
new_df.drop('date',axis=1,inplace=True)

## Data Preprocessing

In [32]:
X = new_df.drop(['curr_rets','fwd_rets','target_signal'],axis=1)
y = new_df['target_signal']

In [33]:
X.head()

Unnamed: 0_level_0,open,high,low,close,volume,sma_16,ema_16,rsi_16,cci_16,atr,STOCHk_14_3_3,STOCHd_14_3_3,STOCHRSIk_16_14_3_3,STOCHRSId_16_14_3_3,wpr
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-02-20,43.012501,43.564999,42.855,42.962502,135722000.0,41.139844,41.785292,55.07939,115.333815,1.092941,92.585331,86.851565,97.74723,94.837429,-12.082998
2018-02-21,43.2075,43.529999,42.752499,42.767502,149886400.0,41.188437,41.900846,53.840908,100.650686,1.070754,87.645762,89.917186,95.130542,97.382006,-15.256305
2018-02-22,42.950001,43.487499,42.927502,43.125,123967600.0,41.274844,42.044864,55.785095,96.255793,1.046274,87.7407,89.323931,94.910879,95.929551,-9.438597
2018-02-23,43.4175,43.912498,43.384998,43.875,135249600.0,41.400938,42.260174,59.593541,108.381565,1.028345,91.571601,88.986021,96.431894,95.491105,-0.590295
2018-02-26,44.087502,44.8475,44.052502,44.7425,152648800.0,41.575781,42.552212,63.475092,124.071017,1.024501,96.176764,91.829688,99.048582,96.797118,-1.440817


In [34]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [35]:
X_scaled[:3]

array([[-1.22893437, -1.22751541, -1.22322327, -1.2312537 ,  0.37046771,
        -1.25805351, -1.24588877, -0.01576356,  0.79805383, -1.01492748,
         1.11775013,  0.94423046,  1.33058779,  1.28834922,  0.8763833 ],
       [-1.22469376, -1.22826693, -1.22548232, -1.23549369,  0.63210945,
        -1.25699751, -1.24337339, -0.11931459,  0.66434701, -1.02988278,
         0.94831221,  1.05228156,  1.25149807,  1.36820708,  0.77521276],
       [-1.23029354, -1.22917949, -1.22162538, -1.22772041,  0.15334298,
        -1.25511979, -1.24023842,  0.04324126,  0.62432645, -1.04638308,
         0.95156879,  1.03137166,  1.2448587 ,  1.32262387,  0.96069143]])

## Random Forest Classifier

In [36]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify=y, random_state=10)

In [37]:
X_test.shape[0]/X_train.shape[0]

0.3340587595212187

In [38]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=10)
rf_model.fit(X_train, y_train)

predictions = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.4560260586319218


## Bagging Classifier with LR Estimator

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

bag_model = BaggingClassifier(base_estimator=LogisticRegression()
                             ,n_estimators=100
                             ,max_samples=0.8
                             ,oob_score=True
                             ,random_state=10)
bag_model.fit(X_train, y_train)
bag_model.oob_score_


0.5168661588683352

In [40]:
predictions = bag_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.4755700325732899


In [33]:
import joblib

# Saving the trained model to a file
joblib.dump(bag_model, 'bagging_logistic_regression.pkl')



['bagging_logistic_regression.pkl']

## Bagging Classifier with DTC estimator

In [41]:
from sklearn.tree import DecisionTreeClassifier

bag_model = BaggingClassifier(base_estimator=DecisionTreeClassifier()
                             ,n_estimators=100
                             ,max_samples=0.8
                             ,oob_score=True
                             ,random_state=10)
bag_model.fit(X_train, y_train)
bag_model.oob_score_

0.4504896626768226