In [1]:
import numpy as np
import pandas as pd
import indicators as ind
import matplotlib.pyplot as plt
import pandas_ta as ta
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

### Function that returns technical indicators value

In [18]:
def add_features(df:pd.DataFrame):
    df['sma_16'] = ta.sma(df.close, length=16)
    df['ema_16'] = ta.ema(df.close, length=16)
    a = ta.macd(df.close)
    df.join(a)
    a = ta.adx(df.high,df.low,df.close)
    df.join(a)
    df['rsi_16'] = ta.rsi(df.close, length=16)
    a = ta.bbands(df.close)
    df.join(a)
    df["cci_16"] = ta.cci(df.high, df.low, df.close, length=16)
    df["atr"] = ta.atr(df.high, df.low, df.close, length=16)
    a = ta.stoch(df.high, df.low, df.close)
    df = df.join(a)
    a = ta.stochrsi(df.close, length=16)
    df = df.join(a)
    df["wpr"] = ta.willr(df.high, df.low, df.close, length=16)
    df['returns'] = df.close.pct_change(periods=3).mul(100)
    df['target_signal'] = np.where((df.returns > 2), 1, 0)
    df['target_signal'] = np.where((df.returns < -2), -1, df['target_signal'])
    return df

In [19]:
stock_data = pd.read_csv('../data/S & P 50/AAPL.csv')
stock_data.head()

Unnamed: 0,date,open,high,low,close,volume
0,2018-01-02 00:00:00.000000,42.540001,43.075001,42.314999,43.064999,102223600.0
1,2018-01-03 00:00:00.000000,43.1325,43.637501,42.990002,43.057499,118071600.0
2,2018-01-04 00:00:00.000000,43.134998,43.3675,43.02,43.2575,89738400.0
3,2018-01-05 00:00:00.000000,43.360001,43.842499,43.262501,43.75,94640000.0
4,2018-01-08 00:00:00.000000,43.587502,43.9025,43.482498,43.587502,82271200.0


In [20]:
new_df = add_features(stock_data)

In [21]:
new_df.head(20)

Unnamed: 0,date,open,high,low,close,volume,sma_16,ema_16,rsi_16,cci_16,atr,STOCHk_14_3_3,STOCHd_14_3_3,STOCHRSIk_16_14_3_3,STOCHRSId_16_14_3_3,wpr,returns,target_signal
0,2018-01-02 00:00:00.000000,42.540001,43.075001,42.314999,43.064999,102223600.0,,,,,,,,,,,,0
1,2018-01-03 00:00:00.000000,43.1325,43.637501,42.990002,43.057499,118071600.0,,,,,,,,,,,,0
2,2018-01-04 00:00:00.000000,43.134998,43.3675,43.02,43.2575,89738400.0,,,,,,,,,,,,0
3,2018-01-05 00:00:00.000000,43.360001,43.842499,43.262501,43.75,94640000.0,,,,,,,,,,,1.590622,0
4,2018-01-08 00:00:00.000000,43.587502,43.9025,43.482498,43.587502,82271200.0,,,,,,,,,,,1.230918,0
5,2018-01-09 00:00:00.000000,43.637501,43.764999,43.352501,43.5825,86336000.0,,,,,,,,,,,0.751317,0
6,2018-01-10 00:00:00.000000,43.290001,43.575001,43.25,43.572498,95839600.0,,,,,,,,,,,-0.405718,0
7,2018-01-11 00:00:00.000000,43.647499,43.872501,43.622501,43.82,74670800.0,,,,,,,,,,,0.533406,0
8,2018-01-12 00:00:00.000000,44.044998,44.34,43.912498,44.272499,101672400.0,,,,,,,,,,,1.583201,0
9,2018-01-16 00:00:00.000000,44.474998,44.8475,44.035,44.047501,118263600.0,,,,,,,,,,,1.090142,0


In [22]:
new_df['target_signal'].value_counts()

target_signal
 0    647
 1    356
-1    256
Name: count, dtype: int64

In [23]:
new_df.dropna(inplace=True)

In [24]:
new_df.index = pd.to_datetime(new_df['date'])
new_df.drop('date',axis=1,inplace=True)

## Data Preprocessing

In [25]:
X = new_df.drop(['returns','target_signal'],axis=1)
y = new_df['target_signal']

In [26]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [27]:
X_scaled[:3]

array([[-1.22893437, -1.22751541, -1.22322327, -1.2312537 ,  0.37046771,
        -1.25805351, -1.24588877,  0.05278629,  0.79805383, -1.05950383,
         1.11775013,  0.94423046,  1.32377801,  1.17471435,  0.8763833 ],
       [-1.22469376, -1.22826693, -1.22548232, -1.23549369,  0.63210945,
        -1.25699751, -1.24337339, -0.0477351 ,  0.66434701, -1.06993671,
         0.94831221,  1.05228156,  1.23574398,  1.31972776,  0.77521276],
       [-1.23029354, -1.22917949, -1.22162538, -1.22772041,  0.15334298,
        -1.25511979, -1.24023842,  0.10507618,  0.62432645, -1.08213872,
         0.95156879,  1.03137166,  1.22685795,  1.30894222,  0.96069143]])

## Random Forest Classifier

In [28]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify=y, random_state=10)

In [29]:
X_test.shape[0]/X_train.shape[0]

0.3340587595212187

In [30]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=10)
rf_model.fit(X_train, y_train)

predictions = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7817589576547231


In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

bag_model = BaggingClassifier(base_estimator=LogisticRegression()
                             ,n_estimators=100
                             ,max_samples=0.8
                             ,oob_score=True
                             ,random_state=10)
bag_model.fit(X_train, y_train)
bag_model.oob_score_


0.8204570184983678

In [32]:
predictions = bag_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8143322475570033


In [33]:
import joblib

# Saving the trained model to a file
joblib.dump(bag_model, 'bagging_logistic_regression.pkl')



['bagging_logistic_regression.pkl']

In [52]:
from sklearn.tree import DecisionTreeClassifier

bag_model = BaggingClassifier(base_estimator=DecisionTreeClassifier()
                             ,n_estimators=100
                             ,max_samples=0.8
                             ,oob_score=True
                             ,random_state=10)
bag_model.fit(X_train, y_train)
bag_model.oob_score_

0.7475516866158868