In [15]:
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (7,4.5) # Make the default figures a bit bigger

import numpy as np
import random

#Let's make this notebook reproducible 
np.random.seed(42)
random.seed(42)
import pandas_technical_indicators as ta

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score, precision_score, confusion_matrix, recall_score, accuracy_score
from sklearn.model_selection import train_test_split

In [16]:
df = pd.read_csv("apple1")

In [17]:
apple = df.drop(['Unnamed: 0'], axis = 1)

In [18]:

def get_exp_preprocessing(df, alpha=0.9):
    edata = df.ewm(alpha=alpha).mean()    
    return edata

In [19]:
saapl = get_exp_preprocessing(apple)
saapl.head() #saapl stands for smoothed aapl

Unnamed: 0,Open,Low,High,Close,Volume
0,30.49,30.34,30.642857,30.572857,123432400.0
1,30.641948,30.452986,30.784415,30.620908,148017700.0
2,30.627322,30.141416,30.750836,30.18637,139028800.0
3,30.287698,29.891974,30.332185,30.093199,121255600.0
4,30.067338,29.868341,30.290362,30.263894,112837900.0


In [20]:
#Feature Extraction based on Pandas Time Series
def feature_extraction(data):
    for x in [5, 14, 26, 44, 66]:
        data = ta.relative_strength_index(data, n=x)
        data = ta.stochastic_oscillator_d(data, n=x)
        data = ta.accumulation_distribution(data, n=x)
        data = ta.average_true_range(data, n=x)
        data = ta.momentum(data, n=x)
        data = ta.money_flow_index(data, n=x)
        data = ta.rate_of_change(data, n=x)
        data = ta.on_balance_volume(data, n=x)
        data = ta.commodity_channel_index(data, n=x)
        data = ta.ease_of_movement(data, n=x)
        data = ta.trix(data, n=x)
        data = ta.vortex_indicator(data, n=x)
    
    data['ema50'] = data['Close'] / data['Close'].ewm(50).mean()
    data['ema21'] = data['Close'] / data['Close'].ewm(21).mean()
    data['ema14'] = data['Close'] / data['Close'].ewm(14).mean()
    data['ema5'] = data['Close'] / data['Close'].ewm(5).mean()
        
    #Williams %R is missing
    data = ta.macd(data, n_fast=12, n_slow=26)
    
    del(data['Open'])
    del(data['High'])
    del(data['Low'])
    del(data['Volume'])
    
    return data
   
def compute_prediction_int(df, n):
    pred = (df.shift(-n)['Close'] >= df['Close'])
    pred = pred.iloc[:-n]
    return pred.astype(int)

def prepare_data(df, horizon):
    data = feature_extraction(df).dropna().iloc[:-horizon]
    data['pred'] = compute_prediction_int(data, n=horizon)
    del(data['Close'])
    return data.dropna()

In [7]:
#Create the feature
data = prepare_data(saapl, 10)

In [8]:
y = data['pred']

#remove the output from the input
features = [x for x in data.columns if x not in ['gain', 'pred']]
X = data[features]

In [9]:
# Based on train test split (note )
train_size = 2*len(X) // 3

X_train = X[:train_size]
X_test = X[train_size:]
y_train = y[:train_size]
y_test = y[train_size:]

In [10]:

print('len X_train', len(X_train))
print('len y_train', len(y_train))
print('len X_test', len(X_test))
print('len y_test', len(y_test))

len X_train 1572
len y_train 1572
len X_test 787
len y_test 787


In [11]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

## xgb = XGBClassifier
rf = xgb(n_jobs=-1, n_estimators=10,  scale_pos_weight=0, random_state=42)
rf.fit(X_train, y_train.values.ravel());

In [12]:
 xgb = XGBClassifier
rf = xgb(n_jobs=-1, n_estimators=25,  scale_pos_weight=10, random_state=42)
rf.fit(X, y.values.ravel());

In [13]:
# Evaluate Predictions
pred = rf.predict(X_test)
precision = precision_score(y_pred=pred, y_true=y_test)
recall = recall_score(y_pred=pred, y_true=y_test)
f1 = f1_score(y_pred=pred, y_true=y_test)
accuracy = accuracy_score(y_pred=pred, y_true=y_test)
confusion = confusion_matrix(y_pred=pred, y_true=y_test)
classication_report = classification_report(y_pred=pred, y_true=y_test)
print('precision: {0:1.2f}, recall: {1:1.2f}, f1: {2:1.2f}, accuracy: {3:1.2f}'.format(precision, recall, f1, accuracy))
print('Confusion Matrix')
print(classication_report)                                         
print(confusion)

precision: 0.71, recall: 1.00, f1: 0.83, accuracy: 0.73
Confusion Matrix
              precision    recall  f1-score   support

         0.0       1.00      0.23      0.37       274
         1.0       0.71      1.00      0.83       513

    accuracy                           0.73       787
   macro avg       0.85      0.61      0.60       787
weighted avg       0.81      0.73      0.67       787

[[ 63 211]
 [  0 513]]


In [14]:
data = prepare_data(saapl, 5)