In [81]:
import investpy

In [82]:
# Data manipulation modules
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import yfinance as yf

# Data visualization modules
import matplotlib.pyplot as plt

# Machine Learning modules
from sklearn.ensemble import RandomForestClassifier

# Cross-validation modules 
from sklearn.model_selection import TimeSeriesSplit

# Grid search module
from sklearn.model_selection import GridSearchCV

# Evaluation metrics modules
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [83]:
def get_train_test_data(cripto_series, test_percentage):
    '''This functions creates the training and test data for the stock time series'''
    time_series_length = len(cripto_series)
    test_series = cripto_series.iloc[int(time_series_length*(1-test_percentage)):time_series_length]
    train_series = cripto_series.iloc[:-int(time_series_length*test_percentage)]
    
    return(test_series, train_series)

def get_rate_of_change(cripto_series, variable, n_days):
    '''This function computes several values for the rate of change and add them into a dataset'''
    for day in n_days: 
        variable_name = f"{day}_days_of_change"
        cripto_series[variable_name] = (cripto_series[variable] - cripto_series[variable].shift(day)) / cripto_series[variable].shift(day)
        
def get_moving_average(cripto_series, variable, n_days):
    '''This function computes several values for the moving average and add them into a dataset'''
    for day in n_days:
        variable_name = f"{day}_day_moving_average"
        cripto_series[variable_name] = cripto_series[variable].rolling(day).mean()
        
def get_ratio(cripto_series, variable_1, variable_2):
    '''This function computes the ratio between two variables and add it into a dataset'''
    variable_name = f"{variable_1}_{variable_2}_ratio"
    cripto_series[variable_name] = cripto_series[variable_1] / cripto_series[variable_2]
    
def get_difference(cripto_series, variable_1, variable_2):
    '''This function computes the difference between the values of two variables and add it into a dataset'''
    variable_name = f"{variable_1}_{variable_2}_difference"
    cripto_series[variable_name] = cripto_series[variable_1] - cripto_series[variable_2] 
    
def train_and_predict(model, train_predictors, train_target, test_predictors):
    '''This function fits a model and then make predictions on it for a given amount of data'''
    model.fit(train_predictors, train_target)
    preds = model.predict(test_predictors)
    preds = pd.Series(preds, index=test_series_2.index)
    return preds

In [84]:
stock_series = investpy.stocks.get_stock_historical_data('PETR4', 'brazil', from_date = '01/01/2002', to_date='01/08/2022')

In [85]:
stock_series.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Currency
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2002-01-02,4.73,4.79,4.72,4.73,23979552,BRL
2002-01-03,4.75,4.8,4.74,4.76,39124800,BRL
2002-01-04,4.76,4.76,4.67,4.73,21548832,BRL
2002-01-07,4.71,4.87,4.71,4.83,40547104,BRL
2002-01-08,4.83,4.83,4.77,4.8,34469888,BRL


In [86]:
test_series, train_series = get_train_test_data(stock_series, test_percentage=0.1)

## Applying the new variables

In [87]:
train_series['target']  = np.array(train_series['Close'].shift(-1) > train_series['Close']).astype(int) 

In [88]:
get_rate_of_change(train_series, 'Close', [3, 5, 7, 15])

In [89]:
get_difference(train_series, 'Close', 'Open')
get_difference(train_series, 'High', 'Low')

In [90]:
train_series['low_dif'] = (train_series['Low'] - train_series['Low'].shift(1))
train_series['high_dif'] = train_series['High'] - train_series['High'].shift(1)

In [49]:
get_ratio(train_series, 'Open', 'Close')
get_ratio(train_series, 'High', 'Close')
get_ratio(train_series, 'Low', 'Close')

In [50]:
train_series.drop('Currency', axis=1, inplace=True)

In [51]:
train_series.dropna(axis=0, inplace=True)

In [52]:
test_series['target']  = np.array(test_series['Close'].shift(-1) > test_series['Close']).astype(int) 

In [53]:
get_rate_of_change(test_series, 'Close', [3, 5, 7, 15])

In [54]:
get_difference(test_series, 'Close', 'Open')
get_difference(test_series, 'High', 'Low')

In [55]:
test_series['low_dif'] = (test_series['Low'] - test_series['Low'].shift(1))
test_series['high_dif'] = test_series['High'] - test_series['High'].shift(1)

In [56]:
get_ratio(test_series, 'Open', 'Close')
get_ratio(test_series, 'High', 'Close')
get_ratio(test_series, 'Low', 'Close')

In [57]:
test_series.drop('Currency', axis=1, inplace=True)

In [58]:
test_series.dropna(axis=0, inplace=True)

In [62]:
print(train_series.columns)
print(test_series.columns)

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'low_dif', 'high_dif',
       'target', '3_days_of_change', '5_days_of_change', '7_days_of_change',
       '15_days_of_change', 'Close_Open_difference', 'High_Low_difference',
       'Open_Close_ratio', 'High_Close_ratio', 'Low_Close_ratio'],
      dtype='object')
Index(['Open', 'High', 'Low', 'Close', 'Volume', 'low_dif', 'high_dif',
       'target', '3_days_of_change', '5_days_of_change', '7_days_of_change',
       '15_days_of_change', 'Close_Open_difference', 'High_Low_difference',
       'Open_Close_ratio', 'High_Close_ratio', 'Low_Close_ratio'],
      dtype='object')


In [77]:
predictors = ['Open', 'High', 'Low', 'Close', 'Volume',
       '3_days_of_change', '5_days_of_change', '7_days_of_change',
       '15_days_of_change', 'Close_Open_difference', 'High_Low_difference',
       'low_dif', 'high_dif', 'Open_Close_ratio', 'High_Close_ratio',
       'Low_Close_ratio']

In [78]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit()

from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators': [100, 200, 300, 400, 500], 'max_depth': [400, 500, 600], 'min_samples_split': [100, 200, 300]}
]

forest_class = RandomForestClassifier()

grid_search = GridSearchCV(forest_class, param_grid, cv=tscv,
                           scoring='precision')

grid_search.fit(train_series[predictors], train_series['target'])

GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
             estimator=RandomForestClassifier(),
             param_grid=[{'max_depth': [400, 500, 600],
                          'min_samples_split': [100, 200, 300],
                          'n_estimators': [100, 200, 300, 400, 500]}],
             scoring='precision')

In [79]:
grid_search.best_params_

{'max_depth': 400, 'min_samples_split': 300, 'n_estimators': 100}

In [80]:
model = RandomForestClassifier(n_estimators=300, min_samples_split = 300, max_depth=400)

In [68]:
model.fit(train_series[predictors], train_series['target'])

RandomForestClassifier(max_depth=400, min_samples_split=300, n_estimators=300,
                       random_state=True)

In [69]:
preds = model.predict_proba(test_series[predictors])
preds = preds[:, [1]].reshape(-1)
preds = pd.Series(preds, index=test_series.index)

In [71]:
preds[preds >= 0.54] = 1
preds[preds < 0.54] = 0

In [72]:
precision_score(test_series['target'], preds)

0.6491228070175439

In [73]:
recall_score(test_series['target'], preds)

0.1450980392156863

In [74]:
accuracy_score(test_series['target'], preds)

0.5191919191919192

In [75]:
confusion_matrix(test_series['target'], preds)

array([[220,  20],
       [218,  37]], dtype=int64)

In [76]:
preds

Date
2020-08-04    0.0
2020-08-05    0.0
2020-08-06    0.0
2020-08-07    0.0
2020-08-10    0.0
             ... 
2022-07-26    0.0
2022-07-27    0.0
2022-07-28    0.0
2022-07-29    0.0
2022-08-01    0.0
Length: 495, dtype: float64