In [67]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import accuracy_score, classification_report
pd.set_option('display.max_columns',None)
import joblib

In [68]:
# Location of the parquet file holding the price history for all tickers.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
parquet_path = 'C:/Users/Samsung/Documents/GitHub-Datas/TCC_Machine_Learning/Dados_bolsa_interpolar.parquet'
price_data = pd.read_parquet(parquet_path)
price_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker
0,2000-01-05,0.520882,0.520882,0.520882,0.520882,0.291125,985,ABEV3.SA
1,2000-01-06,0.494478,0.494478,0.494478,0.494478,0.276367,227,ABEV3.SA
2,2000-01-12,0.481293,0.481293,0.481293,0.481293,0.268998,1137,ABEV3.SA
3,2000-01-13,0.484589,0.484589,0.484589,0.484589,0.27084,606,ABEV3.SA
4,2000-01-14,0.494478,0.494478,0.494478,0.494478,0.276367,6445,ABEV3.SA


In [69]:
# Initial setup: parse the dates and order the rows by ticker, then by date,
# so every per-ticker window below sees its rows in chronological order.
price_data['Date'] = pd.to_datetime(price_data['Date'])
price_data = price_data.sort_values(by=['Ticker', 'Date'])

# Row-to-row change in the closing price, computed inside each ticker so the
# first row of a ticker is NaN instead of a difference against the previous
# ticker's last close.
price_data['change_in_price'] = price_data.groupby('Ticker')['Close'].diff()

# Exponential smoothing
def exponential_smoothing(data, alpha):
    """Return the exponentially smoothed version of a 1-D sequence.

    The first output equals the first input; every later output is
    ``alpha * data[t] + (1 - alpha) * smoothed[t - 1]``.

    Parameters
    ----------
    data : sequence of numbers (list, NumPy array or pandas Series)
        Values to smooth. They are read positionally, so a Series whose
        index does not start at 0 is handled as well.
    alpha : float
        Weight given to the current observation.

    Returns
    -------
    numpy.ndarray
        Float array with the same length as ``data``; empty when ``data``
        is empty.
    """
    values = np.asarray(data, dtype=float)
    smoothed = np.zeros(len(values))
    if len(values) == 0:
        # Nothing to seed the recurrence with.
        return smoothed

    smoothed[0] = values[0]
    for t in range(1, len(values)):
        smoothed[t] = alpha * values[t] + (1 - alpha) * smoothed[t - 1]
    return smoothed

# Target: direction of the move d rows ahead
def calculate_target(data, d):
    """Return the sign of ``data`` shifted ``d`` rows back minus ``data``.

    The result is +1, 0 or -1 per row; rows whose shifted value does not
    exist (the last ``d`` rows) come out as NaN.
    """
    future_values = data.shift(-d)
    move = future_values - data
    return np.sign(move)

# Target with "no change" folded into the negative class
def calculate_target_verify(data, d):
    """Return the sign of the move ``d`` rows ahead, mapping 0 to -1.

    Rows whose shifted value does not exist (the last ``d`` rows) stay NaN.
    """
    direction = np.sign(data.shift(-d) - data)
    # Zeros become -1; NaN compares unequal to 0 and is left as is.
    return direction.mask(direction == 0, -1)

def obv(group):
    """Compute the On Balance Volume for one group of rows.

    The running total starts at 0. A row adds its ``Volume`` when
    ``Smoothed_Close`` rose relative to the previous row, subtracts it when
    ``Smoothed_Close`` fell, and leaves the total unchanged otherwise
    (including the first row, whose change is NaN).

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain the columns ``'Volume'`` and ``'Smoothed_Close'``.

    Returns
    -------
    pandas.Series
        Running total aligned to ``group.index``.
    """
    volume = group['Volume'].to_numpy()
    change = group['Smoothed_Close'].diff().to_numpy()

    # Comparisons with NaN are False, so a NaN change falls through to the
    # default of 0 and contributes nothing.
    signed_volume = np.select([change > 0, change < 0], [volume, -volume], default=0)

    # Return as a pandas Series on the group's own index
    return pd.Series(np.cumsum(signed_volume), index=group.index)


# Additional parameters
n = 16  # span of the exponentially weighted means used for the RSI
d = 1   # look-ahead, in rows, of the 'Prediction' target
e = 14  # look-back, in rows, of the price rate of change
# Window sizes t to iterate over for the rolling low/high indicators
t_list = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
alpha = 0.01  # single smoothing factor used for 'Smoothed_Close'


# Exponentially smoothed close, computed independently for each ticker
price_data['Smoothed_Close'] = price_data.groupby('Ticker')['Close'].transform(
    lambda x: exponential_smoothing(x.values, alpha)
)

# Target: direction of the smoothed close d rows ahead (d = 1 here),
# with "no change" counted as -1
price_data['Prediction'] = price_data.groupby('Ticker')['Smoothed_Close'].transform(
    lambda x: calculate_target_verify(x, d)
)


for t in t_list:
    # Rolling minimum of Low and maximum of High over the last t rows, per ticker
    low_t = price_data.groupby('Ticker')['Low'].transform(lambda x: x.rolling(window=t).min())
    high_t = price_data.groupby('Ticker')['High'].transform(lambda x: x.rolling(window=t).max())

    # A window whose high equals its low has no range. Use NaN there so the
    # ratios below become NaN rather than +/-inf (dropna() removes NaN but
    # not inf).
    price_range = (high_t - low_t).replace(0, np.nan)

    # Stochastic oscillator %K
    # NOTE(review): Smoothed_Close is compared against the range of the raw
    # Low/High, so the value can fall outside 0-100 (the displayed tail shows
    # k_percent_10 above 200) — confirm this is intended.
    k_percent = 100 * ((price_data['Smoothed_Close'] - low_t) / price_range)

    # Williams %R
    r_percent = ((high_t - price_data['Smoothed_Close']) / price_range) * (- 100)

    # Store the results under column names that carry the window size
    price_data[f'low_{t}'] = low_t
    price_data[f'high_{t}'] = high_t
    price_data[f'k_percent_{t}'] = k_percent
    price_data[f'r_percent_{t}'] = r_percent

# Up days and down days
up_df = price_data[['Ticker', 'change_in_price']].copy()
down_df = price_data[['Ticker', 'change_in_price']].copy()

# Up days keep the positive changes; negative changes become 0 (NaN stays NaN).
up_df['change_in_price'] = up_df['change_in_price'].clip(lower=0)

# Down days keep the negative changes, in absolute terms; positive changes become 0.
down_df['change_in_price'] = down_df['change_in_price'].clip(upper=0).abs()

# Exponentially weighted mean of gains and losses, per ticker
ewma_up = up_df.groupby('Ticker')['change_in_price'].transform(lambda x: x.ewm(span = n).mean())
ewma_down = down_df.groupby('Ticker')['change_in_price'].transform(lambda x: x.ewm(span = n).mean())

relative_strength = ewma_up / ewma_down

# Relative Strength Index
relative_strength_index = 100.0 - (100.0 / (1.0 + relative_strength))

price_data['down_days'] = down_df['change_in_price']
price_data['up_days'] = up_df['change_in_price']
price_data['RSI'] = relative_strength_index

# Exponential smoothing with alpha = 1: the weight on the previous smoothed
# value is (1 - alpha) = 0.
alpha_verify = 1
close_by_ticker = price_data.groupby('Ticker')['Close']
price_data['Smoothed_Close_1'] = close_by_ticker.transform(
    lambda closes: exponential_smoothing(closes.values, alpha_verify)
)

# Target for d = 10 rows ahead
d = 10
smoothed_1_by_ticker = price_data.groupby('Ticker')['Smoothed_Close_1']
price_data['Verify'] = smoothed_1_by_ticker.transform(
    lambda series: calculate_target_verify(series, d)
)

# Price Rate of Change over e rows, per ticker
price_data['Price_Rate_Of_Change'] = price_data.groupby('Ticker')['Smoothed_Close'].transform(lambda x: x.pct_change(periods = e))

# MACD: difference between the 12-span and 26-span EMAs of the smoothed close, per ticker
ema_26 = price_data.groupby('Ticker')['Smoothed_Close'].transform(lambda x: x.ewm(span = 26).mean())
ema_12 = price_data.groupby('Ticker')['Smoothed_Close'].transform(lambda x: x.ewm(span = 12).mean())
macd = ema_12 - ema_26

# Signal line: 9-span EMA of the MACD. It is grouped by ticker like the other
# indicators so that one ticker's history does not feed into the next one's.
ema_9_macd = macd.groupby(price_data['Ticker']).transform(lambda x: x.ewm(span = 9).mean())

price_data['MACD'] = macd
price_data['MACD_EMA'] = ema_9_macd

# Apply obv to each ticker. Only the two columns obv reads are passed, so the
# grouping column itself is not part of the frame handed to the function.
obv_groups = price_data.groupby('Ticker')[['Smoothed_Close', 'Volume']].apply(obv)
# Drop the outer index level so the values align with price_data's own index.
price_data['On Balance Volume'] = obv_groups.reset_index(level=0, drop=True)


# Close shifted back by d rows within each ticker (d is 10 at this point)
price_data['Close_10'] = price_data.groupby('Ticker')['Close'].shift(-d)

# Drop every row that has a missing value in any column
price_data = price_data.dropna()

price_data.tail()


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker,change_in_price,Smoothed_Close,Prediction,low_10,high_10,k_percent_10,r_percent_10,low_11,high_11,k_percent_11,r_percent_11,low_12,high_12,k_percent_12,r_percent_12,low_13,high_13,k_percent_13,r_percent_13,low_14,high_14,k_percent_14,r_percent_14,low_15,high_15,k_percent_15,r_percent_15,low_16,high_16,k_percent_16,r_percent_16,low_17,high_17,k_percent_17,r_percent_17,low_18,high_18,k_percent_18,r_percent_18,low_19,high_19,k_percent_19,r_percent_19,low_20,high_20,k_percent_20,r_percent_20,down_days,up_days,RSI,Smoothed_Close_1,Verify,Price_Rate_Of_Change,MACD,MACD_EMA,On Balance Volume,Close_10
11769,2025-01-07,52.630001,52.889999,51.939999,52.049999,52.049999,22070200,VALE3.SA,-0.510002,60.726555,-1.0,51.939999,55.400002,253.94649,153.94649,51.939999,55.869999,223.576469,123.576469,51.939999,56.400002,197.007852,97.007852,51.939999,56.540001,191.01199,91.01199,51.939999,56.709999,184.204509,84.204509,51.939999,57.68,153.075843,53.075843,51.939999,59.59,114.856917,14.856917,51.939999,59.939999,109.831949,9.831949,51.939999,60.189999,106.503708,6.503708,51.939999,60.189999,106.503708,6.503708,51.939999,60.189999,106.503708,6.503708,0.510002,0.0,21.550853,52.049999,1.0,-0.015481,-0.343412,-0.305602,6199711036,54.02
11770,2025-01-08,52.029999,52.290001,51.43,51.549999,51.549999,24423800,VALE3.SA,-0.5,60.634789,-1.0,51.43,55.400002,231.858586,131.858586,51.43,55.400002,231.858586,131.858586,51.43,55.869999,207.315125,107.315125,51.43,56.400002,185.206971,85.206971,51.43,56.540001,180.13283,80.13283,51.43,56.709999,174.33316,74.33316,51.43,57.68,147.276619,47.276619,51.43,59.59,112.803785,12.803785,51.43,59.939999,108.164401,8.164401,51.43,60.189999,105.077517,5.077517,51.43,60.189999,105.077517,5.077517,0.5,0.0,19.471386,51.549999,1.0,-0.01604,-0.355839,-0.31565,6175287236,52.66
11771,2025-01-09,52.049999,52.25,51.139999,51.23,51.23,15865200,VALE3.SA,-0.32,60.540741,-1.0,51.139999,55.400002,220.674578,120.674578,51.139999,55.400002,220.674578,120.674578,51.139999,55.400002,220.674578,120.674578,51.139999,55.869999,198.747201,98.747201,51.139999,56.400002,178.721253,78.721253,51.139999,56.540001,174.08776,74.08776,51.139999,56.709999,168.774546,68.774546,51.139999,57.68,143.742208,43.742208,51.139999,59.59,111.251371,11.251371,51.139999,59.939999,106.82662,6.82662,51.139999,60.189999,103.875608,3.875608,0.32,0.0,18.197758,51.23,1.0,-0.016641,-0.369022,-0.326324,6159422036,52.32
11772,2025-01-10,51.540001,52.310001,51.23,51.52,51.52,25833900,VALE3.SA,0.290001,60.450534,-1.0,51.139999,55.400002,218.557034,118.557034,51.139999,55.400002,218.557034,118.557034,51.139999,55.400002,218.557034,118.557034,51.139999,55.400002,218.557034,118.557034,51.139999,55.869999,196.840068,96.840068,51.139999,56.400002,177.006284,77.006284,51.139999,56.540001,172.417253,72.417253,51.139999,56.709999,167.155024,67.155024,51.139999,57.68,142.36289,42.36289,51.139999,59.59,110.183828,10.183828,51.139999,59.939999,105.801535,5.801535,0.0,0.290001,23.347435,51.52,1.0,-0.017236,-0.382341,-0.337527,6133588136,53.029999
11773,2025-01-13,52.0,52.310001,50.869999,51.509998,51.509998,30845300,VALE3.SA,-0.010002,60.361128,-1.0,50.869999,55.189999,219.702085,119.702085,50.869999,55.400002,209.517086,109.517086,50.869999,55.400002,209.517086,109.517086,50.869999,55.400002,209.517086,109.517086,50.869999,55.400002,209.517086,109.517086,50.869999,55.869999,189.822588,89.822588,50.869999,56.400002,171.629746,71.629746,50.869999,56.540001,167.391995,67.391995,50.869999,56.709999,162.519335,62.519335,50.869999,57.68,139.370448,39.370448,50.869999,59.59,108.843212,8.843212,0.010002,0.0,23.290124,51.509998,1.0,-0.017619,-0.395551,-0.349132,6102742836,52.91


In [70]:
# Keep only the rows of the tickers listed here for the modelling step.
selected_tickers = ['ABEV3.SA']
is_selected = price_data['Ticker'].isin(selected_tickers)
price_data = price_data.loc[is_selected]
price_data.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker,change_in_price,Smoothed_Close,Prediction,low_10,high_10,k_percent_10,r_percent_10,low_11,high_11,k_percent_11,r_percent_11,low_12,high_12,k_percent_12,r_percent_12,low_13,high_13,k_percent_13,r_percent_13,low_14,high_14,k_percent_14,r_percent_14,low_15,high_15,k_percent_15,r_percent_15,low_16,high_16,k_percent_16,r_percent_16,low_17,high_17,k_percent_17,r_percent_17,low_18,high_18,k_percent_18,r_percent_18,low_19,high_19,k_percent_19,r_percent_19,low_20,high_20,k_percent_20,r_percent_20,down_days,up_days,RSI,Smoothed_Close_1,Verify,Price_Rate_Of_Change,MACD,MACD_EMA,On Balance Volume,Close_10
5736,2025-01-07,11.3,11.58,11.27,11.54,11.54,34905200,ABEV3.SA,0.25,12.685903,-1.0,11.21,12.91,86.817804,-13.182196,11.21,13.08,78.925272,-21.074728,11.21,13.19,74.540547,-25.459453,11.21,13.42,66.782917,-33.217083,11.21,13.42,66.782917,-33.217083,11.21,13.58,62.27437,-37.72563,11.21,14.36,46.854053,-53.145947,11.21,14.5,44.860259,-55.139741,11.21,14.5,44.860259,-55.139741,11.21,14.53,44.454898,-55.545102,11.21,14.53,44.454898,-55.545102,0.0,0.25,28.908907,11.54,-1.0,-0.006103,0.004685,0.016018,-6146540023,11.25
5737,2025-01-08,11.48,11.53,11.28,11.42,11.42,39248800,ABEV3.SA,-0.12,12.673244,-1.0,11.21,12.34,129.490559,29.490559,11.21,12.91,86.073155,-13.926845,11.21,13.08,78.248319,-21.751681,11.21,13.19,73.901202,-26.098798,11.21,13.42,66.210111,-33.789889,11.21,13.42,66.210111,-33.789889,11.21,13.58,61.740234,-38.259766,11.21,14.36,46.452179,-53.547821,11.21,14.5,44.475486,-55.524514,11.21,14.5,44.475486,-55.524514,11.21,14.53,44.073603,-55.926397,0.12,0.0,26.790787,11.42,-1.0,-0.007341,0.000115,0.012837,-6185788823,11.02
5738,2025-01-09,11.43,11.51,11.3,11.4,11.4,24470100,ABEV3.SA,-0.02,12.660511,-1.0,11.21,12.34,128.363795,28.363795,11.21,12.34,128.363795,28.363795,11.21,12.91,85.324188,-14.675812,11.21,13.08,77.56744,-22.43256,11.21,13.19,73.25815,-26.74185,11.21,13.42,65.633982,-34.366018,11.21,13.42,65.633982,-34.366018,11.21,13.58,61.203,-38.797,11.21,14.36,46.047975,-53.952025,11.21,14.5,44.088482,-55.911518,11.21,14.5,44.088482,-55.911518,0.02,0.0,26.425065,11.4,-1.0,-0.008395,-0.004483,0.009373,-6210258923,11.05
5739,2025-01-10,11.4,11.43,11.08,11.16,11.16,31277300,ABEV3.SA,-0.24,12.645506,-1.0,11.08,12.24,134.957434,34.957434,11.08,12.34,124.246488,24.246488,11.08,12.34,124.246488,24.246488,11.08,12.91,85.546781,-14.453219,11.08,13.08,78.275302,-21.724698,11.08,13.19,74.194611,-25.805389,11.08,13.42,66.901963,-33.098037,11.08,13.42,66.901963,-33.098037,11.08,13.58,62.620241,-37.379759,11.08,14.36,47.728846,-52.271154,11.08,14.5,45.775029,-54.224971,0.24,0.0,22.28741,11.16,-1.0,-0.009828,-0.009231,0.005653,-6241536223,10.87
5740,2025-01-13,11.18,11.31,11.12,11.24,11.24,28945300,ABEV3.SA,0.08,12.631451,-1.0,11.08,12.19,139.770401,39.770401,11.08,12.24,133.745791,33.745791,11.08,12.34,123.131007,23.131007,11.08,12.34,123.131007,23.131007,11.08,12.91,84.778745,-15.221255,11.08,13.08,77.572549,-22.427451,11.08,13.19,73.528494,-26.471506,11.08,13.42,66.301319,-33.698681,11.08,13.42,66.301319,-33.698681,11.08,13.58,62.058039,-37.941961,11.08,14.36,47.300338,-52.699662,0.0,0.08,26.627587,11.24,-1.0,-0.01092,-0.013967,0.001729,-6270481523,10.91


In [71]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
import winsound

# Number of splits for the cross-validation
n_splits = 5

# Accumulators shared by every window size t and every fold
all_predictions = []
all_verify_values = []
all_actual_values = []
all_accuracy_scores_ytest = []
all_accuracy_scores_verify = []

# One (t, mean accuracy vs y_test, mean accuracy vs Verify) tuple per window size
mean_accuracy_by_t = []

# The splitter is configured by n_splits only, so one instance serves every t
tscv = TimeSeriesSplit(n_splits=n_splits)

# NOTE(review): 'Prediction' and 'Verify' are built from rows ahead of the
# current one and no gap is left between the train and test indices —
# consider TimeSeriesSplit(gap=...) if that overlap matters.

# Run the time-series cross-validation once per window size t
for t in t_list:
    # Feature columns for this t; only the %K and %R columns depend on t
    feature_columns = ['RSI',
                       f'k_percent_{t}',
                       f'r_percent_{t}',
                       'Price_Rate_Of_Change',
                       'MACD',
                       'MACD_EMA',
                       'On Balance Volume']

    # Drop rows with NaN in any selected column, then split into features (X),
    # training target (y) and the second target used only for checking (verify)
    data = price_data[feature_columns + ['Prediction', 'Verify']].dropna()
    X = data[feature_columns].astype(np.float32)
    y = data['Prediction']
    verify = data['Verify']

    # Per-fold accuracies for this t
    accuracy_scores_ytest_fold = []
    accuracy_scores_verify_fold = []

    print(f't: {t}')
    for fold, (train_index, test_index) in enumerate(tscv.split(X), start=1):
        # Split into train and test by position
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        y_verify = verify.iloc[test_index]

        # Random forest; n_jobs=-1 fits the trees in parallel, random_state
        # keeps the run reproducible
        rand_frst_clf = RandomForestClassifier(n_estimators=100, criterion="gini", random_state=0,
                                               oob_score=True, n_jobs=-1)

        # Fit on the training rows
        rand_frst_clf.fit(X_train, y_train)

        # Predict the test rows
        y_pred = rand_frst_clf.predict(X_test)

        # Keep predictions and both sets of reference labels for later analysis
        all_predictions.extend(y_pred)
        all_verify_values.extend(y_verify.values)
        all_actual_values.extend(y_test.values)

        # Accuracy against the training target and against the Verify target
        accuracy_ytest = accuracy_score(y_test, y_pred) * 100.0
        accuracy_verify = accuracy_score(y_verify, y_pred) * 100.0

        # Record per fold (for this t) and overall (across every t)
        accuracy_scores_ytest_fold.append(accuracy_ytest)
        accuracy_scores_verify_fold.append(accuracy_verify)
        all_accuracy_scores_ytest.append(accuracy_ytest)
        all_accuracy_scores_verify.append(accuracy_verify)

        # Per-fold report
        print(f'Fold {fold}:')
        print(f' - Correct Prediction (y_test): {accuracy_ytest:.2f}%')
        print(f' - Correct Prediction (Verify): {accuracy_verify:.2f}%')

    # Mean accuracy over the folds of this t
    mean_accuracy_ytest = np.mean(accuracy_scores_ytest_fold)
    mean_accuracy_verify = np.mean(accuracy_scores_verify_fold)
    mean_accuracy_by_t.append((t, mean_accuracy_ytest, mean_accuracy_verify))

    print(f'Média de acurácia (y_test) para t {t}: {mean_accuracy_ytest:.2f}%')
    print(f'Média de acurácia (Verify) para t {t}: {mean_accuracy_verify:.2f}%')
    print('---')

# Overall summary of the means
print("Resumo geral das médias:")
for t, acc_ytest, acc_verify in mean_accuracy_by_t:
    print(f't: {t} - Média Acurácia (y_test): {acc_ytest:.2f}% - Média Acurácia (Verify): {acc_verify:.2f}%')

# Plot the mean accuracies per t (plt comes from the notebook's first import cell)
n_values, acc_ytest_values, acc_verify_values = zip(*mean_accuracy_by_t)

fig, ax = plt.subplots()
ax.plot(n_values, acc_ytest_values, label='Acurácia (y_test)', marker='o')
ax.plot(n_values, acc_verify_values, label='Acurácia (Verify)', marker='x')
ax.set_title("Média de Acurácia para Diferentes Valores de t")
ax.set_xlabel('t')
ax.set_ylabel('Acurácia (%)')
ax.legend()
ax.grid(True)
plt.show()

# Audible signal that the run has finished
winsound.Beep(440, 300)

t: 10
Fold 1:
 - Correct Prediction (y_test): 94.02%
 - Correct Prediction (Verify): 55.82%
Fold 2:
 - Correct Prediction (y_test): 99.58%
 - Correct Prediction (Verify): 61.80%
Fold 3:
 - Correct Prediction (y_test): 86.57%
 - Correct Prediction (Verify): 46.27%
Fold 4:
 - Correct Prediction (y_test): 92.55%
 - Correct Prediction (Verify): 48.79%
Fold 5:
 - Correct Prediction (y_test): 92.13%
 - Correct Prediction (Verify): 47.11%
Média de acurácia (y_test) para t 10: 92.97%
Média de acurácia (Verify) para t 10: 51.96%
---
t: 11
Fold 1:
 - Correct Prediction (y_test): 92.03%
 - Correct Prediction (Verify): 53.83%
Fold 2:
 - Correct Prediction (y_test): 99.58%
 - Correct Prediction (Verify): 62.01%


KeyboardInterrupt: 