# 🎯 Problema de Negócio:

* A Vale S.A. (VALE3) é uma das principais empresas de mineração listadas na B3, com forte impacto no portfólio de investidores institucionais e pessoa física no Brasil. Para um gestor de investimentos ou analista de mercado, antecipar a trajetória do preço de fechamento (Close) e entender os padrões de volume de negociação é crucial para:

    1. Tomar decisões de compra/venda com melhor relação risco-retorno;

    2. Ajustar a alocação de ativos de curto a médio prazo;

    3. Desenvolver estratégias de trading baseadas em sinais estatísticos e de machine learning;

    4. Gerenciar o risco por meio de previsões confiáveis e métricas de incerteza.

* Objetivo Geral: Desenvolver um modelo preditivo de séries temporais para o preço de fechamento diário de VALE3, de modo a gerar forecasts de 7, 15 e 30 a dias futuros e embasar decisões de investimento.

* Importando as bibliotecas necessárias:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import VarianceThreshold
from xgboost import XGBRegressor
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
import warnings
warnings.filterwarnings("ignore")

# 1. Coletando e Tratando os dados.

In [2]:
# Coletando dados da VALE3 e formatando o DataFrame:
vale = yf.download('VALE3.SA', start='2022-01-01', end='2025-06-01', multi_level_index=False)
vale = vale[['Close', 'Open', 'Volume']]
vale.reset_index(inplace=True)
vale['Date'] = pd.to_datetime(vale['Date'])
vale.set_index('Date', inplace=True)

# Criando um DataFrame para armazenar os dados do minério de ferro:
minerio_ferro = pd.read_csv(
    "Dados Históricos - Minério de ferro refinado 62% Fe CFR Futuros.csv"
    )
minerio_ferro.reset_index(inplace=True)
minerio_ferro['Data'] = pd.to_datetime(minerio_ferro['Data'], dayfirst=True)
minerio_ferro.set_index('Data', inplace=True)
minerio_ferro = minerio_ferro[['Último','Abertura', 'Var%']]
# invertendo a ordem das datas:
minerio_ferro = minerio_ferro.iloc[::-1]
minerio_ferro.rename(columns={
    'Último': 'Close',
    'Abertura': 'Open',
    'Var%': 'Variação'
}, inplace=True)

# Unindo os DataFrames de VALE3 e minério de ferro:
df = pd.merge(vale, minerio_ferro, left_index=True, right_index=True, suffixes=('_VALE3', '_Minerio'))
df.rename(columns={
    'Close_VALE3': 'Close_VALE3',
    'Open_VALE3': 'Open_VALE3',
    'Volume': 'Volume_VALE3',
    'Close_Minerio': 'Close_Minerio',
    'Open_Minerio': 'Open_Minerio',
    'Variação': 'Variação_Minerio'
}, inplace=True)

# Variação percentual do preço de fechamento da VALE3:
df['Variação_VALE3'] = df['Close_VALE3'].pct_change() * 100
# Trocando nan por 0%:
df['Variação_VALE3'].fillna(0, inplace=True)

# Vamos transformar as colunas Close_Minerio, Open_Minerio e Variação_Minerio em float:
df['Close_Minerio'] = df['Close_Minerio'].str.replace(',', '.', regex=False).astype(float)
df['Open_Minerio'] = df['Open_Minerio'].str.replace(',', '.', regex=False).astype(float)
df['Variação_Minerio'] = df['Variação_Minerio'].str.replace('%', '', regex=False).str.replace(',', '.', regex=False).astype(float)

# Variação percentual do preço de fechamento do minério de ferro:
df['Variação_Minerio'] = df['Close_Minerio'].pct_change() * 100 
# Trocando nan por 7,02%:
df['Variação_Minerio'].fillna(7.02, inplace=True)

df

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Close_VALE3,Open_VALE3,Volume_VALE3,Close_Minerio,Open_Minerio,Variação_Minerio,Variação_VALE3
2022-01-03,59.548790,60.312236,18557200,120.40,120.40,7.020000,0.000000
2022-01-04,58.846432,59.938160,18178700,120.91,120.91,0.423588,-1.179467
2022-01-05,59.403736,59.067824,22039000,124.14,124.14,2.671408,0.947049
2022-01-06,60.602352,60.037404,22044100,125.94,125.94,1.449976,2.017745
2022-01-07,64.129471,61.381066,35213100,126.21,126.21,0.214388,5.820102
...,...,...,...,...,...,...,...
2025-05-23,54.320000,54.000000,14747300,99.81,99.81,-0.080088,0.165960
2025-05-27,53.840000,54.000000,17943000,99.48,99.48,-0.330628,-0.883652
2025-05-28,53.410000,53.740002,17700600,99.39,99.39,-0.090470,-0.798663
2025-05-29,53.450001,53.730000,10246100,99.27,99.27,-0.120736,0.074894


# 2. Adicionando fetures temporais ao nosso DataFrame.

In [3]:
# Vamos adicionar as fetures de dias, semanas, meses, anos e calcular o seno e o cosseno do mês para capturar sazonalidade cíclica:
df['Dia'] = df.index.day
df['Semana'] = df.index.weekday
df['Mês'] = df.index.month
df['Ano'] = df.index.year

df['Sin_Mês'] = np.sin(2 * np.pi * df['Mês'] / 12)
df['Cos_Mês'] = np.cos(2 * np.pi * df['Mês'] / 12)

# Vamos adicionar médias móveis de 20 e 200 dias no preço de fechamento da VALE3 e do minério de ferro:
df['MM_20D_VALE3'] = df['Close_VALE3'].rolling(window=20, min_periods=1).mean()
df['MM_200D_VALE3'] = df['Close_VALE3'].rolling(window=200, min_periods=1).mean()
df['MM_20D_MF'] = df['Close_Minerio'].rolling(window=20, min_periods=1).mean()
df['MM_200D_MF'] = df['Close_Minerio'].rolling(window=200, min_periods=1).mean()

# Coluna do preço anterior(D-1):
df['Close_VALE3_D-1'] = df['Close_VALE3'].shift(1)
df['Close_VALE3_D-1'].fillna(0, inplace=True)

df

Unnamed: 0,Close_VALE3,Open_VALE3,Volume_VALE3,Close_Minerio,Open_Minerio,Variação_Minerio,Variação_VALE3,Dia,Semana,Mês,Ano,Sin_Mês,Cos_Mês,MM_20D_VALE3,MM_200D_VALE3,MM_20D_MF,MM_200D_MF,Close_VALE3_D-1
2022-01-03,59.548790,60.312236,18557200,120.40,120.40,7.020000,0.000000,3,0,1,2022,0.5,0.866025,59.548790,59.548790,120.400000,120.400000,0.000000
2022-01-04,58.846432,59.938160,18178700,120.91,120.91,0.423588,-1.179467,4,1,1,2022,0.5,0.866025,59.197611,59.197611,120.655000,120.655000,59.548790
2022-01-05,59.403736,59.067824,22039000,124.14,124.14,2.671408,0.947049,5,2,1,2022,0.5,0.866025,59.266319,59.266319,121.816667,121.816667,58.846432
2022-01-06,60.602352,60.037404,22044100,125.94,125.94,1.449976,2.017745,6,3,1,2022,0.5,0.866025,59.600327,59.600327,122.847500,122.847500,59.403736
2022-01-07,64.129471,61.381066,35213100,126.21,126.21,0.214388,5.820102,7,4,1,2022,0.5,0.866025,60.506156,60.506156,123.520000,123.520000,60.602352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-23,54.320000,54.000000,14747300,99.81,99.81,-0.080088,0.165960,23,4,5,2025,0.5,-0.866025,54.055500,54.977746,99.505000,101.648300,54.230000
2025-05-27,53.840000,54.000000,17943000,99.48,99.48,-0.330628,-0.883652,27,1,5,2025,0.5,-0.866025,54.055000,54.964091,99.483000,101.611400,54.320000
2025-05-28,53.410000,53.740002,17700600,99.39,99.39,-0.090470,-0.798663,28,2,5,2025,0.5,-0.866025,54.023500,54.947780,99.457000,101.575400,53.840000
2025-05-29,53.450001,53.730000,10246100,99.27,99.27,-0.120736,0.074894,29,3,5,2025,0.5,-0.866025,54.004000,54.937926,99.427500,101.540500,53.410000


# 3. Decomposição da série temporal (tendência, sazonalidade e resíduo).

In [4]:
decomposicao = seasonal_decompose(df['Close_VALE3'], model='multiplicative', period=30)
df['Tendencia'] = decomposicao.trend
df['Sazonalidade'] = decomposicao.seasonal
df['Residuo'] = decomposicao.resid

# Preencher valores ausentes resultantes da decomposição:
df.fillna(method='bfill', inplace=True)
df.fillna(method='ffill', inplace=True)

# 4. Lags e Rolling Statistics.

* Vamos criar colunas que trazem o preço de fechamento e o retorno percentual de ontem, anteontem, etc. Assim o modelo “vê” vários dias anteriores:

In [5]:
lags = [1, 2, 3, 4, 5] # dias de atraso

for lag in lags:
    df[f'Close_Vale_lag{lag}'] = df['Close_VALE3'].shift(lag) # Preço do fechamento lagado
    df[f'Ret_Vale_lag{lag}'] = df['Close_VALE3'].shift(lag) # Retorno percentual lagado

* Agora, para captar tendências e volatilidade de curto e médio prazo, vamos criar médias móveis de curto prazo e estatísticas móveis (desvio, assimetria, curtose) em janelas de vários tamanhos:

In [6]:
windows = [5, 10, 15, 30, 50] # Janelas de dias

for w in windows:
    # Média móvel
    df[f'MM_de_{w}'] = df['Close_VALE3'].rolling(window=w, min_periods=1).mean()
    # Desvio-padrão móvel dos retornos em w dias
    df[f'Desvio_Padrao_de_{w}'] = df['Close_VALE3'].rolling(window=w, min_periods=1).std()
    # Assimetria móvel dos retornos
    df[f'roll_skew_{w}'] = df['Close_VALE3'].rolling(window=w, min_periods=1).skew()
    # Curtose móvel dos retornos
    df[f'Curtose_{w}'] = df['Close_VALE3'].rolling(window=w, min_periods=1).kurt()

In [7]:
# Dropando valores NaN:
df = df.dropna()
df

Unnamed: 0,Close_VALE3,Open_VALE3,Volume_VALE3,Close_Minerio,Open_Minerio,Variação_Minerio,Variação_VALE3,Dia,Semana,Mês,...,roll_skew_15,Curtose_15,MM_de_30,Desvio_Padrao_de_30,roll_skew_30,Curtose_30,MM_de_50,Desvio_Padrao_de_50,roll_skew_50,Curtose_50
2022-01-10,63.366039,62.976680,25056700,124.48,124.48,-1.370731,-1.190454,10,0,1,...,0.758623,-1.666775,60.982803,2.228901,0.758623,-1.666775,60.982803,2.228901,0.758623,-1.666775
2022-01-11,64.572273,63.732484,28418800,126.75,126.75,1.823586,1.903597,11,1,1,...,0.288819,-2.355142,61.495585,2.445529,0.288819,-2.355142,61.495585,2.445529,0.288819,-2.355142
2022-01-12,65.274643,66.030453,27335400,128.66,128.66,1.506903,1.087726,12,2,1,...,0.027979,-2.264999,61.967967,2.628954,0.027979,-2.264999,61.967967,2.628954,0.027979,-2.264999
2022-01-13,64.282173,64.770778,23154200,127.84,127.84,-0.637339,-1.520452,13,3,1,...,-0.244854,-2.134054,62.225101,2.577312,-0.244854,-2.134054,62.225101,2.577312,-0.244854,-2.134054
2022-01-14,64.656250,63.915706,21183400,126.24,126.24,-1.251564,0.581929,14,4,1,...,-0.457192,-1.914945,62.468216,2.548632,-0.457192,-1.914945,62.468216,2.548632,-0.457192,-1.914945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-23,54.320000,54.000000,14747300,99.81,99.81,-0.080088,0.165960,23,4,5,...,-0.324580,-1.637176,53.886667,1.031050,0.026033,-1.077892,54.707800,1.957700,-0.092500,-0.233223
2025-05-27,53.840000,54.000000,17943000,99.48,99.48,-0.330628,-0.883652,27,1,5,...,-0.414413,-1.341707,53.953000,0.956845,0.165655,-1.250139,54.709400,1.956942,-0.094601,-0.228681
2025-05-28,53.410000,53.740002,17700600,99.39,99.39,-0.090470,-0.798663,28,2,5,...,-0.421093,-1.216086,53.974000,0.936919,0.169704,-1.187451,54.687600,1.965375,-0.064842,-0.278782
2025-05-29,53.450001,53.730000,10246100,99.27,99.27,-0.120736,0.074894,29,3,5,...,-0.394918,-1.104977,53.967000,0.940125,0.186887,-1.207078,54.630800,1.959149,0.008174,-0.252883


# 5. Indicadores Técnicos.

* Média Móvel Exponencial:

In [8]:
# Média móvel exponencial de 20 dias:
df['MME_20D'] = df['Close_VALE3'].ewm(span=20, adjust=False).mean()
# Média móvel exponencial de 200 dias:
df['MME_200D'] = df['Close_VALE3'].ewm(span=200, adjust=False).mean()

* RSI (Relative Strength Index):

In [9]:
def calculo_rsi(series, window=14):
    delta = series.diff()

    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)

    avg_gain = gain.rolling(window=window).mean()
    avg_loss = loss.rolling(window=window).mean()

    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))

    return rsi

df['RSI_14'] = calculo_rsi(df['Close_VALE3'], window=14)
df['RSI_14'].fillna(df['RSI_14'].mean(), inplace=True)
df[['RSI_14']]

Unnamed: 0,RSI_14
2022-01-10,49.403628
2022-01-11,49.403628
2022-01-12,49.403628
2022-01-13,49.403628
2022-01-14,49.403628
...,...
2025-05-23,63.709707
2025-05-27,57.777793
2025-05-28,54.450253
2025-05-29,56.327985


* MACD:

In [10]:
MME_12 = df['Close_VALE3'].ewm(span=12, adjust=False).mean()
MME_26 = df['Close_VALE3'].ewm(span=26, adjust=False).mean()

df['MACD'] = MME_12 - MME_26

df['MACD_Signal'] = df['MACD'].ewm(span=9, adjust=False).mean()

* Bollinger Bands:

In [11]:
# Calculando a média móvel e o desvio padrão
media_20 = df['Close_VALE3'].rolling(window=20, min_periods=1).mean()
desvio_20 = df['Close_VALE3'].rolling(window=20, min_periods=1).std()

# Calculando as bandas
df['Banda_Media'] = media_20
df['Banda_Superior'] = media_20 + (2 * desvio_20)
df['Banda_Inferior'] = media_20 - (2 * desvio_20)

df['Banda_Inferior'].fillna(df['Banda_Inferior'].mean(), inplace=True)
df['Banda_Superior'].fillna(df['Banda_Superior'].mean(), inplace=True)
df[['Banda_Media', 'Banda_Superior', 'Banda_Inferior']]

Unnamed: 0,Banda_Media,Banda_Superior,Banda_Inferior
2022-01-10,63.366039,64.771639,57.143932
2022-01-11,63.969156,65.675029,62.263284
2022-01-12,64.404318,66.334965,62.473672
2022-01-13,64.373782,65.954873,62.792691
2022-01-14,64.430276,65.822654,63.037897
...,...,...,...
2025-05-23,54.055500,56.080205,52.030795
2025-05-27,54.055000,56.079923,52.030077
2025-05-28,54.023500,56.068903,51.978097
2025-05-29,54.004000,56.064152,51.943848


# 6. Volatilidade e Risco.

In [12]:
# Retorno logarítmico diário:
df['LogRet'] = np.log(df['Close_VALE3'] / df['Close_VALE3'].shift(1))

# Volatilidade histórica em janelas móveis:
window = 30
df[f'Volatilidade_{window}'] = df['LogRet'].rolling(window=window, min_periods=1).std() * np.sqrt(252)

# Tratando valores NaN:
df['Volatilidade_30'].fillna(0, inplace=True)
df['LogRet'].fillna(0, inplace=True)

df[['LogRet','Volatilidade_30']]

Unnamed: 0,LogRet,Volatilidade_30
2022-01-10,0.000000,0.000000
2022-01-11,0.018857,0.000000
2022-01-12,0.010819,0.090232
2022-01-13,-0.015321,0.283681
2022-01-14,0.005802,0.231765
...,...,...
2025-05-23,0.001658,0.245252
2025-05-27,-0.008876,0.198358
2025-05-28,-0.008019,0.193722
2025-05-29,0.000749,0.187599


* Rolling Sharpe Ratio:

In [13]:
# taxa livre de risco anual de 3% convertida para diária:
rf_daily = 0.03 / 252
window = 30

# Retorno excedente ao RF:
df['ExcessRet'] = df['LogRet'] - rf_daily

# Rolling mean e rolling std do excesso:
roll_mean_ex = df['ExcessRet'].rolling(window=window, min_periods=1).mean()
roll_std_ex = df['ExcessRet'].rolling(window=window, min_periods=1).std()

# Rolling Sharpe:
df[f'Sharpe_{window}d'] = roll_mean_ex / roll_std_ex

df['Sharpe_30d'].fillna(0, inplace=True)

df[['ExcessRet', 'Sharpe_30d']]

Unnamed: 0,ExcessRet,Sharpe_30d
2022-01-10,-0.000119,0.000000
2022-01-11,0.018738,0.698179
2022-01-12,0.010699,1.032781
2022-01-13,-0.015440,0.234652
2022-01-14,0.005683,0.304622
...,...,...
2025-05-23,0.001539,0.205892
2025-05-27,-0.008995,0.090941
2025-05-28,-0.008138,0.022656
2025-05-29,0.000630,-0.021134


# 7. Interações e Razões.

* 7.1. Razão entre os preços de fechamento:

In [14]:
df['Ratio_VALE3_Minerio'] = df['Close_VALE3'] / df['Close_Minerio']

* 7.2. Diferença absoluta entre os preços de fechamento:

In [15]:
df['Diff_VALE3_Minerio'] = df['Close_VALE3'] - df['Close_Minerio']

* 7.3. Diferença entre as variações percentuais:

In [16]:
df['Diff_Retorno_VALE3_Minerio'] = df['Variação_VALE3'] - df['Variação_Minerio']

# 8. Pré-seleção e Redução de Dimensionalidade.

* Esse passo garante que iremos manter apenas as variáveis que agregam valor preditivo ao modelo, evitando:

    1. Multicolinearidade entre variáveis;

    2. Variáveis irrelevantes;

    3. Overfitting.

In [None]:
# Primeiro criamos uma variável-alvo, nesse caso vamos pegar o preço do fechamento do dia seguinte:
df['Target_Close_D+1'] = df['Close_VALE3'].shift(-1)
df['Target_Close_D+1'].fillna(0, inplace=True)

# Agora vamos calcular a correlação de todas as colunas númericas com essa target:
correlacoes = df.corr(numeric_only=True)['Target_Close_D+1'].sort_values(ascending=False)

# Seleciona só variáveis numéricas:
X_numerico = df.select_dtypes(include=[np.number]).drop(columns=['Target_Close_D+1'])

# Aplica filtro de variância com um limiar suave:
selector = VarianceThreshold(threshold=1e-5)
selector.fit(X_numerico)
x_filtrado_var = X_numerico.loc[:, selector.get_support()]

# Matriz de correlação:
corr_matriz = x_filtrado_var.corr().abs()

# Triângulo superior:
upper = corr_matriz.where(np.triu(np.ones(corr_matriz.shape), k=1).astype(bool))

# Identifica colunas muito correlacionadas (threshold de 0.95 para maior tolerância:
colunas_a_remover = [col for col in upper.columns if any(upper[col] > 0.95)]

# Remove do conjunto:
x_filtrado_corr = x_filtrado_var.drop(columns=colunas_a_remover)

# Junta com a target temporariamente
x_filtrado_corr['Target_Close_D+1'] = df['Target_Close_D+1']

# Calcula a correlação com a variável alvo
correlacoes = x_filtrado_corr.corr()['Target_Close_D+1'].abs().sort_values(ascending=False)

# Seleciona top 25 variáveis mais correlacionadas com o target
top_features = correlacoes.drop('Target_Close_D+1').head(25).index.to_list()

# Conjunto final
df_final = df[top_features + ['Target_Close_D+1']]

# Tratando um valor ausente:
df_final.fillna(0, inplace=True)

df_final

Unnamed: 0,Close_VALE3,MM_20D_VALE3,MME_200D,Close_Minerio,Desvio_Padrao_de_50,Ano,MM_200D_VALE3,MM_20D_MF,MACD_Signal,MACD,...,Volatilidade_30,Sin_Mês,Desvio_Padrao_de_5,RSI_14,Cos_Mês,Diff_VALE3_Minerio,Ratio_VALE3_Minerio,Mês,Variação_Minerio,Target_Close_D+1
2022-01-10,63.366039,60.982803,63.366039,124.48,2.228901,2022,60.982803,123.680000,0.000000,0.000000,...,0.000000,0.5,2.364970,49.403628,0.866025,-61.113961,0.509046,1,-1.370731,64.572273
2022-01-11,64.572273,61.495585,63.378042,126.75,2.445529,2022,61.495585,124.118571,0.019245,0.096224,...,0.000000,0.5,2.283151,49.403628,0.866025,-62.177727,0.509446,1,1.823586,65.274643
2022-01-12,65.274643,61.967967,63.396913,128.66,2.628954,2022,61.967967,124.686250,0.060705,0.226546,...,0.090232,0.5,1.807630,49.403628,0.866025,-63.385357,0.507342,1,1.506903,64.282173
2022-01-13,64.282173,62.225101,63.405722,127.84,2.577312,2022,62.225101,125.036667,0.097943,0.246897,...,0.283681,0.5,0.693296,49.403628,0.866025,-63.557827,0.502833,1,-0.637339,64.656250
2022-01-14,64.656250,62.468216,63.418165,126.24,2.548632,2022,62.468216,125.157000,0.136328,0.289868,...,0.231765,0.5,0.696189,49.403628,0.866025,-61.583750,0.512169,1,-1.251564,65.893036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-23,54.320000,54.055500,55.134049,99.81,1.957700,2025,54.977746,99.505000,0.121362,0.198816,...,0.245252,0.5,0.541322,63.709707,-0.866025,-45.490000,0.544234,5,-0.080088,53.840000
2025-05-27,53.840000,54.055000,55.121173,99.48,1.956942,2025,54.964091,99.483000,0.123133,0.130217,...,0.198358,0.5,0.565711,57.777793,-0.866025,-45.640000,0.541214,5,-0.330628,53.410000
2025-05-28,53.410000,54.023500,55.104146,99.39,1.965375,2025,54.947780,99.457000,0.106644,0.040685,...,0.193722,0.5,0.474310,54.450253,-0.866025,-45.980000,0.537378,5,-0.090470,53.450001
2025-05-29,53.450001,54.004000,55.087687,99.27,1.959149,2025,54.937926,99.427500,0.079968,-0.026733,...,0.187599,0.5,0.423969,56.327985,-0.866025,-45.819999,0.538431,5,-0.120736,52.099998
