# 1.0 Import

In [1]:
#Load and edit dataframe
import pandas as pd
import warnings 
warnings.filterwarnings( 'ignore' ) 

# Data Visualization
import matplotlib.pyplot as plt
%matplotlib inline

# Select Variables
from sklearn.feature_selection import SelectKBest

# ML - Algoritms
from sklearn.model_selection   import GridSearchCV
from sklearn.neural_network    import MLPRegressor
from sklearn.preprocessing     import MinMaxScaler
from sklearn                   import datasets, linear_model

# Metric
from sklearn.metrics           import mean_squared_error, r2_score


## Load DataSet

In [2]:
df = pd.read_csv('../data/b3_bovespa.csv')

In [3]:
df1 = df.copy()

# Select only one share to predict

In [4]:
# Filter share_data
df_itau = df1[df1['sigla_acao']=='ITUB4']

# Data transformation

In [5]:
df_itau['data_pregao'] = pd.to_datetime(df_itau['data_pregao'], format='%Y-%m-%d')

# Feature Engirennering

In [6]:
# Average moving 
df_itau['avg_5d'] = df_itau['preco_fechamento'].rolling(5).mean()
#.rolling method is used to provide rolling window calculations and math operations.
df_itau['avg_20d'] = df_itau['preco_fechamento'].rolling(20).mean()

In [7]:
df_itau

Unnamed: 0,data_pregao,codbdi,sigla_acao,nome_acao,preco_abertura,preco_maximo,preco_minimo,preco_fechamento,qtd_negocios,volume_negocios,avg_5d,avg_20d
197,2019-01-02,2.0,ITUB4,ITAUUNIBANCO,35.44,37.14,35.35,37.00,25581700,9,,
553,2019-01-03,2.0,ITUB4,ITAUUNIBANCO,36.75,37.61,36.45,37.61,21938600,8,,
901,2019-01-04,2.0,ITUB4,ITAUUNIBANCO,37.22,37.71,36.75,36.98,24873500,9,,
1250,2019-01-07,2.0,ITUB4,ITAUUNIBANCO,36.80,37.50,36.75,37.05,16901100,6,,
1617,2019-01-08,2.0,ITUB4,ITAUUNIBANCO,37.18,37.57,36.56,37.52,19556300,7,37.232,
...,...,...,...,...,...,...,...,...,...,...,...,...
360359,2021-12-23,2.0,ITUB4,ITAUUNIBANCO,21.33,21.57,21.17,21.34,22377000,4,21.258,22.1315
360360,2021-12-27,2.0,ITUB4,ITAUUNIBANCO,21.48,21.70,21.31,21.56,14025800,3,21.280,22.0690
360361,2021-12-28,2.0,ITUB4,ITAUUNIBANCO,21.63,21.70,21.40,21.48,15294200,3,21.352,22.0100
360362,2021-12-29,2.0,ITUB4,ITAUUNIBANCO,21.53,21.58,21.23,21.30,12856100,2,21.382,21.9560


In [8]:
# Pushing action values to validate prediction 
df_itau['preco_fechamento'] = df_itau['preco_fechamento'].shift(-1)
"""The .shift() method is used to shift the DataFrame index by a 
specified number of periods with an optional time frequency."""

'The .shift() method is used to shift the DataFrame index by a \nspecified number of periods with an optional time frequency.'

## Clean DataFrame dropna()

In [9]:
# DropNAN
df_itau.dropna(inplace=True)

In [10]:
df_itau.head()

Unnamed: 0,data_pregao,codbdi,sigla_acao,nome_acao,preco_abertura,preco_maximo,preco_minimo,preco_fechamento,qtd_negocios,volume_negocios,avg_5d,avg_20d
6945,2019-01-30,2.0,ITUB4,ITAUUNIBANCO,38.22,38.41,37.4,38.78,19386900,7,37.838,37.535
7305,2019-01-31,2.0,ITUB4,ITAUUNIBANCO,38.46,39.39,38.37,38.81,26940800,10,38.086,37.624
7657,2019-02-01,2.0,ITUB4,ITAUUNIBANCO,38.6,39.1,38.34,39.69,13541300,5,38.35,37.684
8032,2019-02-04,2.0,ITUB4,ITAUUNIBANCO,38.67,39.79,38.4,38.0,17686500,6,38.656,37.8195
8388,2019-02-05,2.0,ITUB4,ITAUUNIBANCO,38.56,38.8,37.62,36.4,71190500,27,38.68,37.867


Assumption:
At this point, I cleaned the dataframe by removing all null values, with that we lost 20 lines of the dataframe which does not offer any significant loss of the set.

In [13]:
# df_itau.info()
# Original values

<class 'pandas.core.frame.DataFrame'>
Int64Index: 744 entries, 197 to 360363
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   data_pregao       744 non-null    datetime64[ns]
 1   sigla_acao        744 non-null    object        
 2   nome_acao         744 non-null    object        
 3   preco_abertura    744 non-null    float64       
 4   preco_maximo      744 non-null    float64       
 5   preco_minimo      744 non-null    float64       
 6   preco_fechamento  744 non-null    float64       
 7   qtd_negocios      744 non-null    int64         
 8   volume_negocios   744 non-null    int64         
dtypes: datetime64[ns](1), float64(4), int64(2), object(2)
memory usage: 58.1+ KB


In [11]:
df_itau.info()
# After cleaned

<class 'pandas.core.frame.DataFrame'>
Int64Index: 724 entries, 6945 to 360362
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   data_pregao       724 non-null    datetime64[ns]
 1   codbdi            724 non-null    float64       
 2   sigla_acao        724 non-null    object        
 3   nome_acao         724 non-null    object        
 4   preco_abertura    724 non-null    float64       
 5   preco_maximo      724 non-null    float64       
 6   preco_minimo      724 non-null    float64       
 7   preco_fechamento  724 non-null    float64       
 8   qtd_negocios      724 non-null    int64         
 9   volume_negocios   724 non-null    int64         
 10  avg_5d            724 non-null    float64       
 11  avg_20d           724 non-null    float64       
dtypes: datetime64[ns](1), float64(7), int64(2), object(2)
memory usage: 73.5+ KB


In [12]:
qntt_rows = len(df_itau)
r_train = qntt_rows -700 # 70% to train
r_test = qntt_rows -15 # 15% to test

r_validation = r_train - r_test  # 15% to validation

info = (
    f'Train = 0:{r_train} | '
    f'Test = {r_train}:{r_test} | '
    f'Validation = {r_test}:{r_train}')
info

'Train = 0:24 - Test = 24:709 - Validation = 709:24'

# Feature Selection

In [20]:

feature = df_itau.drop(['data_pregao', 'sigla_acao', 'nome_acao','preco_fechamento'], axis=1)
target = df_itau['preco_fechamento']

In [23]:
#Escolhendo as melhores features com Kbest
#https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html

features_list = ('preco_abertura', 'qtd_negocios','volume_negocios','avg_5d', 'avg_20d')

k_best_features = SelectKBest(k='all')
k_best_features.fit_transform(feature, target)
k_best_features_scores = k_best_features.scores_
raw_pairs = zip(features_list[1:], k_best_features_scores)
ordered_pairs = list(reversed(sorted(raw_pairs, key=lambda x: x[1])))

k_best_features_final = dict(ordered_pairs[:15])
best_features = k_best_features_final.keys()
print ('')
print ("Best features:")
print (k_best_features_final)


Best features:
{'avg_20d': 17.545461230925657, 'avg_5d': 16.37596508807449, 'volume_negocios': 15.855218975489667, 'qtd_negocios': nan}


In [21]:
df_itau.columns

Index(['data_pregao', 'codbdi', 'sigla_acao', 'nome_acao', 'preco_abertura',
       'preco_maximo', 'preco_minimo', 'preco_fechamento', 'qtd_negocios',
       'volume_negocios', 'avg_5d', 'avg_20d'],
      dtype='object')

In [None]:
https://www.youtube.com/watch?v=VhjQwahg8MY
    
https://github.com/fabrimatt/machine_learnig/blob/master/Previs%C3%A3o%20pre%C3%A7o%20a%C3%A7%C3%B5es.ipynb