In [71]:
'''
Objetivo é treinar um modelo de regressão linear para fazer previsões de preços das empresa Sony, Microsoft, Ubisoft, Nintendo, Eletronic Arts

'''

'\nObjetivo é treinar um modelo de regressão linear para fazer previsões de preços das empresa Sony, Microsoft, Ubisoft, Nintendo, Eletronic Arts\n\n'

In [72]:
#Bibliotecas utilizadas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA
import datetime as dt

In [73]:
#Carregamento dos dados

dados =  pd.read_excel(r"C:\Users\dante\OneDrive\Documentos\9. Meus Projetos\Projetos Power Bi\Mercado de Ações\Dados\Agrupado.xlsx")

df = pd.DataFrame(dados)

In [74]:
#Visão Geral

print(df.shape)
print(df.columns)
print(df.head())
df.info()

#A coluna 'Date' será a variavel preditora
#As colunas 'Close/Last', 'Volume', 'Open', 'High', 'Low' serão as colunas alvos

(6290, 7)
Index(['Empresa', 'Date', 'Close/Last', 'Volume', 'Open', 'High', 'Low'], dtype='object')
          Empresa                 Date Close/Last   Volume     Open      High  \
0  Eletronic Arts  2024-04-11 00:00:00    $152.89  1975349  $152.16   $153.09   
1  Eletronic Arts  2024-01-11 00:00:00    $151.26  1954157  $150.83   $152.46   
2  Eletronic Arts           10/31/2024    $150.85  2645877  $148.31  $151.585   
3  Eletronic Arts           10/30/2024    $149.14  3600584  $147.48   $151.34   
4  Eletronic Arts           10/29/2024    $145.62  2703361  $144.14   $146.26   

        Low  
0   $150.40  
1   $149.14  
2   $147.59  
3   $144.68  
4  $144.095  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6290 entries, 0 to 6289
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Empresa     6290 non-null   object
 1   Date        6290 non-null   object
 2   Close/Last  6290 non-null   object
 3   Volume      6290 n

In [75]:
#Salvando uma cópia do dataset

df_original = df.copy()

In [76]:
#Conversão de objeto para data
df['Date'] = pd.to_datetime(df['Date'], errors='coerce', format=None)
#conferindo as informações
print(df['Date'].dtypes)
print(df['Date'].head())



datetime64[ns]
0   2024-04-11
1   2024-01-11
2   2024-10-31
3   2024-10-30
4   2024-10-29
Name: Date, dtype: datetime64[ns]


In [77]:
#tipos das colunas

tipos = df.dtypes
print(tipos)

Empresa               object
Date          datetime64[ns]
Close/Last            object
Volume                 int64
Open                  object
High                  object
Low                   object
dtype: object


In [78]:
#Análise Exploratória - Resumo Estatístico

valores_ausentes = df.isnull().sum()
print(valores_ausentes)


Empresa       0
Date          0
Close/Last    0
Volume        0
Open          0
High          0
Low           0
dtype: int64


In [79]:
df['Open'] = df['Open'].str.replace("$", '')
df['Close/Last'] = df['Close/Last'].str.replace("$", '')
df['High'] = df['High'].str.replace("$", '')
df['Low'] = df['Low'].str.replace("$", '')

print(df['Open'].head())
print(df['Close/Last'].head())
print(df['High'].head())
print(df['Low'].head())

0    152.16
1    150.83
2    148.31
3    147.48
4    144.14
Name: Open, dtype: object
0    152.89
1    151.26
2    150.85
3    149.14
4    145.62
Name: Close/Last, dtype: object
0     153.09
1     152.46
2    151.585
3     151.34
4     146.26
Name: High, dtype: object
0     150.40
1     149.14
2     147.59
3     144.68
4    144.095
Name: Low, dtype: object


In [80]:
df['Open'] = df['Open'].astype(float)
df['Close/Last'] = df['Close/Last'].astype(float)
df['High'] = df['High'].astype(float)
df['Low'] = df['Low'].astype(float)
df['Volume'] = df['Volume'].astype(float)
print(df.dtypes)

Empresa               object
Date          datetime64[ns]
Close/Last           float64
Volume               float64
Open                 float64
High                 float64
Low                  float64
dtype: object


In [81]:
descricao = df.describe()
print(descricao)

                                Date   Close/Last        Volume         Open  \
count                           6290  6290.000000  6.290000e+03  6290.000000   
mean   2022-05-06 14:23:05.055644160    90.937978  7.425999e+06    90.925466   
min              2019-02-12 00:00:00     2.148000  9.050000e+02     2.050000   
25%              2021-02-03 00:00:00    12.185500  7.994190e+05    12.190000   
50%              2022-05-07 12:00:00    18.043000  2.295472e+06    18.081000   
75%              2023-08-08 00:00:00   138.830000  5.590171e+06   138.880000   
max              2024-12-09 00:00:00   467.560000  9.707356e+07   467.000000   
std                              NaN   113.290179  1.217958e+07   113.289491   

              High          Low  
count  6290.000000  6290.000000  
mean     91.827667    89.988732  
min       2.180000     2.030000  
25%      12.285500    12.093500  
50%      18.178000    17.933300  
75%     140.432500   137.407500  
max     468.350000   464.460000  
std    

In [82]:
print(df['Date'].head())

0   2024-04-11
1   2024-01-11
2   2024-10-31
3   2024-10-30
4   2024-10-29
Name: Date, dtype: datetime64[ns]


In [83]:
#preciso separar a data para criar uma variável preditora para o modelo

df['Ano'] = df['Date'].dt.year
print(df['Ano'].head())
df['Mes'] = df['Date'].dt.month
print(df['Mes'].head())
df['Dia'] = df['Date'].dt.day
print(df['Dia'].head())

0    2024
1    2024
2    2024
3    2024
4    2024
Name: Ano, dtype: int32
0     4
1     1
2    10
3    10
4    10
Name: Mes, dtype: int32
0    11
1    11
2    31
3    30
4    29
Name: Dia, dtype: int32


In [84]:
#verificando primeiro valor da coluna 'Date'

primeiro_linha = df.sort_values('Date').iloc[0]
print(primeiro_linha)

Empresa       Nintendo Corporation
Date           2019-02-12 00:00:00
Close/Last                  10.008
Volume                   1959825.0
Open                         9.876
High                         10.16
Low                          9.876
Ano                           2019
Mes                              2
Dia                             12
Name: 3755, dtype: object


In [85]:
#transformando em bidimensional
'''
x = np.array(df['Dia'])

x = x.reshape(-1,1)

'''

"\nx = np.array(df['Dia'])\n\nx = x.reshape(-1,1)\n\n"

In [86]:
df['Dia'] = pd.to_datetime(df['Dia'])

# Data inicial para cálculo da distância
data_inicial = pd.to_datetime('2019-02-12')

# Calcular a distância em dias
df['distancia_em_dias'] = (df['Dia'] - data_inicial).dt.days

print(df['Dia'].dtype)

datetime64[ns]


In [87]:
#transformando a coluna 'Empresa' do tipo string
df['Empresa'] = df['Empresa'].astype(str)