# Passo 1: Importar os dados (já tratados)

In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from tqdm import tqdm
plt.style.use('ggplot')

In [6]:
class CONFIG:

    NAMES_DTYPES = {
        "Source" : str,
        "Production" : np.float32
    }

current_directory = os.getcwd()
parent_folder_path = os.path.abspath(os.path.join(current_directory, "."))
data_folder_path = os.path.join(parent_folder_path, "data")
file_path = os.path.join(data_folder_path, "tratado.csv")
data = pd.read_csv(
    file_path
)
data.shape

(59806, 8)

In [None]:
data.head()

# Passo 2: Criar uma cópia DataFrame e adicionar algumas colunas para facilitar a previsão da produção diária

In [17]:
# Criando a cópia do DF
data_daily = data.copy()

In [18]:
# Para evitar erros, transformamos a coluna 'Date' para o tipo de dados de data 
data_daily['Date'] = pd.to_datetime(data_daily['Date'])

In [19]:
# Somando as produções diárias para cada fonte ('Solar' e 'Wind')
data_daily = data_daily.groupby(['Date', 'Source']).agg({'Production': 'sum'}).reset_index()

In [20]:
# Criando as colunas 'dayName', 'monthName' e 'dayOfYear'
data_daily['dayName'] = data_daily['Date'].dt.strftime('%A')
data_daily['monthName'] = data_daily['Date'].dt.strftime('%B')
data_daily['dayOfYear'] = data_daily['Date'].dt.dayofyear

In [21]:
# Exibindo o DataFrame 'data_daily' após a preparação
print(data_daily)

           Date Source  Production    dayName monthName  dayOfYear
0    2020-01-01  Solar      6821.0  Wednesday   January          1
1    2020-01-01   Wind     40432.0  Wednesday   January          1
2    2020-01-02  Solar      6349.0   Thursday   January          2
3    2020-01-02   Wind     82888.0   Thursday   January          2
4    2020-01-03  Solar      5036.0     Friday   January          3
...         ...    ...         ...        ...       ...        ...
2487 2023-06-28   Wind     32186.0  Wednesday      June        179
2488 2023-06-29  Solar     44728.0   Thursday      June        180
2489 2023-06-29   Wind     43038.0   Thursday      June        180
2490 2023-06-30  Solar     40623.0     Friday      June        181
2491 2023-06-30   Wind     90244.0     Friday      June        181

[2492 rows x 6 columns]


# Passo 3: Treinamento do modelo

### Random Forest

In [31]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [32]:
# Extrair componentes da data e converter 'Source' em valores numéricos
data_daily['Year'] = data_daily['Date'].dt.year
data_daily['Month'] = data_daily['Date'].dt.month
data_daily['Day'] = data_daily['Date'].dt.day
data_daily['Source_Num'] = data_daily['Source'].map({'Solar': 0, 'Wind': 1})

In [33]:
# Separar as features (variáveis independentes) e o target (variável dependente)
X = data_daily.drop(columns=['Production'])
y = data_daily['Production']

In [34]:
# Dividir o conjunto de dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
# Criar e treinar o modelo Random Forest
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(X_train, y_train)

TypeError: float() argument must be a string or a real number, not 'Timestamp'