In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from tqdm import tqdm
# Apply matplotlib's built-in "ggplot" style sheet to figures created after this call.
plt.style.use('ggplot')

In [2]:
class CONFIG:
    """Notebook-level settings used when reading the raw CSV."""

    # Column name -> dtype mapping; passed as the `dtype` argument of pd.read_csv.
    NAMES_DTYPES = dict(Source=str, Production=np.float32)

# Build the path to the raw CSV, starting from the kernel's working directory.
current_directory = os.getcwd()
# Joining with "." and normalising yields the working directory itself, so despite
# its name `parent_folder_path` points at the current folder.
parent_folder_path = os.path.abspath(os.path.join(current_directory, "."))
data_folder_path = os.path.join(parent_folder_path, "data")
file_path = os.path.join(
    data_folder_path,
    "intermittent-renewables-production-france.csv",
)

# Options for the CSV reader: "Date and Hour" becomes the index, both date
# columns are parsed as dates, and the dtypes declared in CONFIG are enforced.
read_options = {
    "index_col": "Date and Hour",
    "parse_dates": ["Date and Hour", "Date"],
    "dtype": CONFIG.NAMES_DTYPES,
}
data = pd.read_csv(file_path, **read_options)

# Last expression of the cell: display (rows, columns) of the loaded frame.
data.shape

(59806, 8)

In [3]:
# Preview the first five rows of the loaded frame (DataFrame.head defaults to n=5).
data.head()

Unnamed: 0_level_0,Date,StartHour,EndHour,Source,Production,dayOfYear,dayName,monthName
Date and Hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-07-22 20:00:00+02:00,2020-07-22,20:00:00,21:00:00,Solar,244.0,204,Wednesday,July
2020-07-23 07:00:00+02:00,2020-07-23,07:00:00,08:00:00,Solar,223.0,205,Thursday,July
2020-07-23 16:00:00+02:00,2020-07-23,16:00:00,17:00:00,Solar,2517.0,205,Thursday,July
2020-07-23 19:00:00+02:00,2020-07-23,19:00:00,20:00:00,Solar,658.0,205,Thursday,July
2020-07-23 23:00:00+02:00,2020-07-23,23:00:00,24:00:00,Solar,0.0,205,Thursday,July


In [4]:
# Count the missing (NaN) entries in every column of the frame.
missing_values = data.isna().sum()

# Print the per-column missing-value counts.
print(missing_values)

Date          0
StartHour     0
EndHour       0
Source        0
Production    2
dayOfYear     0
dayName       0
monthName     0
dtype: int64


In [5]:
# Fill the missing values of the "Production" column with the column mean.
# The mean is taken before filling; Series.mean() skips NaN by default, so the
# missing entries do not affect it.
mean_production = data["Production"].mean()

# Assign the filled Series back to the column instead of calling
# fillna(..., inplace=True) on the `data["Production"]` selection: the in-place
# call on a selection raises a FutureWarning in pandas 2.x and, with
# Copy-on-Write enabled (the default from pandas 3), leaves `data` unchanged.
data["Production"] = data["Production"].fillna(mean_production)

print('Valor médio da coluna Production:', mean_production)
print('Valores ausentes preenchidos com valor médio.')

Valor médio da coluna Production: 2792.8596
Valores ausentes preenchidos com valor médio.


In [7]:
# Collect the distinct values of the "dayName" column (in order of first
# appearance) and print them for a quick sanity check.
day_name_column = data["dayName"]
day_name_verifier = day_name_column.unique()
print(day_name_verifier)

['Wednesday' 'Thursday' 'Friday' 'Saturday' 'Sunday' 'Monday' 'Tuesday']


In [8]:
# Collect the distinct values of the "Source" column (in order of first
# appearance) and print them for a quick sanity check.
source_column = data["Source"]
source_verifier = source_column.unique()
print(source_verifier)

['Solar' 'Wind']


In [10]:
# Print the distinct values found in every column, one line per column.
for column, column_values in data.items():
    unique_values = column_values.unique()
    print(f"Valores únicos da coluna '{column}': {unique_values}")

Valores únicos da coluna 'Date': <DatetimeArray>
['2020-07-22 00:00:00', '2020-07-23 00:00:00', '2020-07-24 00:00:00',
 '2020-07-25 00:00:00', '2020-07-26 00:00:00', '2020-07-27 00:00:00',
 '2020-07-28 00:00:00', '2020-07-29 00:00:00', '2020-07-30 00:00:00',
 '2020-07-31 00:00:00',
 ...
 '2023-06-21 00:00:00', '2023-06-22 00:00:00', '2023-06-23 00:00:00',
 '2023-06-24 00:00:00', '2023-06-25 00:00:00', '2023-06-26 00:00:00',
 '2023-06-27 00:00:00', '2023-06-28 00:00:00', '2023-06-29 00:00:00',
 '2023-06-30 00:00:00']
Length: 1246, dtype: datetime64[ns]
Valores únicos da coluna 'StartHour': ['20:00:00' '07:00:00' '16:00:00' '19:00:00' '23:00:00' '01:00:00'
 '04:00:00' '05:00:00' '10:00:00' '14:00:00' '06:00:00' '08:00:00'
 '21:00:00' '03:00:00' '18:00:00' '22:00:00' '00:00:00' '09:00:00'
 '13:00:00' '11:00:00' '15:00:00' '02:00:00' '17:00:00' '12:00:00']
Valores únicos da coluna 'EndHour': ['21:00:00' '08:00:00' '17:00:00' '20:00:00' '24:00:00' '02:00:00'
 '05:00:00' '06:00:00' '11:00:00

In [14]:
# Export the cleaned DataFrame to a CSV file in the same folder as the raw input.
output_folder_path = os.path.dirname(file_path)
novo_nome_arquivo_csv = "tratado.csv"
output_file_path = os.path.join(output_folder_path, novo_nome_arquivo_csv)

# NOTE(review): index=False leaves the "Date and Hour" index out of the exported
# file, so only the regular columns are written — confirm this is intended.
data.to_csv(output_file_path, index=False)