# 0.0 - Imports

In [1]:
# import math
# import pickle
# import optuna
# import itertools

# import numpy                                 as np
# import pandas                                as pd
# import seaborn                               as sns
# import matplotlib.pyplot                     as plt
# import plotly.express                        as px
     
# from datetime                                import datetime
# from prophet                                 import Prophet
# from pandas.api.types                        import CategoricalDtype
# from prophet                                 import Prophet
# from prophet.plot                            import plot_plotly, plot_components_plotly, plot_cross_validation_metric
# from sklearn.metrics                         import mean_squared_error, mean_absolute_error
# from sklearn.impute                          import SimpleImputer
# from prophet.diagnostics                     import cross_validation, performance_metrics
# from sklearn.preprocessing                   import MinMaxScaler
# from xgboost                                 import XGBRegressor, plot_importance
# from sklearn.metrics                         import make_scorer, mean_squared_error, mean_absolute_error
# from sklearn.model_selection                 import cross_validate
# from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
# from keras.models                            import Sequential
# from keras.layers                            import GRU, Dense, Dropout
# from tensorflow.keras.callbacks              import EarlyStopping





In [2]:
import warnings

import numpy                    as np
import pandas                   as pd
import seaborn                  as sns
import matplotlib.pyplot        as plt

from IPython.display            import display, HTML
from datetime                   import datetime
from sklearn.impute             import SimpleImputer
from sklearn.preprocessing      import MinMaxScaler
from tensorflow.keras.models    import Sequential
from tensorflow.keras.layers    import GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics            import mean_squared_error, mean_absolute_error
from pandas.plotting            import register_matplotlib_converters
from statsmodels.tsa.seasonal   import seasonal_decompose

# 0.1 - Helper Functions

In [3]:
# Stretch the notebook container to the full browser width
container_css = "<style>.container { width:100% !important; }</style>"
display(HTML(container_css))

# Cap the height of output areas and let them scroll, so long outputs do not overlap
output_css = """
<style>
.output {
    display: block;
    width: 100% !important;
    max-height: 1000px !important;
    overflow-y: auto;
}
.output_area {
    width: 100% !important;
    max-height: 1000px !important;
}
</style>
"""
display(HTML(output_css))


# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from pandas / scikit-learn / Keras.
warnings.filterwarnings("ignore")

# Styles are applied left to right: 'ggplot' first, then 'fivethirtyeight' on top
plt.style.use(['ggplot', 'fivethirtyeight'])


def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are converted to NumPy arrays; the absolute error is divided
    element-wise by ``y_true``, averaged, and multiplied by 100.

    No guard is applied for zeros in ``y_true``: such entries produce
    ``inf``/``nan`` in the result.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100


# Maps the integer weekday codes 0-6 to English day names, with 0 = Monday
weekday_map = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
))

def create_features(df):
    """Return a copy of ``df`` with calendar features derived from 'Data_Hora'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime64 column named 'Data_Hora'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with these int64 columns
        appended, in this order: 'hour', 'dayofweek', 'quarter', 'month',
        'year', 'dayofyear', 'dayofmonth', 'weekofyear', 'season'.

    'season' is an integer code assigned from fixed month/day boundaries:
        1 (Spring): Mar 20 <= date < Jun 21
        2 (Summer): Jun 21 <= date < Sep 23
        3 (Fall)  : Sep 23 <= date < Dec 22
        4 (Winter): everything else
    """
    df = df.copy()
    stamp = df['Data_Hora'].dt

    df['hour'] = stamp.hour.astype('int64')
    df['dayofweek'] = stamp.dayofweek.astype('int64')
    df['quarter'] = stamp.quarter.astype('int64')
    df['month'] = stamp.month.astype('int64')
    df['year'] = stamp.year.astype('int64')
    df['dayofyear'] = stamp.dayofyear.astype('int64')
    df['dayofmonth'] = stamp.day.astype('int64')
    df['weekofyear'] = stamp.isocalendar().week.astype('int64')

    # Encode (month, day) as a single integer MMDD so that the season boundaries
    # become plain vectorized comparisons. This replaces building one
    # pd.Timestamp per row with df.apply(axis=1), eight times over; the
    # resulting codes are the same because every boundary falls at midnight.
    month_day = (df['month'] * 100 + df['dayofmonth']).to_numpy()
    is_spring = (month_day >= 320) & (month_day < 621)
    is_summer = (month_day >= 621) & (month_day < 923)
    is_fall = (month_day >= 923) & (month_day < 1222)

    # NOTE(review): the boundaries/labels follow Northern-Hemisphere naming
    # (Spring starts in March); confirm this is intended for the station used.
    df['season'] = np.select([is_spring, is_summer, is_fall], [1, 2, 3], default=4).astype('int64')

    return df

# 0.2 - Data

In [4]:
# Read one CSV per year (2002-2018), skipping the first 8 lines of each file,
# and stack them into a single frame.
# NOTE(review): the path is absolute and machine-specific; the notebook will
# not run elsewhere without editing it.
dfs = [
    pd.read_csv(
        rf"C:\Users\gfurt\Ciência de dados\Séries temporais\inmet_prophet\Dados\INMET_SE_RJ_A601_ECOLOGIA AGRICOLA_01-01-{year}_A_31-12-{year}.csv",
        encoding='ISO-8859-1',
        delimiter=';',
        skiprows=8,
    )
    for year in range(2002, 2019)
]

df_raw_1 = pd.concat(dfs, ignore_index=True)

In [5]:
# Reuse the 2002-2018 column names for the 2019-2023 files: 9 lines are
# skipped in each file and the names are supplied explicitly, so both
# frames end up with the same column labels.
cols = df_raw_1.columns

dfs = [
    pd.read_csv(
        rf"C:\Users\gfurt\Ciência de dados\Séries temporais\inmet_prophet\Dados\INMET_SE_RJ_A601_SEROPEDICA-ECOLOGIA AGRICOLA_01-01-{year}_A_31-12-{year}.csv",
        encoding='ISO-8859-1',
        delimiter=';',
        skiprows=9,
        names=cols,
    )
    for year in range(2019, 2024)
]

df_raw_2 = pd.concat(dfs, ignore_index=True)

In [6]:
# Keep the first four characters of each hour string (parsed below as "HHMM")
# and rewrite them as "HH:MM".
# NOTE(review): not idempotent - re-running this cell on already converted
# values ("HH:MM") fails in strptime.
hora_hhmm = df_raw_2['HORA (UTC)'].str[:4]
df_raw_2['HORA (UTC)'] = hora_hhmm.map(lambda raw: datetime.strptime(raw, '%H%M').strftime('%H:%M'))

In [7]:
# Convert the 'DATA (YYYY-MM-DD)' column of both raw frames to datetime
for raw_frame in (df_raw_1, df_raw_2):
    raw_frame['DATA (YYYY-MM-DD)'] = pd.to_datetime(raw_frame['DATA (YYYY-MM-DD)'])

In [8]:
df_raw = pd.concat([df_raw_1, df_raw_2], ignore_index=True)

# 1.0 - Data description

In [9]:
df1 = df_raw.copy()

## 1.1 - Data dimensions

In [10]:
df1.shape

(192840, 20)

## 1.2 - Data Type

In [11]:
df1.dtypes

DATA (YYYY-MM-DD)                                        datetime64[ns]
HORA (UTC)                                                       object
PRECIPITAÇÃO TOTAL, HORÁRIO (mm)                                 object
PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB)            object
PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB)                  object
PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB)                 object
RADIACAO GLOBAL (KJ/m²)                                          object
TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)                     object
TEMPERATURA DO PONTO DE ORVALHO (°C)                             object
TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)                       object
TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)                       object
TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT) (°C)                 object
TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT) (°C)                 object
UMIDADE REL. MAX. NA HORA ANT. (AUT) (%)                        

In [12]:
# Every column from the third to the second-to-last is converted to float:
# values are stringified, the decimal comma is swapped for a dot, then parsed.
columns = df1.columns[2:-1]

for col in columns:
    df1[col] = df1[col].astype(str).str.replace(',', '.').astype(float)

In [13]:
df1.dtypes

DATA (YYYY-MM-DD)                                        datetime64[ns]
HORA (UTC)                                                       object
PRECIPITAÇÃO TOTAL, HORÁRIO (mm)                                float64
PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB)           float64
PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB)                 float64
PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB)                float64
RADIACAO GLOBAL (KJ/m²)                                         float64
TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)                    float64
TEMPERATURA DO PONTO DE ORVALHO (°C)                            float64
TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)                      float64
TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)                      float64
TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT) (°C)                float64
TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT) (°C)                float64
UMIDADE REL. MAX. NA HORA ANT. (AUT) (%)                        

## 1.3 - Missing Values

In [14]:
# Per-column count and share of null entries in df1.
# NOTE(review): -9999 values are only turned into NaN later (section 3.1),
# so they are not counted as missing in this table.
missing_values = df1.isnull().sum()
total_values = df1.shape[0]
percent_missing = (missing_values / total_values) * 100

missing_data_table = pd.DataFrame({
    'Total': total_values,
    'Missing': missing_values,
    'Percent Missing': percent_missing.map(lambda x: f'{x:.2f}%'),
})

missing_data_table

Unnamed: 0,Total,Missing,Percent Missing
DATA (YYYY-MM-DD),192840,0,0.00%
HORA (UTC),192840,0,0.00%
"PRECIPITAÇÃO TOTAL, HORÁRIO (mm)",192840,2296,1.19%
"PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB)",192840,2196,1.14%
PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB),192840,2201,1.14%
PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB),192840,2201,1.14%
RADIACAO GLOBAL (KJ/m²),192840,20770,10.77%
"TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)",192840,962,0.50%
TEMPERATURA DO PONTO DE ORVALHO (°C),192840,962,0.50%
TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C),192840,969,0.50%


# 2.0 - Feature engineering

In [15]:
df2 = df1.copy()

In [16]:
# Turn the "HH:MM" strings into timedeltas ("HH:MM:00") and add them to the
# date column to obtain one full timestamp per row.
hour_offset = pd.to_timedelta(df2['HORA (UTC)'] + ':00')
df2['HORA (UTC)'] = hour_offset

df2['Data_Hora'] = pd.to_datetime(df2['DATA (YYYY-MM-DD)'] + hour_offset)

In [17]:
# Add the calendar columns 'hour, dayofweek, quarter, month, year, dayofyear,
# dayofmonth, weekofyear, season', all derived from 'Data_Hora'

df2 = create_features(df2)

# 3.0 - Data Cleaning

In [18]:
df3 = df2.copy()

## 3.1 - Replace -9999 values with NaN

In [19]:
# Replace every -9999 in the frame with NaN so it is handled as missing data
# (presumably -9999 is the source files' missing-value sentinel - confirm).
df3 = df3.replace(-9999, np.nan)

## 3.2 - Removing columns

In [20]:
# Columns to keep, in their final order: timestamp, calendar features, target
columns_ordered = [
    'Data_Hora',
    'hour',
    'dayofweek',
    'quarter',
    'month',
    'year',
    'dayofyear',
    'dayofmonth',
    'weekofyear',
    'season',
    'TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)',
]

# Reindex the DataFrame to the desired columns; every other column is dropped
df3 = df3.reindex(columns=columns_ordered)

## 3.3 - Missing values

In [21]:
# Fill missing temperatures with the column mean.
# NOTE(review): a single global mean inserts the same constant into every gap
# of an hourly series and is computed over the whole dataset (train and test);
# consider time-aware interpolation fitted on the training period only.
imputer = SimpleImputer(strategy='mean')

temp_col = 'TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)'
df3[temp_col] = imputer.fit_transform(df3[[temp_col]])

df3.isnull().sum()

Data_Hora                                       0
hour                                            0
dayofweek                                       0
quarter                                         0
month                                           0
year                                            0
dayofyear                                       0
dayofmonth                                      0
weekofyear                                      0
season                                          0
TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)    0
dtype: int64

# 4.0 - Exploratory Data Analysis

In [22]:
df4 = df3.copy()

# 5.0 - Data Preparation

In [23]:
df5 = df4.copy()

In [24]:
# Index the frame by timestamp. Built from df4 rather than from df5 itself so
# the cell can be re-run: `df5 = df5.set_index('Data_Hora')` raises KeyError
# on a second execution because the column has already become the index.
df5 = df4.set_index('Data_Hora')

In [25]:
# Min-max scale the temperature column in place; `mms` is kept so the
# transformation can be inverted later.
# NOTE(review): the scaler is fitted on the full series, before any
# train/test split.
mms = MinMaxScaler()

scaled_temperature = mms.fit_transform(df5[['TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)']])
df5['TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)'] = scaled_temperature

# 6.0 - Feature Selection

In [26]:
df6 = df5.copy()

In [27]:
# df6 = df6[['TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)']].copy()

In [28]:
# train = df6.loc[df6.index < '2019-01-01']
# test = df6.loc[df6.index >= '2019-01-01']

In [44]:
# n_input = 24
# X_train, y_train = create_sequences(train['TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)'].values, n_input)
# X_test, y_test = create_sequences(test['TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)'].values, n_input)

In [45]:
# X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
# X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

In [29]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense

In [38]:
def create_lagged_features(data, n_lags):
    """Build a lag matrix from ``data``.

    ``data`` is wrapped in a DataFrame and concatenated column-wise with its
    own shifted copies, ordered from lag ``n_lags`` down to lag 1, followed by
    the unshifted data as the last column(s). Rows containing any NaN (the
    first ``n_lags`` rows introduced by shifting, plus any NaN already in
    ``data``) are dropped.

    Returns the resulting DataFrame; the original row labels of the surviving
    rows are kept.
    """
    frame = pd.DataFrame(data)
    shifted = [frame.shift(lag) for lag in range(n_lags, 0, -1)]
    # The unshifted frame goes last so the target is the final column
    return pd.concat(shifted + [frame], axis=1).dropna()

In [39]:
temperatures = df6['TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)'].values

# Normalise the data.
# NOTE(review): this column was already min-max scaled with `mms` when df5 was
# built, so this second scaler is fitted on data that is already in [0, 1];
# getting back to the original unit requires inverting both scalers.
scaler = MinMaxScaler(feature_range=(0, 1))
temperatures_scaled = scaler.fit_transform(temperatures.reshape(-1, 1))

# Build lag features
n_lags = 24  # use the previous 24 hours as inputs
lagged_data = create_lagged_features(temperatures_scaled, n_lags)
lagged_values = lagged_data.values
X, y = lagged_values[:, :-1], lagged_values[:, -1]  # last column is the target

# Chronological train/test split (no shuffling): the last 20% is the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


In [40]:
X_train[0]

array([0.47633136, 0.46745562, 0.4260355 , 0.41420118, 0.40236686,
       0.43195266, 0.44378698, 0.4704142 , 0.47928994, 0.47633136,
       0.51775148, 0.55621302, 0.59467456, 0.63905325, 0.67455621,
       0.68639053, 0.71301775, 0.73076923, 0.44798157, 0.58284024,
       0.56213018, 0.52071006, 0.5       , 0.48224852])

# 7.0 - Machine Learning

In [None]:
# Construir o modelo MLP
model = Sequential()
model.add(Dense(50, activation='relu', input_dim=n_lags))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Treinar o modelo
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/50
[1m3857/3857[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 0.0067 - val_loss: 9.4252e-04
Epoch 2/50
[1m3857/3857[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - loss: 0.0010 - val_loss: 8.1190e-04
Epoch 3/50
[1m3857/3857[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - loss: 9.9114e-04 - val_loss: 8.3585e-04
Epoch 4/50
[1m3857/3857[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 9.9893e-04 - val_loss: 8.1050e-04
Epoch 5/50
[1m3857/3857[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 9.9525e-04 - val_loss: 8.4404e-04
Epoch 6/50
[1m 437/3857[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 1ms/step - loss: 9.9811e-04

# 8.0 - Model Evaluation

In [None]:
# Previsão iterativa para o próximo mês (720 horas)
forecast = []
input_seq = X_test[0]  # última sequência de entrada disponível no conjunto de teste

for _ in range(720):
    yhat = model.predict(input_seq.reshape(1, n_lags), verbose=0)
    forecast.append(yhat[0, 0])
    input_seq = np.append(input_seq[1:], yhat[0, 0]).reshape(-1, 1)  # atualizar a sequência de entrada com a previsão mais recente

# Rescalar as previsões de volta à escala original
forecast_rescaled = scaler.inverse_transform(np.array(forecast).reshape(-1, 1))

# Comparar com y_test rescalado
y_test_rescaled = scaler.inverse_transform(y_test[:720].reshape(-1, 1))

# Calcular RMSE para MLP
from sklearn.metrics import mean_squared_error
rmse_mlp = mean_squared_error(y_test_rescaled, forecast_rescaled, squared=False)

print(f'RMSE MLP: {rmse_mlp}')

In [None]:
# Plotar a perda de treinamento e validação ao longo das épocas
plt.plot(history.history['loss'], label='Treinamento')
plt.plot(history.history['val_loss'], label='Validação')
plt.title('Perda do Modelo')
plt.xlabel('Época')
plt.ylabel('Perda')
plt.legend()
plt.show()

In [None]:
# Timestamps of the forecast horizon. Row j of the lag matrix corresponds to
# position j + n_lags in df6 (the first n_lags rows are dropped when the lags
# are built), and the test set starts right after the training rows, so the
# first test target sits at position n_lags + len(X_train). Taking the last
# rows of df6 instead would label the curve with the wrong dates.
test_start = n_lags + len(X_train)
dates = df6.index[test_start:test_start + len(y_test_rescaled)]

plt.figure(figsize=(24, 6))  # (width, height) in inches

# Observed vs. forecast values, with dates on the x axis.
# `forecast_rescaled` is the array produced by the evaluation cell; the
# previously referenced `predictions_rescaled` is never defined in this notebook.
plt.plot(dates, y_test_rescaled, label='True', linewidth=0.5)
plt.plot(dates, forecast_rescaled, label='Predicted', linewidth=0.5)

plt.xlabel('Data')
plt.ylabel('Temperatura (°C)')
plt.title('True vs Predicted Temperature (°C)')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()  # after rotating the tick labels so they are included in the layout

plt.show()