In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os
from os.path import join
import re
import math
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.seasonal import STL
from pyod.models.iforest import IForest
from sklearn.cluster import DBSCAN
from pyod.models.abod import ABOD

# Coleta de Dados

In [30]:
MODULE_PATH = Path().absolute().parent
DATA_PATH = join(MODULE_PATH, "sources/processed")

In [31]:
complete_data = pd.read_csv(join(DATA_PATH, "selected_features_complete_df.csv"))

In [32]:
complete_data["data"] = pd.to_datetime(complete_data["data"])

In [33]:
complete_data.head()

Unnamed: 0,ano,mes,data,precipitacao_total_media,temperatura_bulbo_hora_media,temperatura_orvalho_hora_media,umidade_rel_hora_media,categoria_El Ninõ,categoria_La Ninã,categoria_Neutro,precipitacao_total_media_outlier,temperatura_bulbo_hora_media_outlier,temperatura_orvalho_hora_media_outlier,umidade_rel_hora_media_outlier,total_outliers
0,2009,1,2009-01-01,0.1496,25.187439,20.299029,76.026835,0,1,0,0,0,0,0,0
1,2009,1,2009-01-02,0.305735,24.722086,20.084109,77.689743,0,1,0,0,0,0,0,0
2,2009,1,2009-01-03,0.434014,24.058518,19.749311,78.933681,0,1,0,1,0,0,0,1
3,2009,1,2009-01-04,0.245052,23.368056,18.715145,77.340627,0,1,0,0,0,0,0,0
4,2009,1,2009-01-05,0.201583,23.101135,17.945004,75.597683,0,1,0,0,0,0,0,0


# Pré-Processamento

## Definição do Pipeline

In [34]:
feature_cols = complete_data.columns.tolist()[3:10]
numerical_cols = feature_cols[:-3]
categorical_cols = feature_cols[-3:]

In [41]:
def split_data(
    df: pd.DataFrame,
    year_cap: int = 2024):

    df = df.copy()
    
    df_train = df[df["ano"] < year_cap]
    df_test = df[df["ano"] == year_cap]
    
    return df_train, df_test

In [35]:
def scale_data(
    df: pd.DataFrame, 
    columns: list[str]):

    df = df.copy()  
    
    scaler = StandardScaler()
    df[columns] = scaler.fit_transform(df[columns])
    
    return df

In [69]:
def create_residual_df(
    df: pd.DataFrame,
    numerical_cols: list[str],
    seasonal_period: int = 7):

    residual_df = df.copy()

    for col in numerical_cols:
        feature_decomposition = STL(residual_df[col], period=seasonal_period).fit()
        residual_df[col] = residual_df[col] - feature_decomposition.seasonal.values

    return residual_df

In [81]:
def preprocess_pipeline(
    df: pd.DataFrame,
    numerical_cols: list[str],
    remove_seasonality: bool = False,
    seasonal_period: int = 7,
    year_cap: int = 2024):

    df = df.copy()

    if remove_seasonality:
        df = create_residual_df(
            df=df, 
            numerical_cols=numerical_cols,
            seasonal_period=seasonal_period)

    df_train, df_test = split_data(df=df, year_cap=year_cap)

    df_train = scale_data(df=df_train, columns=numerical_cols)
    df_test = scale_data(df=df_test, columns=numerical_cols)
    
    return df_train, df_test

In [82]:
df_train, df_test = preprocess_pipeline(
    df=complete_data,
    numerical_cols=numerical_cols,
    remove_seasonality=True,
    seasonal_period=7,
    year_cap=2024
)

In [83]:
df_train.head()

Unnamed: 0,ano,mes,data,precipitacao_total_media,temperatura_bulbo_hora_media,temperatura_orvalho_hora_media,umidade_rel_hora_media,categoria_El Ninõ,categoria_La Ninã,categoria_Neutro,precipitacao_total_media_outlier,temperatura_bulbo_hora_media_outlier,temperatura_orvalho_hora_media_outlier,umidade_rel_hora_media_outlier,total_outliers
0,2009,1,2009-01-01,0.585832,0.483621,0.799794,0.722738,0,1,0,0,0,0,0,0
1,2009,1,2009-01-02,0.637483,0.469706,0.676157,0.560195,0,1,0,0,0,0,0,0
2,2009,1,2009-01-03,0.918583,0.347749,0.560791,0.52705,0,1,0,1,0,0,0,1
3,2009,1,2009-01-04,0.229854,0.22341,0.473174,0.4849,0,1,0,0,0,0,0,0
4,2009,1,2009-01-05,0.723558,0.065939,0.426485,0.627639,0,1,0,0,0,0,0,0


In [84]:
df_test.head()

Unnamed: 0,ano,mes,data,precipitacao_total_media,temperatura_bulbo_hora_media,temperatura_orvalho_hora_media,umidade_rel_hora_media,categoria_El Ninõ,categoria_La Ninã,categoria_Neutro,precipitacao_total_media_outlier,temperatura_bulbo_hora_media_outlier,temperatura_orvalho_hora_media_outlier,umidade_rel_hora_media_outlier,total_outliers
5478,2024,1,2024-01-01,2.13804,0.21779,0.802275,1.011485,1,0,0,1,0,0,0,1
5479,2024,1,2024-01-02,1.922971,0.412355,0.980476,1.131216,1,0,0,0,0,0,0,0
5480,2024,1,2024-01-03,2.367351,0.672075,1.089376,1.048026,1,0,0,1,0,0,0,1
5481,2024,1,2024-01-04,1.797545,0.57334,1.008135,0.995842,1,0,0,1,0,0,0,1
5482,2024,1,2024-01-05,1.145157,0.589223,0.922849,0.843061,1,0,0,0,0,0,0,0


# Treinamento de Modelos