# Analise de dados imputados    

In [6]:
from statistics import LinearRegression
from statsmodels.tsa.seasonal import seasonal_decompose
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from fancyimpute import SoftImpute, KNN as FancyKNN, IterativeImputer as FancyIterativeImputer
from statsmodels.tsa.api import ExponentialSmoothing
from scipy.interpolate import Akima1DInterpolator
import pandas as pd
from sklearn.neural_network import MLPRegressor

# Carregando e preparando os dados
df_original = pd.read_csv('../dados_tratados/combinado/Piratininga/Piratininga_tratado_combinado.csv',
                          usecols=['PM2.5', 'Data e Hora', 'PM10', 'Monóxido de Carbono'],
                          parse_dates=['Data e Hora'],
                          index_col='Data e Hora',
                          low_memory=False)

df_original.sort_index(inplace=True)

# Selecionando apenas PM2.5 e focando em um período específico
df = df_original['PM2.5'].loc['2019-01-01':'2021-01-31']
df = pd.to_numeric(df, errors='coerce')
df = df.asfreq('h')

# Encontrar a sequência mais longa de dados não ausentes
mask = df.notna()
id_groups = mask.ne(mask.shift()).cumsum()
longest_sequence = df[mask].groupby(id_groups).filter(lambda x: len(x) == mask.groupby(id_groups).size().max())

print(f"Tamanho da sequência mais longa: {len(longest_sequence)}")
print(f"Número de valores não nulos na sequência mais longa: {longest_sequence.notna().sum()}")

if len(longest_sequence) == 0:
    raise ValueError("A sequência mais longa está vazia. Verifique seus dados.")

# Métodos de imputação
class LinearRegressionImputer:
    def fit_transform(self, X):
        X = X.values.reshape(-1, 1)
        mask = np.isnan(X.flatten())
        if np.all(mask):
            return X.flatten()
        # Usar apenas os dados não ausentes
        y = X[~mask].flatten()
        X_notna = np.arange(len(X))[~mask].reshape(-1, 1)
        reg = LinearRegression().fit(X_notna, y)
        X_all = np.arange(len(X)).reshape(-1, 1)
        y_pred = reg.predict(X_all)
        return np.where(mask, y_pred, X.flatten())

methods = {
    'Original': None,
    'MICE': IterativeImputer(random_state=0, max_iter=1000),
    'Akima Interpolation': lambda x: pd.Series(Akima1DInterpolator(x.dropna().index, x.dropna().values)(x.index), index=x.index),
    'Forward Fill': lambda x: x.fillna(method='ffill'),
    'Backward Fill': lambda x: x.fillna(method='bfill'),
    'Linear Interpolation': lambda x: x.interpolate(method='linear'),
    'Polynomial Interpolation': lambda x: x.interpolate(method='polynomial', order=2),

    'KNN': KNNImputer(n_neighbors=5),
    'Mean': SimpleImputer(strategy='mean'),
    'Median': SimpleImputer(strategy='median'),
    'Exponential Smoothing': lambda x: ExponentialSmoothing(x.fillna(x.mean())).fit().fittedvalues,
    'Random Forest': IterativeImputer(estimator=RandomForestRegressor(n_estimators=1000), random_state=0, max_iter=100),
    # 'Soft Impute': lambda x: pd.Series(SoftImpute(max_iters=1000, verbose=False).fit_transform(x.values.reshape(-1, 1)).flatten(), index=x.index),
    # 'KNN (FancyImpute)': lambda x: pd.Series(FancyKNN(k=5).fit_transform(x.values.reshape(-1, 1)).flatten(), index=x.index),
    # 'MICE (FancyImpute)': lambda x: pd.Series(FancyIterativeImputer().fit_transform(x.values.reshape(-1, 1)).flatten(), index=x.index),
    # 'Spline Interpolation': lambda x: x.interpolate(method='spline', order=3),
    'Rolling Mean (24h)': lambda x: x.fillna(x.rolling(24, min_periods=1).mean()),
    # 'Neural Network': lambda x: pd.Series(MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000).fit(np.arange(len(x)).reshape(-1, 1), x.fillna(x.mean())).predict(np.arange(len(x)).reshape(-1, 1)), index=x.index),
}

# Função para avaliar os métodos de imputação
def evaluate_imputation(original, imputed):
    # Remover NaN de ambas as séries antes de calcular as métricas
    mask = ~np.isnan(original) & ~np.isnan(imputed)
    original = original[mask]
    imputed = imputed[mask]
    
    if len(original) == 0 or len(imputed) == 0:
        print("Aviso: Não há dados válidos para comparação após a remoção de NaN.")
        return np.nan, np.nan
    
    mse = mean_squared_error(original, imputed)
    mae = mean_absolute_error(original, imputed)
    return mse, mae

# Criar dados artificiais com valores ausentes na sequência mais longa
test_data = longest_sequence.copy()
np.random.seed(0)
mask = np.random.rand(len(test_data)) < 0.1
test_data[mask] = np.nan

print(f"Porcentagem de valores ausentes nos dados de teste: {test_data.isna().mean()*100:.2f}%")

# Avaliar cada método
results = {}
for method_name, method in methods.items():
    if method_name == 'Original':
        continue
    
    try:
        if isinstance(method, SimpleImputer) or isinstance(method, IterativeImputer) or isinstance(method, KNNImputer):
            imputed = pd.Series(method.fit_transform(test_data.values.reshape(-1, 1)).flatten(), index=test_data.index)
        else:
            imputed = method(test_data)
        
        mse, mae = evaluate_imputation(longest_sequence, imputed)
        results[method_name] = {'MSE': mse, 'MAE': mae}
    except Exception as e:
        print(f"Erro ao avaliar o método {method_name}: {str(e)}")
        results[method_name] = {'MSE': np.nan, 'MAE': np.nan}

# Ordenar os resultados pelo MSE
results_df = pd.DataFrame(results).T.sort_values('MSE')

print("Resultados ordenados pelo MSE:")
print(results_df)

# Plotar os resultados de MSE e MAE
if not results_df.empty and not results_df['MSE'].isnull().all():
    fig = make_subplots(rows=1, cols=2, subplot_titles=('Mean Squared Error', 'Mean Absolute Error'))

    fig.add_trace(go.Bar(x=results_df.index, y=results_df['MSE'], name='MSE'), row=1, col=1)
    fig.add_trace(go.Bar(x=results_df.index, y=results_df['MAE'], name='MAE'), row=1, col=2)

    fig.update_layout(title='Comparação dos Métodos de Imputação', showlegend=False)
    fig.show()

# Plotar a série temporal original, com dados faltantes e imputados
fig = make_subplots(rows=1, cols=1, subplot_titles=('Comparação das Séries Temporais Imputadas'))

# Adicionar a série temporal original
fig.add_trace(go.Scatter(x=longest_sequence.index, y=longest_sequence, mode='lines', name='Original'))

# Adicionar a série com dados faltantes
fig.add_trace(go.Scatter(x=test_data.index, y=test_data, mode='lines', name='Dados Faltantes', line=dict(dash='dash')))

# Adicionar as séries imputadas
for method_name, method in methods.items():
    if method_name == 'Original':
        continue
    
    try:
        if isinstance(method, SimpleImputer) or isinstance(method, IterativeImputer) or isinstance(method, KNNImputer):
            imputed = pd.Series(method.fit_transform(test_data.values.reshape(-1, 1)).flatten(), index=test_data.index)
        else:
            imputed = method(test_data)
        
        fig.add_trace(go.Scatter(x=imputed.index, y=imputed, mode='lines', name=method_name))
    except Exception as e:
        print(f"Erro ao plotar o método {method_name}: {str(e)}")

fig.update_layout(title='Comparação das Séries Temporais Imputadas e Dados Faltantes',
                  xaxis_title='Data',
                  yaxis_title='PM2.5')
fig.show()


Tamanho da sequência mais longa: 397
Número de valores não nulos na sequência mais longa: 397
Porcentagem de valores ausentes nos dados de teste: 10.08%
Resultados ordenados pelo MSE:
                                MSE       MAE
Linear Interpolation       8.918836  0.506297
Akima Interpolation        9.056030  0.512051
Backward Fill              9.652393  0.559194
Polynomial Interpolation  10.770959  0.577359
Forward Fill              11.355164  0.498741
Rolling Mean (24h)        14.019528  0.636625
MICE                      16.164077  0.721440
KNN                       16.164077  0.721440
Mean                      16.164077  0.721440
Random Forest             16.164077  0.721440
Median                    17.065491  0.672544
Exponential Smoothing     71.546564  4.778376



Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


No frequency information was provided, so inferred frequency h will be used.




Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


No frequency information was provided, so inferred frequency h will be used.

