# Forecasting O₃ (Ozônio) - Horário
## Preparação dos Dados

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Number of trailing rows held out for evaluation.
TEST_SIZE = 48

df = pd.read_csv("la_o3_1000points.csv", parse_dates=['timestamp'])
# Sort chronologically: the positional split below (and every shift-based lag
# feature built from `df` later on) is only meaningful on time-ordered rows.
df = df.set_index('timestamp').sort_index()

# Everything except the last TEST_SIZE rows is used for fitting; the last
# TEST_SIZE rows are the hold-out window.
train = df.iloc[:-TEST_SIZE]
test = df.iloc[-TEST_SIZE:]

print(f"Tamanho treino: {len(train)} | Tamanho teste: {len(test)}")

## SARIMA

In [None]:
import warnings
warnings.filterwarnings("ignore")
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Seasonal ARIMA on the training target: (1,1,1) non-seasonal terms plus a
# (1,1,1) seasonal component with a period of 24 observations.
model_sarima = SARIMAX(train['o3_ug_m3'],
                       order=(1,1,1),
                       seasonal_order=(1,1,1,24),
                       enforce_stationarity=False,
                       enforce_invertibility=False)
sarima_fit = model_sarima.fit(disp=False)

# Forecast exactly as many steps as the hold-out set has rows, so the
# prediction always lines up with `test` (plot and metrics below) even if the
# size of the split is changed.
sarima_pred = sarima_fit.forecast(steps=len(test))
plt.plot(test.index, test['o3_ug_m3'], label='Real')
plt.plot(test.index, sarima_pred, label='SARIMA')
plt.legend()
plt.title('Previsão SARIMA - O₃')
plt.show()

# Hold-out error metrics; RMSE is the square root of sklearn's MSE.
sarima_mae = mean_absolute_error(test['o3_ug_m3'], sarima_pred)
sarima_rmse = np.sqrt(mean_squared_error(test['o3_ug_m3'], sarima_pred))
print(f"SARIMA -> MAE: {sarima_mae:.2f} | RMSE: {sarima_rmse:.2f}")

## Holt-Winters

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Additive-trend, additive-seasonality exponential smoothing with a seasonal
# cycle of 24 observations, fitted on the training target.
model_hw = ExponentialSmoothing(train['o3_ug_m3'],
                                trend='add',
                                seasonal='add',
                                seasonal_periods=24)
hw_fit = model_hw.fit()
# Forecast one value per hold-out row so the prediction always matches `test`
# in length, whatever the size of the split.
hw_pred = hw_fit.forecast(len(test))

plt.plot(test.index, test['o3_ug_m3'], label='Real')
plt.plot(test.index, hw_pred, label='Holt-Winters')
plt.legend()
plt.title('Previsão Holt-Winters - O₃')
plt.show()

# Hold-out error metrics; RMSE is the square root of sklearn's MSE.
hw_mae = mean_absolute_error(test['o3_ug_m3'], hw_pred)
hw_rmse = np.sqrt(mean_squared_error(test['o3_ug_m3'], hw_pred))
print(f"Holt-Winters -> MAE: {hw_mae:.2f} | RMSE: {hw_rmse:.2f}")

## Prophet

In [None]:
from prophet import Prophet

# Timezone-free copies of both splits (presumably because Prophet does not
# accept timezone-aware `ds` values -- confirm against the Prophet docs).
train_no_tz = train.tz_localize(None)
test_no_tz = test.tz_localize(None)

# Prophet's input frame: the timestamp index becomes column `ds` and the
# target becomes column `y`.
df_prophet = (
    train_no_tz
    .reset_index()
    .rename(columns={'timestamp': 'ds', 'o3_ug_m3': 'y'})
)

model_prophet = Prophet(daily_seasonality=True, weekly_seasonality=True)
model_prophet.fit(df_prophet)

# Predict at exactly the hold-out timestamps.
future = test_no_tz.index.to_frame(index=False, name='ds')
forecast = model_prophet.predict(future)

plt.figure(figsize=(10,4))
plt.plot(test_no_tz.index, test_no_tz['o3_ug_m3'], label='Real')
plt.plot(test_no_tz.index, forecast['yhat'], label='Prophet')
plt.legend()
plt.title('Previsão Prophet - O₃')
plt.show()

# Hold-out error metrics computed on Prophet's point forecast (`yhat`).
prophet_mae = mean_absolute_error(test_no_tz['o3_ug_m3'], forecast['yhat'])
prophet_rmse = np.sqrt(mean_squared_error(test_no_tz['o3_ug_m3'], forecast['yhat']))
print(f"Prophet -> MAE: {prophet_mae:.2f} | RMSE: {prophet_rmse:.2f}")

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

def create_lag_features(data, lags=24):
    """Return a copy of *data* extended with lagged copies of ``o3_ug_m3``.

    Columns ``lag_1`` .. ``lag_<lags>`` hold the target shifted down by that
    many rows. Rows containing any missing value (which includes the first
    ``lags`` rows, whose lags are undefined) are dropped. *data* itself is
    not modified.
    """
    target = data['o3_ug_m3']
    lagged = {f'lag_{step}': target.shift(step) for step in range(1, lags + 1)}
    return data.assign(**lagged).dropna()

# Build the lag frame from the target column only, so the features are exactly
# lag_1..lag_24 and no same-timestamp column of `df` can end up in X.
df_lag = create_lag_features(df[['o3_ug_m3']])
train_lag = df_lag.iloc[:-48]
test_lag = df_lag.iloc[-48:]

X_train = train_lag.drop('o3_ug_m3', axis=1)
y_train = train_lag['o3_ug_m3']
X_test = test_lag.drop('o3_ug_m3', axis=1)
y_test = test_lag['o3_ug_m3']

# Fit on plain arrays; the recursive loop below also feeds plain arrays.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.values)
# One-step-ahead inputs (built from observed test values). Kept for
# diagnostics only -- the reported forecast below does not use them.
X_test_scaled = scaler.transform(X_test.values)

rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train_scaled, y_train)

# Recursive multi-step forecast: start from the last observed training values
# and feed each prediction back in as the newest lag. Scoring the forest on
# X_test directly would give it the true previous-hour values of the test
# period, i.e. a one-step-ahead score that is not comparable with the
# multi-step forecasts of the other models in this notebook.
n_lags = X_train.shape[1]
window = list(y_train.iloc[-n_lags:])  # oldest first, newest last
rf_pred = []
for _ in range(len(y_test)):
    # Feature order is lag_1 (most recent) .. lag_n, hence the reversal.
    features = np.array(window[::-1], dtype=float).reshape(1, -1)
    y_hat = float(rf.predict(scaler.transform(features))[0])
    rf_pred.append(y_hat)
    window = window[1:] + [y_hat]
rf_pred = np.array(rf_pred)

plt.plot(test_lag.index, y_test, label='Real')
plt.plot(test_lag.index, rf_pred, label='Random Forest')
plt.legend()
plt.title('Previsão Random Forest - O₃')
plt.show()

# Hold-out error metrics for the recursive forecast.
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
print(f"Random Forest -> MAE: {rf_mae:.2f} | RMSE: {rf_rmse:.2f}")

## Comparação de Modelos

In [None]:
# One (model, MAE, RMSE) row per model evaluated above, printed from the
# lowest RMSE to the highest.
results = pd.DataFrame(
    [
        ('SARIMA', sarima_mae, sarima_rmse),
        ('Holt-Winters', hw_mae, hw_rmse),
        ('Prophet', prophet_mae, prophet_rmse),
        ('Random Forest', rf_mae, rf_rmse),
    ],
    columns=['Modelo', 'MAE', 'RMSE'],
)
print(results.sort_values(by='RMSE'))

## Treinar Modelo Final (Random Forest) em Todo o Dataset

In [None]:
import joblib
from sklearn.pipeline import Pipeline

# Number of lagged observations used as model inputs.
LAGS = 24
# File the fitted pipeline is written to.
MODEL_PATH = "rf_o3_pipeline.joblib"

def create_lag_features(series: pd.Series, lags: int = LAGS) -> pd.DataFrame:
    """Build a supervised-learning frame from a single series.

    The result has the original values in column ``o3_ug_m3`` followed by
    ``lag_1`` .. ``lag_<lags>``, where ``lag_k`` is the series shifted down by
    ``k`` rows. Rows with any missing value (including the first ``lags``
    rows, whose lags are undefined) are dropped.
    """
    frame = pd.DataFrame({"o3_ug_m3": series})
    target = frame["o3_ug_m3"]
    shifted = [target.shift(step).rename(f"lag_{step}") for step in range(1, lags + 1)]
    return pd.concat([frame, *shifted], axis=1).dropna()

# Reload the raw CSV and index it by timestamp in chronological order.
df = (
    pd.read_csv("la_o3_1000points.csv", parse_dates=["timestamp"])
    .set_index("timestamp")
    .sort_index()
)

# Strip timezone information when the parsed index carries any.
if df.index.tz is not None:
    df.index = df.index.tz_localize(None)

# Supervised frame over the whole series: target column plus LAGS lag columns.
df_lag = create_lag_features(df["o3_ug_m3"], lags=LAGS)
y = df_lag["o3_ug_m3"].values
X = df_lag.drop(columns=["o3_ug_m3"]).values

# Scaler + forest bundled in one pipeline so a single artifact can be saved
# and later applied to raw lag vectors.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("rf", RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)),
]).fit(X, y)

joblib.dump(pipe, MODEL_PATH)
print(f"Modelo salvo em: {MODEL_PATH} (treinado com {len(y)} amostras)")

## Função de Previsão para Próximas 24 Horas

In [None]:
import io

LAGS = 24
MODEL_PATH = "rf_o3_pipeline.joblib"

def _load_csv(csv_input):
    if isinstance(csv_input, str) and "\n" in csv_input:
        df = pd.read_csv(io.StringIO(csv_input), parse_dates=["timestamp"])
    else:
        df = pd.read_csv(csv_input, parse_dates=["timestamp"])
    df = df.set_index("timestamp").sort_index()
    if df.index.tz is not None:
        df.index = df.index.tz_localize(None)
    return df

def predict_next_24_from_csv(csv_input, model_path: str = MODEL_PATH, decimals: int = 2,
                             *, steps: int = 24):
    """Forecast the next values of ``o3_ug_m3`` recursively from a CSV.

    The last ``LAGS`` observations of the CSV seed a sliding window; each
    prediction is pushed into the window and used as the newest lag for the
    following step.

    Args:
        csv_input: Inline CSV text, or anything ``pd.read_csv`` accepts. Must
            provide ``timestamp`` and ``o3_ug_m3`` columns.
        model_path: Path of the pipeline saved with ``joblib.dump``.
        decimals: Number of decimal places in the formatted output.
        steps: Number of future values to predict (keyword-only, default 24).

    Returns:
        The predictions as one comma-separated string.

    Raises:
        ValueError: If ``steps`` is smaller than 1, if fewer than ``LAGS``
            observations are available, or if the last ``LAGS`` observations
            contain missing values.
    """
    from collections import deque

    if steps < 1:
        raise ValueError("O número de passos (steps) deve ser pelo menos 1.")

    df = _load_csv(csv_input)
    series = df["o3_ug_m3"].astype(float)

    if len(series) < LAGS:
        raise ValueError(f"São necessários pelo menos {LAGS} pontos para iniciar a previsão.")

    recent = series.iloc[-LAGS:]
    # Fail early with a clear message instead of passing NaNs to the model.
    if recent.isna().any():
        raise ValueError(f"Os últimos {LAGS} pontos contêm valores ausentes (NaN).")

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files from a trusted source.
    pipe = joblib.load(model_path)

    # Window kept most-recent-first to match the training feature order
    # (lag_1, lag_2, ...). With maxlen set, appendleft() drops the oldest
    # value from the other end automatically.
    window = deque(recent.to_numpy()[::-1], maxlen=LAGS)
    preds = []

    for _ in range(steps):
        features = np.asarray(window, dtype=float).reshape(1, -1)
        y_hat = float(pipe.predict(features)[0])
        preds.append(y_hat)
        window.appendleft(y_hat)

    return ",".join(f"{value:.{decimals}f}" for value in preds)

In [None]:
# Example run: forecast from the same CSV file the model was trained on and
# print the comma-separated predictions.
preds_str = predict_next_24_from_csv("la_o3_1000points.csv")
print(preds_str)