In [1]:
import numpy as np
import os
import pandas as pd
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import BayesianRidge


In [None]:

def load_prepared_data(folder_path):
    df = pd.read_parquet(folder_path)
    return df


In [None]:

def load_all_data_from_folder(folder_path):
    all_files = sorted([f for f in os.listdir(folder_path) if f.endswith(".csv")])
    df_list = [pd.read_csv(os.path.join(folder_path, file), parse_dates=["data_hora_gmt"]) for file in all_files]
    df = pd.concat(df_list, ignore_index=True)
    return df

def feature_engineering(df):
    df=df[df['risco_fogo']>0]
    df["day_of_year"] = df["data_hora_gmt"].dt.dayofyear
    df["month"] = df["data_hora_gmt"].dt.month
    df["weekday"] = df["data_hora_gmt"].dt.weekday
    df["hour"] = df["data_hora_gmt"].dt.hour
    df["cos_day_of_year"] = encode_day_of_year_cyclic(df["day_of_year"])
    return df

def prepare_data(df):
    df = df.drop(columns=["id", "municipio_id", "estado_id", "pais_id"], errors='ignore')
    if 'risco_fogo' in df.columns:
        df = df.dropna(subset=["risco_fogo"])
    df = feature_engineering(df)

    df = df.drop(columns=["data_hora_gmt", "municipio", "estado", "pais"], errors='ignore')
    df = pd.get_dummies(df, columns=["satelite", "bioma"], drop_first=True)
    df = df.fillna(0)

    return df

def split_train_val(df):
    df = df.sort_values(by="day_of_year")
    unique_days = df["day_of_year"].unique()
    split_day = sorted(unique_days)[-30]  # Last month = last 30 days
    train_df = df[df["day_of_year"] < split_day]
    val_df = df[df["day_of_year"] >= split_day]

    X_train = train_df.drop(columns=["risco_fogo"])
    y_train = train_df["risco_fogo"]
    X_val = val_df.drop(columns=["risco_fogo"])
    y_val = val_df["risco_fogo"]

    return X_train, X_val, y_train, y_val

def load_and_prepare_data(folder_path):
    df = load_all_data_from_folder(folder_path)
    df = prepare_data(df)
    return split_train_val(df)


In [None]:
models = {
    "Lasso": Lasso(alpha=0.1),
    "DecisionTree": DecisionTreeRegressor(max_depth=10, random_state=42),
    "RandomForest": RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "NeuralNetwork": MLPRegressor(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', max_iter=500, random_state=42),
    "Elastic_Net":ElasticNet(alpha=1.0, l1_ratio=0.5),
    "BayesianRidge":BayesianRidge(),
}


In [4]:
# train_model.py
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib


def evaluate_model(name, model, X_val, y_val):
    y_pred = model.predict(X_val)
    print(f"🔍 {name}")
    print(f"MAE: {mean_absolute_error(y_val, y_pred):.4f}")
    print(f"RMSE: {mean_squared_error(y_val, y_pred):.4f}")
    print(f"R² Score: {r2_score(y_val, y_pred):.4f}")
    print("-" * 40)
    return mean_squared_error(y_val, y_pred), model

def train_model(folder_path):
    X_train, X_val, y_train, y_val = load_and_prepare_data(folder_path)

    best_metric = float("inf")
    best_model = None
    best_name = ""

    for name, model in models.items():
        model.fit(X_train, y_train)
        metric, trained_model = evaluate_model(name, model, X_val, y_val)
        if metric < best_metric:
            best_metric = metric
            best_model = trained_model
            best_name = name

    joblib.dump(best_model, "fire_risk_model.pkl")
    print(f"✅ Best model ({best_name}) saved to fire_risk_model.pkl")

if __name__ == "__main__":
    train_model("data/")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["day_of_year"] = df["data_hora_gmt"].dt.dayofyear
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["month"] = df["data_hora_gmt"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["weekday"] = df["data_hora_gmt"].dt.weekday
A value is trying to be set on a copy of a slice from a DataFr

🔍 Lasso
MAE: 0.3604
RMSE: 0.2141
R² Score: -0.4437
----------------------------------------
🔍 DecisionTree
MAE: 0.2199
RMSE: 0.0870
R² Score: 0.4137
----------------------------------------
🔍 RandomForest
MAE: 0.2161
RMSE: 0.0836
R² Score: 0.4361
----------------------------------------
🔍 GradientBoosting
MAE: 0.2024
RMSE: 0.0713
R² Score: 0.5192
----------------------------------------
🔍 KNN
MAE: 0.2098
RMSE: 0.0889
R² Score: 0.4008
----------------------------------------
🔍 NeuralNetwork
MAE: 0.1972
RMSE: 0.0764
R² Score: 0.4846
----------------------------------------
🔍 Elastic_Net
MAE: 0.3890
RMSE: 0.2333
R² Score: -0.5733
----------------------------------------
🔍 BayesianRidge
MAE: 0.3057
RMSE: 0.1481
R² Score: 0.0011
----------------------------------------
✅ Best model (GradientBoosting) saved to fire_risk_model.pkl
