# Problem setting

The administration of the nature park Montesinho in north-east Portugal wants to predict wild fires based on wheather data of the Fire-Wheather-Index (FWI). The aim is to reco- gnize the affected area and consequently the intensity of the imminent wild fire as early as possible in order to be able to adequatly assess the danger caused by the fire. To this aim, data from 517 wild fires have been collected. Develop a model that predicts the burnt forest area as accurately as possible from the given data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
data = pd.read_csv('fires.csv', sep=',')
data.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [3]:
# Data Preprocessing

# Log transformation of the area (target variable) to reduce skewness
data['log_area'] = data['area'].apply(lambda x: np.log1p(x))    # log1p(x) = log(1 + x) to avoid log(0) = undefined/ -inf

# Define categorical columns and numerical columns
categorical_cols = ['month', 'day']
numerical_cols = ['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain']

# Split the data into features and target
X = data[categorical_cols + numerical_cols]
y = data['log_area']

# Used a column transformer to handle categorical and numerical data
# StandardScaler removes the mean and scales the data to unit variance for numerical columns
# OneHotEncoder encodes categorical columns into one hot encoded binary vectors for categorical columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# MODEL TRAINING and FITTING

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

# Baseline model using the average of the target variable as the predictions.
y_pred_base = np.full_like(y_test, y_train.mean())

# Linear Regression model using Pipeline to chain the preprocessing step and model as a single unit
lin_reg = Pipeline(steps=[('preprocessor', preprocessor),
                          ('model', LinearRegression())])

lin_reg.fit(X_train, y_train)

# Ridge regression model with cross-validation for model selection and hyperparameter (lambda) tuning
ridge_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', Ridge())
])

# Parameter grid for lambda (alpha)
param_grid = {
    'model__alpha': np.logspace(-4, 4, 50)  # Range of lambda values from 10^-4 to 10^4 (log scale).
}

# Nested cross-validation using GridSearchCV to find the best lambda (alpha)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV for hyperparameter tuning (inner loop CV)
# Use negative MSE as scoring metric because GridSearchCV maximizes the score and we want to minimize MSE.
grid_search = GridSearchCV(estimator=ridge_pipe, param_grid=param_grid, cv=inner_cv, scoring='neg_mean_squared_error')

# Perform outer loop CV to train and evaluate model performance
cv_results = cross_val_score(grid_search, X_train, y_train, cv=outer_cv, scoring='neg_mean_squared_error') 

# Calculate RMSE for each fold in the outer cross-validation.
rmse_scores = np.sqrt(-cv_results)

# I got some nan values in the rmse_scores, so I replaced them with 0
rmse_scores = np.nan_to_num(rmse_scores)

# Fit the GridSearchCV to find the best lambda on the whole training set
grid_search.fit(X_train, y_train)
best_lambda = grid_search.best_params_['model__alpha']
# print(f'Best Lambda (alpha): {best_lambda}')  # Best Lambda (alpha): 0.0001

# Train the final Ridge regression model using the best lambda
ridge_final = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', Ridge(alpha=best_lambda))
])

ridge_final.fit(X_train, y_train)

# Random Forest model: a more complex model that can capture non-linear relationships
rf_reg = Pipeline(steps=[('preprocessor', preprocessor),
                         ('model', RandomForestRegressor(n_estimators=100, random_state=42))])  #100 DTs in the forest

rf_reg.fit(X_train, y_train)


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 345, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
        

In [None]:
# MODEL EVALUATION
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Make predictions on the test set
y_pred_lin = lin_reg.predict(X_test)
y_pred_rf = rf_reg.predict(X_test)
y_pred_ridge = ridge_final.predict(X_test)

# Evaluation using Mean Absolute Error and Root Mean Squared Error
# MAE is invariant of the magnitude of the differences
# MSE is more sensitive to outliers (large differences)
# RMSE is the square root of the MSE, which is the same unit as the target variable by cancelling out the square using the square root.
# Coefficient of determination, R^2 measures how well the model's prediction matches the true value. 1- (sum(sq_loss)/sum(baseline))
# Comparison of the two models using MAE and RMSE gives a better understanding of the performance of the models in terms of errors.


