In [None]:
import os
import pandas as pd
from sklearn import set_config
import numpy as np

In [None]:
# Ask scikit-learn transformers to hand back pandas objects instead of NumPy arrays.
set_config(transform_output="pandas")

# Render floats with four decimal places whenever pandas displays them.
pd.options.display.float_format = "{:.4f}".format

In [None]:
# Location of the forest-fires CSV, relative to the notebook's working directory.
path = os.path.join('data', 'csv', 'forestfires.csv')

# Read the raw table and append the log1p-transformed 'area' column in one step.
fire_df = pd.read_csv(path).assign(
    log_area=lambda frame: np.log1p(frame['area'])
)

In [None]:
# Set X and y
# Features: every column except 'X', 'Y' and the two target variants.
X = fire_df.drop(columns=['X', 'Y', 'area', 'log_area'])

# Target: the log1p-transformed area. Select the column instead of popping it:
# `pop` removed 'log_area' from `fire_df`, so running this cell a second time
# failed with a KeyError in the `drop` call above.
y = fire_df['log_area']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Custom encoding of months and days of the week that takes their cyclical nature into account
from sklearn.base import BaseEstimator, TransformerMixin

class MonthDayEncoder(BaseEstimator, TransformerMixin):
    """Encode the ``month`` and ``day`` (day-of-week) columns as numbers.

    Parameters
    ----------
    cyclical : bool, default=True
        If True, each column is projected onto the unit circle and returned
        as a sin/cos pair, so that December sits next to January and Sunday
        next to Monday. If False, each column is one-hot encoded over its
        full, fixed category set (12 months, 7 weekdays), which keeps the
        output width identical between train and test data.

    Notes
    -----
    The input must be a DataFrame with columns named ``'month'`` and
    ``'day'``. Values may be the three-letter abbreviations found in
    ``month_map`` / ``day_map`` (case-insensitive) or values that are already
    integer-like. Anything else raises ``ValueError`` from the integer cast.
    """

    def __init__(self, cyclical=True):
        self.cyclical = cyclical
        self.month_map = {
            'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
            'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
            'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
        }

        self.day_map = {
            'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7
        }

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X):
        """Return the encoded month/day features as a 2-D NumPy array.

        The result has 4 columns (month_sin, month_cos, day_sin, day_cos)
        when ``cyclical`` is True and 19 columns (12 month indicators followed
        by 7 day indicators) otherwise.
        """
        # Abbreviations are looked up in the maps; unknown keys fall through
        # unchanged so that already-numeric inputs keep working.
        month = X['month'].apply(lambda m: self.month_map.get(str(m).lower(), m)).astype(int)
        day = X['day'].apply(lambda d: self.day_map.get(str(d).lower(), d)).astype(int)

        if self.cyclical:
            # The period of each cycle is the number of distinct codes:
            # 12 months and 7 weekdays. (The weekday period used to be 31,
            # which squeezed Mon..Sun into a quarter of the circle and never
            # brought Sunday back next to Monday.)
            month_angle = 2 * np.pi * month / len(self.month_map)
            day_angle = 2 * np.pi * day / len(self.day_map)
            return np.stack(
                [
                    np.sin(month_angle),
                    np.cos(month_angle),
                    np.sin(day_angle),
                    np.cos(day_angle),
                ],
                axis=1,
            )

        # One-hot encoding. `pd.get_dummies` leaves plain integer columns
        # untouched, so the codes are first turned into categoricals with the
        # complete category list; this also yields the same columns no matter
        # which months/days happen to be present in X.
        encoded = pd.DataFrame({
            'month': pd.Categorical(month, categories=self._codes(self.month_map)),
            'day': pd.Categorical(day, categories=self._codes(self.day_map)),
        })
        return pd.get_dummies(encoded, dtype=float).values

    def get_feature_names_out(self, input_features=None):
        """Names of the columns produced by :meth:`transform`, in order.

        Defining this method lets scikit-learn's ``set_output`` machinery
        (e.g. ``set_config(transform_output='pandas')``) wrap the array in a
        DataFrame with meaningful column names.
        """
        if self.cyclical:
            names = ['month_sin', 'month_cos', 'day_sin', 'day_cos']
        else:
            names = (
                [f'month_{code}' for code in self._codes(self.month_map)]
                + [f'day_{code}' for code in self._codes(self.day_map)]
            )
        return np.asarray(names, dtype=object)

    @staticmethod
    def _codes(mapping):
        """Sorted, de-duplicated integer codes of a name -> code mapping."""
        return sorted(set(mapping.values()))


In [None]:
# Display
from tabulate import tabulate
from sklearn.metrics import root_mean_squared_error, median_absolute_error

def display_fun(y_train, y_test, y_train_pred, y_test_pred):
    """Print RMSE and median absolute error for the test and train splits.

    Parameters
    ----------
    y_train, y_test : array-like
        True target values of the train and test splits.
    y_train_pred, y_test_pred : array-like
        Predictions for the corresponding splits.

    The table has one row per split ('Test' first, then 'Train') and the
    columns 'RMSE' and 'MAD' (the latter holds ``median_absolute_error``).
    Nothing is returned; the table is written to stdout.
    """
    scores_df = pd.DataFrame.from_dict(
        {
            'Test': [
                root_mean_squared_error(y_test, y_test_pred),
                median_absolute_error(y_test, y_test_pred),
            ],
            'Train': [
                root_mean_squared_error(y_train, y_train_pred),
                median_absolute_error(y_train, y_train_pred),
            ],
        },
        orient='index',
        columns=['RMSE', 'MAD'],
    )

    # Render the table before formatting: reusing the enclosing quote
    # character inside an f-string expression is a SyntaxError before
    # Python 3.12.
    table = tabulate(scores_df, headers='keys')
    print(f'{table}\n')

In [None]:
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.dummy import DummyRegressor
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression, mutual_info_regression, RFECV, SelectFromModel
from sklearn.svm import SVR


from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler



# Everything in X apart from the two calendar columns is treated as numeric.
numeric_features = list(X.drop(columns=['month', 'day']).columns)

# Only referenced by the disabled 'weather' step below.
weather_features = ['temp', 'RH', 'wind', 'rain']

# Column-wise preprocessing: the calendar columns go through the custom
# encoder, the numeric columns through MinMaxScaler.
preprocessor = ColumnTransformer(
    transformers=[
        # ('weather', StandardScaler(), weather_features),
        ('month_day', MonthDayEncoder(cyclical=True), ['month', 'day']),
        ('num', MinMaxScaler(), numeric_features),
    ]
)


# Candidate models. Each pipeline starts with `preprocessor` and ends with a regressor.

# Presumably an earlier XGBoost trial, kept for reference:
# 11, learning_rate = 0.2711, alpha = .1,  max_depth  = 4
xgb_model = Pipeline([
    ('preprocessing', preprocessor),
    ('feature', SelectKBest(score_func=f_regression, k=10)),
    ('xgbregressor', XGBRegressor(n_estimators=50, learning_rate=0.34, max_depth=8))
])

tree_model = Pipeline([
    ('preprocessing', preprocessor),
    # ('feature', SelectKBest(score_func=f_regression, k=8)),
    ('tree', DecisionTreeRegressor(max_depth=7, random_state=42, criterion='absolute_error', max_features=10))
])

forest_model = Pipeline([
    ('preprocessing', preprocessor),
    # ('feature', SelectKBest(score_func=f_regression, k=3)),
    ('tree', RandomForestRegressor(n_estimators=10, max_depth=8, random_state=42, max_features=7))
])

# Baseline: DummyRegressor with its default strategy.
dummy_model = Pipeline([
    ('preprocessing', preprocessor),
    ('dummy', DummyRegressor())
])

svm_model = Pipeline([
    # Encodes month/day and min-max scales every numeric feature
    # (not only temp, RH, wind and rain).
    ('preprocessing', preprocessor),
    ('svr', SVR(kernel='rbf', C=10, epsilon=0.1))
])

adab_model = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=True, with_std=True)),
    ('adaregressor', AdaBoostRegressor(
        estimator=RandomForestRegressor(max_depth=5, random_state=42),
        # Seed the booster itself: AdaBoost draws the per-iteration bootstrap
        # samples and the seeds passed to each base estimator from its own
        # `random_state`, so without it the fit is not reproducible even
        # though the forest above is seeded.
        random_state=42,
    ))
])


# Pick the pipeline to evaluate and train it on the log-scale target.
model = tree_model
model.fit(X_train, y_train)

# Log-scale predictions for both splits (train first, then test).
y_train_pred_log, y_test_pred_log = (
    model.predict(split) for split in (X_train, X_test)
)

# Invert the log1p transform on the true targets ...
y_test_exp, y_train_exp = np.expm1(y_test), np.expm1(y_train)

# ... and on the predictions, so the scores are on the original 'area' scale.
y_train_pred, y_test_pred = np.expm1(y_train_pred_log), np.expm1(y_test_pred_log)

display_fun(y_train_exp, y_test_exp, y_train_pred, y_test_pred)
# Disabled alternative call that passes the log-scale targets instead:
# display_fun(y_train, y_test, y_train_pred, y_test_pred)