In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA


In [7]:
# Load the labelled daily records. The path uses forward slashes so it resolves
# on Linux/macOS as well as on Windows; a backslash is a path separator only on
# Windows, so the previous spelling was read as one literal file name elsewhere.
df = pd.read_csv("data/labelled/santacruz_labelled_days.csv")
# Convert DATE from text to pandas datetimes.
df['DATE'] = pd.to_datetime(df['DATE'])
# NOTE: the file's first column has no header, so it is loaded as `Unnamed: 0`.
df

Unnamed: 0,DATE,DPT,WBT,DBT,Normal_Temp,Heatwave,Heatwave_Days
0,2010-01-01,16.0875,19.775,25.725000,30.4,0.0,0
1,2010-01-02,18.2000,21.375,26.975000,30.4,0.0,0
2,2010-01-03,19.7375,22.025,26.300000,30.4,0.0,0
3,2010-01-04,18.0250,20.450,24.575000,30.4,0.0,0
4,2010-01-05,18.5500,20.575,24.175000,30.4,0.0,0
...,...,...,...,...,...,...,...
5474,2024-12-27,13.4625,27.400,21.847186,31.9,0.0,0
5475,2024-12-28,13.4625,27.400,21.261927,31.9,0.0,0
5476,2024-12-29,13.4625,27.400,21.481941,31.9,0.0,0
5477,2024-12-30,13.4625,27.400,21.161551,31.9,0.0,0


In [8]:
# Ordered hold-out: with shuffling disabled the leading 80% of rows become the
# training frame and the trailing 20% become the test frame, row order intact.
train_df, test_df = train_test_split(
    df,
    shuffle=False,
    test_size=0.2,
)


In [9]:
# Fit an ARIMA model of order (5, 1, 0) to the training-period DBT column.
# DBT itself is present in the data; the model is fitted so that forecasted
# values can be produced for the test period.
arima_model = ARIMA(endog=train_df['DBT'], order=(5, 1, 0))
arima_result = arima_model.fit()


In [10]:
# Project DBT forward by one step per test row, then attach the projections to
# a new copy of the test frame as `DBT_ARIMA`. The values are attached as a
# plain array, i.e. matched to the rows by position rather than by index label.
forecasted_dbt = arima_result.forecast(steps=test_df.shape[0])
test_df = test_df.assign(DBT_ARIMA=forecasted_dbt.to_numpy())

In [11]:
# Training inputs use the observed DBT values.
X_train = train_df.loc[:, ['DPT', 'WBT', 'DBT', 'Normal_Temp']].copy()
# Test inputs swap the observed DBT for the ARIMA projection: take the other
# three predictors, then append the forecast under the column name `DBT`.
X_test = test_df[['DPT', 'WBT', 'Normal_Temp']].assign(DBT=test_df['DBT_ARIMA'])


In [12]:
# Put the test columns into the same order as the training feature matrix.
X_test = X_test.loc[:, ['DPT', 'WBT', 'DBT', 'Normal_Temp']]


In [13]:
# Targets: the `Heatwave_Days` column, cast to int, for each side of the split.
y_train, y_test = (
    split['Heatwave_Days'].astype(int) for split in (train_df, test_df)
)


In [14]:
# Random forest of 100 trees with a fixed seed, fitted on the training split.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X=X_train, y=y_train)

In [15]:
# Class predictions for every row of the test feature matrix
# (whose DBT column holds the ARIMA forecast, not the observed value).
rf_preds = rf_model.predict(X=X_test)


In [27]:
# Score the predictions against the true labels: raw confusion counts plus
# scikit-learn's plain-text per-class report (not displayed in this cell).
conf_matrix = confusion_matrix(y_true=y_test, y_pred=rf_preds)
class_report = classification_report(y_true=y_test, y_pred=rf_preds, output_dict=False)

In [29]:
# Extract TN, FP, FN, TP from the confusion matrix.
# scikit-learn lays a binary matrix out as [[tn, fp], [fn, tp]], so ravel()
# yields the four counts in exactly that order -- but only for a 2x2 matrix.
if conf_matrix.shape != (2, 2):
    raise ValueError(
        "Expected a 2x2 confusion matrix (binary labels), "
        f"got shape {conf_matrix.shape}"
    )

# Convert the NumPy integers to built-in ints so every ratio below is a plain
# Python float. This keeps the printed dict free of `np.float64(...)` wrappers
# under NumPy >= 2 and makes the summary directly JSON-serialisable.
tn, fp, fn, tp = (int(count) for count in conf_matrix.ravel())
total = tp + tn + fp + fn

# Each ratio falls back to 0 when its denominator is zero (e.g. no positive
# predictions), instead of dividing by zero.
metrics_summary = {
    "Accuracy": (tp + tn) / total if total != 0 else 0,
    "Precision (Heatwave)": tp / (tp + fp) if (tp + fp) != 0 else 0,
    "Recall (Heatwave)": tp / (tp + fn) if (tp + fn) != 0 else 0,
    "F1 Score (Heatwave)": 2 * tp / (2 * tp + fp + fn) if (2 * tp + fp + fn) != 0 else 0
}
print(metrics_summary)

{'Accuracy': 0.9397810218978102, 'Precision (Heatwave)': 0.07017543859649122, 'Recall (Heatwave)': 0.23529411764705882, 'F1 Score (Heatwave)': 0.10810810810810811}
