In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings


In [65]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [66]:
df = pd.read_csv("data\_2024\_labelled\heat_labelled.csv")

In [67]:
df

Unnamed: 0,latitude,longitude,max_temperature,mean_temperature,max_precipitation,mean_precipitation,mslp,month,day_of_year,year,heatwave_label
0,18.80,72.8,300.28345,298.79312,0.0,0.0,101279.910,1,1,2024,0
1,19.05,72.8,300.91122,298.72922,0.0,0.0,101281.370,1,1,2024,0
2,18.80,72.8,300.19160,298.49875,0.0,0.0,101224.320,1,2,2024,0
3,19.05,72.8,300.74103,298.40060,0.0,0.0,101227.850,1,2,2024,0
4,18.80,72.8,299.84067,298.13846,0.0,0.0,101160.195,1,3,2024,0
...,...,...,...,...,...,...,...,...,...,...,...
727,19.05,72.8,300.16570,298.23690,0.0,0.0,101466.620,12,364,2024,0
728,18.80,72.8,300.97717,298.98740,0.0,0.0,101442.690,12,365,2024,0
729,19.05,72.8,301.34230,299.03720,0.0,0.0,101448.500,12,365,2024,0
730,18.80,72.8,300.60540,299.00990,0.0,0.0,101487.400,12,366,2024,0


In [68]:
X = df.drop(columns=['heatwave_label'], axis=1)
X

Unnamed: 0,latitude,longitude,max_temperature,mean_temperature,max_precipitation,mean_precipitation,mslp,month,day_of_year,year
0,18.80,72.8,300.28345,298.79312,0.0,0.0,101279.910,1,1,2024
1,19.05,72.8,300.91122,298.72922,0.0,0.0,101281.370,1,1,2024
2,18.80,72.8,300.19160,298.49875,0.0,0.0,101224.320,1,2,2024
3,19.05,72.8,300.74103,298.40060,0.0,0.0,101227.850,1,2,2024
4,18.80,72.8,299.84067,298.13846,0.0,0.0,101160.195,1,3,2024
...,...,...,...,...,...,...,...,...,...,...
727,19.05,72.8,300.16570,298.23690,0.0,0.0,101466.620,12,364,2024
728,18.80,72.8,300.97717,298.98740,0.0,0.0,101442.690,12,365,2024
729,19.05,72.8,301.34230,299.03720,0.0,0.0,101448.500,12,365,2024
730,18.80,72.8,300.60540,299.00990,0.0,0.0,101487.400,12,366,2024


In [69]:
y = df['heatwave_label']
y

0      0
1      0
2      0
3      0
4      0
      ..
727    0
728    0
729    0
730    0
731    0
Name: heatwave_label, Length: 732, dtype: int64

In [70]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
X_train.shape, X_test.shape

((512, 10), (220, 10))

In [71]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [72]:
models = {
    "Ridge": RidgeClassifier(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(), 
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "SVM": SVC()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.2997
- Mean Absolute Error: 0.0898
- R2 Score: 0.1439
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.2023
- Mean Absolute Error: 0.0409
- R2 Score: 0.2642


K-Neighbors Classifier
Model performance for Training set
- Root Mean Squared Error: 0.2864
- Mean Absolute Error: 0.0820
- R2 Score: 0.2183
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.3233
- Mean Absolute Error: 0.1045
- R2 Score: -0.8803


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.0674
- Mean Absolute Error: 0.0045
- R2 Score: 0.9182


Random Forest Classifier
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
------

In [73]:

pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)


Unnamed: 0,Model Name,R2_Score
4,XGBClassifier,1.0
5,CatBoosting Classifier,1.0
2,Decision Tree,0.918246
3,Random Forest Classifier,0.918246
6,AdaBoost Classifier,0.918246
0,Ridge,0.264214
7,SVM,-0.062802
1,K-Neighbors Classifier,-0.880342
