In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# Yearly merged CSV exports to combine, one per year:
#   data/_<year>/_merged/merged_data.csv
# Widen START_YEAR / END_YEAR to add more files.
#
# The paths are assembled with os.path.join instead of being written as
# backslash literals: "\_" and "\m" are not valid escape sequences in a
# regular Python string (recent interpreters warn about them), and a
# backslash separator only resolves on Windows. On Windows os.path.join
# yields exactly the strings the literals used to produce.
START_YEAR = 2010
END_YEAR = 2024
csv_files = [
    os.path.join("data", "_{}".format(year), "_merged", "merged_data.csv")
    for year in range(START_YEAR, END_YEAR + 1)
]

# Read every yearly file, then stack the frames row-wise; ignore_index=True
# discards the per-file indexes and renumbers the rows of the result.
df_list = [pd.read_csv(file) for file in csv_files]
new_merged_df = pd.concat(df_list, ignore_index=True)

In [None]:
# Show the concatenated frame as this cell's output.
new_merged_df

In [None]:
# Write the merged frame to disk once; an existing file is left untouched.
unlabelled_csv = "data/_15year_without_labelled.csv"
if not os.path.exists(unlabelled_csv):
    new_merged_df.to_csv(unlabelled_csv)
    print("File created")
else:
    print("Files already exist")

In [None]:
# Load the heatwave-labelled dataset. None of the visible cells writes this
# file, so it has to be present on disk before this cell runs.
df = pd.read_csv("data/_15year_heat_labelled.csv")

In [None]:
# Show the labelled frame as loaded; the following cell drops its
# 'Unnamed: 0' column.
df

In [None]:
# Drop the 'Unnamed: 0' column (presumably a row index saved by an earlier
# to_csv call - TODO confirm). errors='ignore' makes the cell safe to re-run:
# without it a second execution raises KeyError because `df` has already
# lost the column.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')


In [None]:
# Feature matrix: every column of df except the target 'heatwave_label'.
X = df.drop(labels='heatwave_label', axis='columns')
X

In [None]:
# Target vector: the 'heatwave_label' column.
y = df.loc[:, 'heatwave_label']
y

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)

In [None]:
def evaluate_model(true, predicted):
    """Score predictions against the ground truth.

    Args:
        true: Ground-truth target values.
        predicted: Predicted values aligned element-wise with ``true``.

    Returns:
        A ``(mae, rmse, r2)`` tuple: mean absolute error, root mean squared
        error and R2 score, in that order.

    NOTE(review): these are regression metrics, while the callers visible in
    this notebook pass the output of classifiers. Consider accuracy /
    precision / recall / F1 for the model comparison - confirm how the
    labels are encoded before relying on these numbers.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it; previously
    # mean_squared_error was called twice and the first result was unused.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:
# Seed for the scikit-learn estimators below that draw random numbers while
# fitting; 42 matches the random_state used for the train/test split, so
# re-running this cell reports the same scores for those models.
# NOTE(review): LightGBM / XGBoost / CatBoost are left on their library
# defaults - confirm whether they should be seeded explicitly as well.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the label used in the printed report and
# collected into model_list.
models = {
    "LDA": LinearDiscriminantAnalysis(),
    "Lightgbm": LGBMClassifier(),
    "Ridge": RidgeClassifier(),
    "MLP": MLPClassifier(random_state=RANDOM_STATE),
    # NOTE(review): the label says "MultimodalNB" but the estimator is
    # MultinomialNB; the key is kept unchanged in case other cells use it.
    "MultimodalNB": MultinomialNB(),
    "GaussianNB": GaussianNB(),
    "Bernoulli": BernoulliNB(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest Classifier": RandomForestClassifier(random_state=RANDOM_STATE),
    "XGBClassifier": XGBClassifier(),
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=RANDOM_STATE),
    "SVM": SVC()
}
model_list = []  # model labels, in evaluation order
r2_list = []     # test-set R2 of each model, aligned with model_list

# Iterate the (label, estimator) pairs directly rather than rebuilding
# list(models.keys()) / list(models.values()) and indexing them on each pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    # Only the test-set R2 is kept for the summary table.
    r2_list.append(model_test_r2)

    print('\n')


In [None]:

# Pair each model label with its test-set R2 and list the best score first.
leaderboard = pd.DataFrame(
    list(zip(model_list, r2_list)),
    columns=['Model Name', 'R2_Score'],
)
leaderboard.sort_values(by="R2_Score", ascending=False)
