In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [3]:
# Yearly inputs live at data/_<year>/_merged/merged_data.csv.
# The paths are assembled with os.path.join rather than written as
# "data\_2010\_merged\..." literals: "\_" and "\m" are invalid escape
# sequences (SyntaxWarning on Python 3.12+), and hard-coded backslash
# separators only resolve on Windows. On Windows os.path.join yields the
# exact same strings as before.
FIRST_YEAR = 2010
LAST_YEAR = 2024  # inclusive; raise this to pick up additional yearly files

csv_files = [
    os.path.join("data", f"_{year}", "_merged", "merged_data.csv")
    for year in range(FIRST_YEAR, LAST_YEAR + 1)
]

# Read each yearly CSV and stack them into one frame with a fresh 0..N-1 index.
df_list = [pd.read_csv(file) for file in csv_files]
new_merged_df = pd.concat(df_list, ignore_index=True)

In [7]:
# Inspect the concatenated frame built from all yearly CSVs.
new_merged_df

Unnamed: 0,latitude,longitude,max_temperature,mean_temperature,max_precipitation,mean_precipitation,mslp,month,day_of_year,year
0,18.80,72.8,300.55830,298.53610,0.000000e+00,0.000000e+00,101107.070,1,1,2010
1,19.05,72.8,301.07275,298.24356,0.000000e+00,0.000000e+00,101114.100,1,1,2010
2,18.80,72.8,301.43620,299.36893,3.810000e-07,1.590000e-08,101242.380,1,2,2010
3,19.05,72.8,301.72270,299.18018,0.000000e+00,0.000000e+00,101246.875,1,2,2010
4,18.80,72.8,300.03796,298.97437,1.910000e-07,7.940000e-09,101397.836,1,3,2010
...,...,...,...,...,...,...,...,...,...,...
10953,19.05,72.8,300.16570,298.23690,0.000000e+00,0.000000e+00,101466.620,12,364,2024
10954,18.80,72.8,300.97717,298.98740,0.000000e+00,0.000000e+00,101442.690,12,365,2024
10955,19.05,72.8,301.34230,299.03720,0.000000e+00,0.000000e+00,101448.500,12,365,2024
10956,18.80,72.8,300.60540,299.00990,0.000000e+00,0.000000e+00,101487.400,12,366,2024


In [None]:
# # Ensure valid_time is in datetime format
# new_merged_df["valid_time"] = pd.to_datetime(new_merged_df["valid_time"], format="%d-%m-%y", errors="coerce")

# # Extract numeric features from date
# new_merged_df["month"] = new_merged_df["valid_time"].dt.month  # Extract month (1-12)
# new_merged_df["day_of_year"] = new_merged_df["valid_time"].dt.dayofyear 
# new_merged_df["year"] = new_merged_df["valid_time"].dt.year         # Extract Year
#  # Extract day of year (1-366; 366 occurs in leap years)

# # Drop the original datetime column
# new_merged_df.drop(columns=["valid_time"], inplace=True)

KeyError: 'valid_time'

In [9]:
# One-time export of the merged (unlabelled) frame; an existing export is
# left untouched.
unlabelled_csv = "data/_15year_without_labelled.csv"
if not os.path.exists(unlabelled_csv):
    new_merged_df.to_csv(unlabelled_csv)
    print("File created")
else:
    print("Files already exist")

File created


In [None]:
# Load the labelled dataset used for modelling.
labelled_csv = "data/_15year_heat_labelled.csv"
df = pd.read_csv(labelled_csv)

In [None]:
# Preview the labelled frame as loaded. The leftover index column is spelled
# 'Unnamed: 0' (with a space); a drop using 'Unnamed:0' would not match it.
df

In [None]:
# Drop the leftover CSV index column. errors='ignore' keeps this cell
# re-runnable: once the column is gone, a plain drop would raise KeyError on
# a second execution. columns= already implies the column axis, so no axis
# argument is needed.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')


In [None]:
# Feature matrix: every column of df except the target label.
X = df.drop('heatwave_label', axis='columns')
X

In [None]:
# Target vector: the heatwave label column.
y = df.loc[:, 'heatwave_label']
y

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)

In [None]:
def evaluate_model(true, predicted):
    """Score predictions with MAE, RMSE and R2.

    Args:
        true: Ground-truth target values.
        predicted: Predicted values, aligned element-wise with ``true``.

    Returns:
        tuple: ``(mae, rmse, r2_square)`` -- mean absolute error, root mean
        squared error and coefficient of determination, in that order.

    NOTE(review): these are regression metrics. The callers in this notebook
    pass class labels produced by classifiers; classification metrics
    (accuracy, F1, ...) would likely be more meaningful -- confirm intent.
    """
    mae = mean_absolute_error(true, predicted)
    # MSE is computed once, directly inside the square root; no separate
    # `mse` local is kept because it was never returned.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:
# Seed for the estimators that are stochastic by default, so that re-running
# this cell reproduces the same scores and the same model ranking.
SEED = 42

# Candidate classifiers, keyed by the display name used in the report below.
models = {
    "LDA": LinearDiscriminantAnalysis(),
    "Lightgbm": LGBMClassifier(),
    "Ridge": RidgeClassifier(),
    "MLP": MLPClassifier(random_state=SEED),
    "MultinomialNB": MultinomialNB(),  # key previously misspelled "MultimodalNB"
    "GaussianNB": GaussianNB(),
    "Bernoulli": BernoulliNB(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "Random Forest Classifier": RandomForestClassifier(random_state=SEED),
    "XGBClassifier": XGBClassifier(),
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=SEED),
    "SVM": SVC(),
}

# Parallel lists consumed by the summary table in the next cell.
model_list = []
r2_list = []

# NOTE(review): evaluate_model reports regression metrics (MAE / RMSE / R2)
# on predicted class labels; the R2 ranking is kept because the summary cell
# depends on r2_list -- consider adding classification metrics.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print(f"- Root Mean Squared Error: {model_train_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_train_mae:.4f}")
    print(f"- R2 Score: {model_train_r2:.4f}")

    print('----------------------------------')

    print('Model performance for Test set')
    print(f"- Root Mean Squared Error: {model_test_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_test_mae:.4f}")
    print(f"- R2 Score: {model_test_r2:.4f}")
    r2_list.append(model_test_r2)

    print('\n')


In [None]:

# Leaderboard: one row per model, best test-set R2 first.
results_df = pd.DataFrame(
    list(zip(model_list, r2_list)),
    columns=['Model Name', 'R2_Score'],
)
results_df.sort_values(by=["R2_Score"], ascending=False)
