In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings 
warnings.filterwarnings('ignore')


In [9]:
# Load the processed IMD observations.
# The path is built with os.path.join instead of the literal "data\processed_imd.csv":
# "\p" is an invalid escape sequence (a warning today, slated to become an error)
# and a backslash is only a directory separator on Windows.
# NOTE(review): the "Unnamed: 0" column in the displayed frame suggests the CSV
# was written with its index; consider index_col=0 after checking downstream use.
imd_df = pd.read_csv(os.path.join("data", "processed_imd.csv"))

In [10]:
# Bare last expression: the notebook shows the loaded frame as the cell output.
imd_df

Unnamed: 0,YEAR,MN,HR,DT,MSLP,DBT,WBT,DPT,RH
0,2010,1,0,1,1010.1,21.2,17.4,14.8,67
1,2010,1,0,2,1011.2,22.0,19.4,17.8,77
2,2010,1,0,3,1013.2,24.0,21.4,20.0,78
3,2010,1,0,4,1013.7,22.4,18.6,16.1,68
4,2010,1,0,5,1011.0,19.4,18.4,17.8,90
...,...,...,...,...,...,...,...,...,...
35457,2024,12,60,27,0.0,25.4,27.4,22.1,82
35458,2024,12,60,28,0.0,25.4,27.4,22.1,82
35459,2024,12,60,29,0.0,26.0,27.4,20.2,70
35460,2024,12,60,30,0.0,26.2,27.4,22.0,78


In [23]:
# Step 1: Climatological mean of DBT per month (MN), taken over every row of the frame.
# Result is a Series indexed by MN, used as the baseline for temperature anomalies.
climatology_mean = imd_df["DBT"].groupby(imd_df["MN"]).mean()

# Step 2: Function to label heatwave days (Binary Classification)
def label_heatwave(row, climatology=None, abs_threshold=37, anomaly_threshold=4.5):
    """Return 1 when a row meets the heatwave criteria, otherwise 0.

    A row is flagged when its DBT reaches ``abs_threshold`` OR exceeds the
    climatological mean of its month by at least ``anomaly_threshold``.
    The defaults (37 and 4.5) are the values this notebook labels as the
    "IMD Heatwave Criteria".

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row["MN"]`` (month key) and ``row["DBT"]`` (temperature).
    climatology : mapping or pandas.Series, optional
        Mean DBT per month, looked up with ``row["MN"]``. When omitted, the
        notebook-level ``climatology_mean`` is used, so existing calls such as
        ``df.apply(label_heatwave, axis=1)`` behave as before.
    abs_threshold : float, default 37
        Absolute DBT at or above which the row is flagged.
    anomaly_threshold : float, default 4.5
        Departure from the monthly mean at or above which the row is flagged.

    Returns
    -------
    int
        1 for a heatwave row, 0 otherwise.

    Raises
    ------
    KeyError
        If the row's month is missing from ``climatology``.
    """
    if climatology is None:
        # Fall back to the monthly means computed earlier in the notebook.
        climatology = climatology_mean

    # Temperature anomaly relative to the monthly mean.
    anomaly = row["DBT"] - climatology[row["MN"]]

    if row["DBT"] >= abs_threshold or anomaly >= anomaly_threshold:
        return 1  # Heatwave
    return 0  # No Heatwave

# Step 3: Apply binary heatwave labeling (1 = criteria met on that row, 0 = not)
imd_df["Heatwave_Label"] = imd_df.apply(label_heatwave, axis=1)

# Step 4: Running length of the current streak of flagged rows.
# A new run starts wherever the flag differs from the previous row, so each run
# of 0s or 1s gets its own id and positions inside a run count from 1.
# (Grouping on (flag == 0).cumsum() would put a run of 1s in the same group as
# the 0 preceding it, so the first flagged row would already count as 2 and an
# isolated heatwave row would pass the ">= 2" filter below.)
heatwave_flag = (imd_df["Heatwave_Label"] == 1).astype(int)
run_id = (heatwave_flag != heatwave_flag.shift()).cumsum()
imd_df["Consecutive_Heatwave"] = heatwave_flag * (heatwave_flag.groupby(run_id).cumcount() + 1)

# Filter: a row is a final heatwave row only from the 2nd consecutive flagged row onward
# NOTE(review): "consecutive" means adjacent rows in the frame's current order;
# confirm the frame is sorted chronologically with one row per day.
imd_df["Final_Heatwave_Label"] = (imd_df["Consecutive_Heatwave"] >= 2).astype(int)



In [24]:
# Bare last expression: shows the frame again, now including the three label columns.
imd_df

Unnamed: 0,YEAR,MN,HR,DT,MSLP,DBT,WBT,DPT,RH,Heatwave_Label,Consecutive_Heatwave,Final_Heatwave_Label
0,2010,1,0,1,1010.1,21.2,17.4,14.8,67,0,0,0
1,2010,1,0,2,1011.2,22.0,19.4,17.8,77,0,0,0
2,2010,1,0,3,1013.2,24.0,21.4,20.0,78,0,0,0
3,2010,1,0,4,1013.7,22.4,18.6,16.1,68,0,0,0
4,2010,1,0,5,1011.0,19.4,18.4,17.8,90,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
35457,2024,12,60,27,0.0,25.4,27.4,22.1,82,0,0,0
35458,2024,12,60,28,0.0,25.4,27.4,22.1,82,0,0,0
35459,2024,12,60,29,0.0,26.0,27.4,20.2,70,0,0,0
35460,2024,12,60,30,0.0,26.2,27.4,22.0,78,0,0,0


In [25]:
# Write the labelled frame to disk once; an existing file is left untouched.
# The path is built with os.path.join instead of "data\labelled_processed_imd.csv":
# "\l" is an invalid escape sequence and a backslash is only a directory
# separator on Windows.
# NOTE(review): because an existing file is never overwritten, re-running the
# notebook after changing the labels leaves a stale CSV — delete it first.
labelled_path = os.path.join("data", "labelled_processed_imd.csv")
if os.path.exists(labelled_path):
    print("Labelled processed data exists")
else:
    imd_df.to_csv(labelled_path, index=False)
    print("Labelled processed data created")

Labelled processed data created
