In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the cleaned, unlabelled Santacruz dataset.
# The path is assembled with os.path.join instead of a hard-coded backslash
# string so the notebook also runs on systems where "\" is not a path separator.
df = pd.read_csv(os.path.join("data", "unlabelled", "santacruz_cleaned.csv"))
df

Unnamed: 0,YEAR,HR,DT,MN,DPT,WBT,DBT
0,2010,0,1,1,14.8,17.4,21.20000
1,2010,0,2,1,17.8,19.4,22.00000
2,2010,0,3,1,20.0,21.4,24.00000
3,2010,0,4,1,16.1,18.6,22.40000
4,2010,0,5,1,17.8,18.4,19.40000
...,...,...,...,...,...,...,...
41052,2024,84,27,12,,,22.60570
41053,2024,84,28,12,,,23.04580
41054,2024,84,29,12,,,21.40127
41055,2024,84,30,12,,,23.43057


In [3]:
# Normal temperature for each month, keyed by month number (1-12).
# NOTE(review): values look like monthly normal maximum temperatures in deg C —
# confirm the source and units.
normals = {
    1: 30.4, 2: 31.1, 3: 32.6, 4: 33.0, 5: 33.3, 6: 32.0,
    7: 29.9, 8: 29.5, 9: 30.2, 10: 33.1, 11: 33.4, 12: 31.9
}
# Look up every row's month (MN) in the table above.
# Plain column assignment is used instead of df.insert(loc=7, ...): insert
# raises ValueError if 'Normal_Temp' already exists, so the cell could not be
# re-run, and it relied on a hard-coded position. On the 7-column frame loaded
# above the new column still lands in the same (last) position.
df['Normal_Temp'] = df['MN'].map(normals)

In [4]:
# Display the frame to confirm the Normal_Temp column was added.
df

Unnamed: 0,YEAR,HR,DT,MN,DPT,WBT,DBT,Normal_Temp
0,2010,0,1,1,14.8,17.4,21.20000,30.4
1,2010,0,2,1,17.8,19.4,22.00000,30.4
2,2010,0,3,1,20.0,21.4,24.00000,30.4
3,2010,0,4,1,16.1,18.6,22.40000,30.4
4,2010,0,5,1,17.8,18.4,19.40000,30.4
...,...,...,...,...,...,...,...,...
41052,2024,84,27,12,,,22.60570,31.9
41053,2024,84,28,12,,,23.04580,31.9
41054,2024,84,29,12,,,21.40127,31.9
41055,2024,84,30,12,,,23.43057,31.9


In [5]:
# Save the frame with the Normal_Temp column (the row index is written too).
# The path is built with os.path.join so it is not tied to Windows separators.
# NOTE(review): "santacuz" in the file name looks like a typo for "santacruz";
# the name is kept unchanged because other code may read this file — confirm
# before renaming.
df.to_csv(os.path.join("data", "unlabelled", "santacuz_with_normal.csv"))

In [7]:
# Highest DBT recorded on each (YEAR, MN, DT) day; as_index=False keeps the
# three date parts as ordinary columns next to the DBT maximum.
daily_max = df.groupby(['YEAR', 'MN', 'DT'], as_index=False)['DBT'].max()
# Month-specific heatwave cut-off: the monthly normal plus 4.5.
thresholds = dict((month, normal + 4.5) for month, normal in normals.items())

# Heatwave labelling rule for a single temperature value
def is_heatwave(dbt, month):
    """Return 1 if ``dbt`` qualifies as a heatwave value for ``month``, else 0.

    A value qualifies when it is at least 37.0, or at least the cut-off
    stored for ``month`` in the module-level ``thresholds`` dict. The
    monthly lookup is skipped when the 37.0 test already passes.
    """
    if dbt >= 37.0:
        return 1
    return 1 if dbt >= thresholds[month] else 0

# Attach each day's maximum DBT to every row of that day as a temporary
# DBT_max column (the suffix only applies to the right-hand DBT).
df = df.merge(daily_max[['YEAR', 'MN', 'DT', 'DBT']], on=['YEAR', 'MN', 'DT'], suffixes=('', '_max'))
# Apply the same rule as is_heatwave() to whole columns at once instead of
# calling it row by row through df.apply(axis=1): a day is flagged when its
# maximum is >= 37.0 or >= the cut-off of its month.
# Comparisons against NaN are False, so days without a maximum are labelled 0,
# as before. A month missing from `thresholds` now yields 0 for the monthly
# test instead of raising KeyError.
monthly_cutoff = df['MN'].map(thresholds)
df['Heatwave'] = ((df['DBT_max'] >= 37.0) | (df['DBT_max'] >= monthly_cutoff)).astype('int64')
df = df.drop(columns=['DBT_max'])  # Remove temporary max column
print(thresholds)

{1: 34.9, 2: 35.6, 3: 37.1, 4: 37.5, 5: 37.8, 6: 36.5, 7: 34.4, 8: 34.0, 9: 34.7, 10: 37.6, 11: 37.9, 12: 36.4}


In [8]:
# Display the frame to check the new Heatwave column.
df

Unnamed: 0,YEAR,HR,DT,MN,DPT,WBT,DBT,Normal_Temp,Heatwave
0,2010,0,1,1,14.8,17.4,21.20000,30.4,0
1,2010,0,2,1,17.8,19.4,22.00000,30.4,0
2,2010,0,3,1,20.0,21.4,24.00000,30.4,0
3,2010,0,4,1,16.1,18.6,22.40000,30.4,0
4,2010,0,5,1,17.8,18.4,19.40000,30.4,0
...,...,...,...,...,...,...,...,...,...
41052,2024,84,27,12,,,22.60570,31.9,0
41053,2024,84,28,12,,,23.04580,31.9,0
41054,2024,84,29,12,,,21.40127,31.9,0
41055,2024,84,30,12,,,23.43057,31.9,0


In [9]:
# Count rows per Heatwave label (every row of a flagged day carries a 1).
heatwave_counts = df['Heatwave'].value_counts()
heatwave_counts

Heatwave
0    40272
1      785
Name: count, dtype: int64

In [10]:
# Write the labelled frame only if the file is not already on disk.
# An existing file is never overwritten here, so it must be deleted by hand
# to regenerate it after the labelling logic changes.
# The path is defined once with os.path.join (portable, no duplicated literal).
labelled_path = os.path.join("data", "labelled", "santacruz_labelled.csv")
if os.path.exists(labelled_path):
    print("Labelled processed data exists")
else:
    df.to_csv(labelled_path, index=False)
    print("Labelled processed data created")

Labelled processed data exists


In [11]:
# Build a calendar date from the YEAR / MN / DT parts (MN and DT are renamed
# to the 'month' / 'day' names pd.to_datetime expects) and use it as the index.
date_parts = df[['YEAR', 'MN', 'DT']].rename(columns={'MN': 'month', 'DT': 'day'})
df = df.assign(DATE=pd.to_datetime(date_parts)).set_index('DATE')
df

Unnamed: 0_level_0,YEAR,HR,DT,MN,DPT,WBT,DBT,Normal_Temp,Heatwave
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-01-01,2010,0,1,1,14.8,17.4,21.20000,30.4,0
2010-01-02,2010,0,2,1,17.8,19.4,22.00000,30.4,0
2010-01-03,2010,0,3,1,20.0,21.4,24.00000,30.4,0
2010-01-04,2010,0,4,1,16.1,18.6,22.40000,30.4,0
2010-01-05,2010,0,5,1,17.8,18.4,19.40000,30.4,0
...,...,...,...,...,...,...,...,...,...
2024-12-27,2024,84,27,12,,,22.60570,31.9,0
2024-12-28,2024,84,28,12,,,23.04580,31.9,0
2024-12-29,2024,84,29,12,,,21.40127,31.9,0
2024-12-30,2024,84,30,12,,,23.43057,31.9,0


In [12]:
# The date parts now live in the DATE index and HR is not needed for the
# daily aggregation, so drop these four columns.
df = df.drop(columns=['YEAR', 'MN', 'DT', 'HR'])


In [13]:
# Collapse the readings to one row per calendar day by averaging every column.
# Days with no rows in df come out as all-NaN rows.
daily_df = df.resample('D').mean()


In [18]:
# Write the daily frame only if the file is not already on disk.
# NOTE: at this point daily_df has not been interpolated yet; a later cell
# calls daily_df.to_csv on this same path unconditionally and overwrites
# whatever is written here.
# The path is defined once with os.path.join (portable, no duplicated literal).
labelled_daily_path = os.path.join("data", "labelled", "santacruz_labelled_daily.csv")
if os.path.exists(labelled_daily_path):
    print("Labelled processed data exists")
else:
    daily_df.to_csv(labelled_daily_path)
    print("Labelled processed data created")

Labelled processed data created


In [25]:
# Display the daily-resampled frame.
daily_df

Unnamed: 0_level_0,DPT,WBT,DBT,Normal_Temp,Heatwave
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-01,16.0875,19.775,25.725000,30.4,0.0
2010-01-02,18.2000,21.375,26.975000,30.4,0.0
2010-01-03,19.7375,22.025,26.300000,30.4,0.0
2010-01-04,18.0250,20.450,24.575000,30.4,0.0
2010-01-05,18.5500,20.575,24.175000,30.4,0.0
...,...,...,...,...,...
2024-12-27,,,21.847186,31.9,0.0
2024-12-28,,,21.261927,31.9,0.0
2024-12-29,,,21.481941,31.9,0.0
2024-12-30,,,21.161551,31.9,0.0


In [26]:
# Count missing values per column in the daily frame.
daily_df.isnull().sum()

DPT            725
WBT            725
DBT              9
Normal_Temp      9
Heatwave         9
dtype: int64

In [38]:
# Fill the gaps in the daily frame, treating each column according to its type
# instead of linearly interpolating everything:
# - DPT / WBT / DBT are continuous measurements and are linearly interpolated,
#   exactly as before.
# - Normal_Temp is a per-month constant, so days without a value get the
#   normal of their own month (from `normals`) rather than a blend of the
#   neighbouring months' values.
# - Heatwave is a 0/1 label; the previous day's label is carried forward so a
#   gap between a 0 day and a 1 day can never produce a fractional label.
#   Gaps between equal labels are filled with that label, as before.
# NOTE(review): interpolate() also fills NaNs after the last valid value by
# repeating that value, so a long run of missing DPT/WBT at the end of the
# record becomes a constant — confirm this is acceptable downstream.
gap_month_normals = pd.Series(daily_df.index.month, index=daily_df.index).map(normals)
daily_df = daily_df.assign(
    DPT=daily_df['DPT'].interpolate(),
    WBT=daily_df['WBT'].interpolate(),
    DBT=daily_df['DBT'].interpolate(),
    Normal_Temp=daily_df['Normal_Temp'].fillna(gap_month_normals),
    Heatwave=daily_df['Heatwave'].ffill(),
)


In [39]:
# Re-check missing values per column after the gap-filling step.
daily_df.isnull().sum()

DPT            0
WBT            0
DBT            0
Normal_Temp    0
Heatwave       0
dtype: int64

In [40]:
# Count days per Heatwave label in the daily frame.
heatwave_counts_final = daily_df['Heatwave'].value_counts()
heatwave_counts_final

Heatwave
0.0    5379
1.0     100
Name: count, dtype: int64

In [55]:
# Save the gap-filled daily frame (DATE index included), replacing any
# existing file. The path is built with os.path.join so it is not tied to
# Windows separators.
daily_df.to_csv(os.path.join("data", "labelled", "santacruz_labelled_daily.csv"))

In [56]:
# Read the saved daily file back; DATE comes back as an ordinary column.
# The path is built with os.path.join so it is not tied to Windows separators.
new_df = pd.read_csv(os.path.join("data", "labelled", "santacruz_labelled_daily.csv"))
new_df

Unnamed: 0,DATE,DPT,WBT,DBT,Normal_Temp,Heatwave
0,2010-01-01,16.0875,19.775,25.725000,30.4,0.0
1,2010-01-02,18.2000,21.375,26.975000,30.4,0.0
2,2010-01-03,19.7375,22.025,26.300000,30.4,0.0
3,2010-01-04,18.0250,20.450,24.575000,30.4,0.0
4,2010-01-05,18.5500,20.575,24.175000,30.4,0.0
...,...,...,...,...,...,...
5474,2024-12-27,13.4625,27.400,21.847186,31.9,0.0
5475,2024-12-28,13.4625,27.400,21.261927,31.9,0.0
5476,2024-12-29,13.4625,27.400,21.481941,31.9,0.0
5477,2024-12-30,13.4625,27.400,21.161551,31.9,0.0


In [57]:
# Check for missing values in the frame read back from CSV.
new_df.isnull().sum()

DATE           0
DPT            0
WBT            0
DBT            0
Normal_Temp    0
Heatwave       0
dtype: int64

In [58]:
# Flag every day that belongs to a run of at least two consecutive Heatwave
# days (rows are one per calendar day, so adjacent rows are adjacent days).
heatwave_today = new_df['Heatwave'].eq(1)
# True on the second and later days of a run: today and yesterday both flagged.
consecutive_days = heatwave_today & heatwave_today.shift(1, fill_value=False)
# Pull in the first day of each run as well: a day also counts when the
# following day is marked as consecutive.
in_heatwave_run = consecutive_days | consecutive_days.shift(-1, fill_value=False)
new_df['Heatwave_Days'] = in_heatwave_run.astype('int64')

In [59]:
# Display the frame with the new Heatwave_Days column.
new_df

Unnamed: 0,DATE,DPT,WBT,DBT,Normal_Temp,Heatwave,Heatwave_Days
0,2010-01-01,16.0875,19.775,25.725000,30.4,0.0,0
1,2010-01-02,18.2000,21.375,26.975000,30.4,0.0,0
2,2010-01-03,19.7375,22.025,26.300000,30.4,0.0,0
3,2010-01-04,18.0250,20.450,24.575000,30.4,0.0,0
4,2010-01-05,18.5500,20.575,24.175000,30.4,0.0,0
...,...,...,...,...,...,...,...
5474,2024-12-27,13.4625,27.400,21.847186,31.9,0.0,0
5475,2024-12-28,13.4625,27.400,21.261927,31.9,0.0,0
5476,2024-12-29,13.4625,27.400,21.481941,31.9,0.0,0
5477,2024-12-30,13.4625,27.400,21.161551,31.9,0.0,0


In [61]:
# Count days by Heatwave_Days flag (1 = part of a run of at least two
# consecutive Heatwave days).
heatwave_counts_days = new_df['Heatwave_Days'].value_counts()
heatwave_counts_days

Heatwave_Days
0    5417
1      62
Name: count, dtype: int64

In [62]:
# Save the final daily frame with the Heatwave_Days flag (no index column).
# The path is built with os.path.join so it is not tied to Windows separators.
new_df.to_csv(os.path.join("data", "labelled", "santacruz_labelled_days.csv"), index=False)