In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load santacruz_cleaned.csv into the working frame `df`.
# The path is assembled with os.path.join instead of hard-coded "\" separators,
# so it resolves the same file on Windows and also works on other platforms.
df = pd.read_csv(os.path.join("data", "unlabelled", "santacruz_cleaned.csv"))
df

In [None]:
# Month number (1-12) -> reference "normal" temperature for that month.
# NOTE(review): presumably the climatological normal maximum in the same unit
# as DBT — confirm the source of these values.
normals = {
    1: 30.4, 2: 31.1, 3: 32.6, 4: 33.0, 5: 33.3, 6: 32.0,
    7: 29.9, 8: 29.5, 9: 30.2, 10: 33.1, 11: 33.4, 12: 31.9
}

# Look up the normal for each row's month (MN); months missing from `normals`
# map to NaN.
normal_temp = df['MN'].map(normals)

# df.insert raises ValueError if the column already exists, which made this
# cell fail on a second run. Overwrite in place when it is already there and
# only insert (at position 8, as before) the first time.
if 'Normal_Temp' in df.columns:
    df['Normal_Temp'] = normal_temp
else:
    df.insert(loc=8, column='Normal_Temp', value=normal_temp)

In [None]:
# Display the frame so the Normal_Temp column can be inspected.
df

In [None]:
# Save the frame including Normal_Temp. The path is built with os.path.join so
# it does not depend on "\" being the path separator.
# NOTE(review): "santacuz" looks like a typo for "santacruz"; the file name is
# kept unchanged in case other code reads it — confirm before renaming.
# NOTE(review): no index=False here, so the row index is written as an unnamed
# first column — confirm that is intended.
df.to_csv(os.path.join("data", "unlabelled", "santacuz_with_normal.csv"))

In [None]:
# DBT value at or above which a day is flagged regardless of the monthly normal.
ABSOLUTE_HEATWAVE_DBT = 37.0
# Excess over the monthly normal at or above which a day is flagged.
NORMAL_DEPARTURE = 4.5

# Highest DBT reading of each (YEAR, MN, DT) day.
daily_max = df.groupby(['YEAR', 'MN', 'DT'])['DBT'].max().reset_index()
# Month -> departure-based cutoff (monthly normal + NORMAL_DEPARTURE).
thresholds = {month: normal + NORMAL_DEPARTURE for month, normal in normals.items()}


def is_heatwave(dbt, month):
    """Return 1 if a daily maximum DBT qualifies as a heatwave day, else 0.

    Args:
        dbt: Daily maximum DBT.
        month: Month number; looked up in ``thresholds``.

    Returns:
        1 when ``dbt`` reaches ABSOLUTE_HEATWAVE_DBT or the month's cutoff in
        ``thresholds``; 0 otherwise.

    Raises:
        KeyError: If the absolute cutoff is not met and ``month`` has no entry
            in ``thresholds``.
    """
    # NOTE(review): the two criteria are OR-ed, so whichever cutoff is lower
    # decides. Some heatwave definitions require both conditions — confirm
    # this is the intended rule.
    if dbt >= ABSOLUTE_HEATWAVE_DBT or dbt >= thresholds[month]:
        return 1
    return 0


# Attach each day's maximum DBT to every row of that day (inner join, as before).
df = df.merge(daily_max[['YEAR', 'MN', 'DT', 'DBT']], on=['YEAR', 'MN', 'DT'], suffixes=('', '_max'))

# Vectorised equivalent of applying is_heatwave row by row: the same two
# comparisons, evaluated on whole columns instead of one Python call per row.
above_absolute = df['DBT_max'] >= ABSOLUTE_HEATWAVE_DBT
month_cutoff = df['MN'].map(thresholds)

# Keep the row-wise behaviour of failing loudly when a month without a cutoff
# would have been looked up (i.e. the absolute cutoff was not met).
needs_missing_cutoff = month_cutoff.isna() & ~above_absolute
if needs_missing_cutoff.any():
    unknown_months = sorted(df.loc[needs_missing_cutoff, 'MN'].unique())
    raise KeyError(f"No heatwave threshold defined for month(s): {unknown_months}")

df['Heatwave'] = (above_absolute | (df['DBT_max'] >= month_cutoff)).astype('int64')
df = df.drop(columns=['DBT_max'])  # Remove temporary max column
print(thresholds)

In [None]:
# Display the frame so the Heatwave column can be inspected.
df

In [None]:
# Tally how many rows carry each Heatwave label.
heatwave_counts = df.loc[:, 'Heatwave'].value_counts()
heatwave_counts

In [None]:
# Write the labelled frame once; an existing file is left untouched.
# The path is built with os.path.join so it does not depend on "\" being the
# path separator.
# NOTE(review): because of the existence check, a file written by an earlier
# run is never refreshed — delete it to regenerate after changing the labels.
labelled_csv = os.path.join("data", "labelled", "santacruz_labelled.csv")
if os.path.exists(labelled_csv):
    print("Labelled processed data exists")
else:
    df.to_csv(labelled_csv, index=False)
    print("Labelled processed data created")

In [None]:
# Assemble one timestamp per row from the calendar columns and use it as the
# frame's index (set in place).
calendar_parts = df[['YEAR', 'MN', 'DT']].rename(columns={'MN': 'month', 'DT': 'day'})
df['DATE'] = pd.to_datetime(calendar_parts)
df.set_index('DATE', inplace=True)
df

In [None]:
# Remove these five columns from df in place.
unneeded_columns = ['INDEX', 'YEAR', 'MN', 'DT', 'HR']
df.drop(columns=unneeded_columns, inplace=True)


In [None]:
# Bin the rows by calendar day on the datetime index and average every column
# within each day.
per_day = df.resample('D')
daily_df = per_day.mean()


In [None]:
# Write the daily averages once; an existing file is left untouched.
# NOTE(review): a file written by an earlier run (possibly without the date
# column) is never refreshed — delete it to regenerate.
daily_csv = os.path.join("data", "labelled", "santacruz_labelled_daily.csv")
if os.path.exists(daily_csv):
    print("Labelled processed data exists")
else:
    # daily_df comes from a resample, so its dates live only in the index.
    # Writing with index=False dropped them and left rows with no date at all;
    # keep the index so the date is the first column of the CSV.
    daily_df.to_csv(daily_csv, index=True)
    print("Labelled processed data created")

In [None]:
# Reload the daily file from disk. The path is built with os.path.join so it
# does not depend on "\" being the path separator.
df_daily = pd.read_csv(os.path.join("data", "labelled", "santacruz_labelled_daily.csv"))

In [None]:
# Display the daily frame as read back from the CSV.
df_daily

In [None]:
# Count missing values in each column of the daily frame.
df_daily.isna().sum()

In [None]:
# Create a proper per-row calendar date.
# Prefer the YEAR / MN (month) / DT (day) columns. An earlier cell drops those
# columns in place after moving the date into the DATE index, so on a
# top-to-bottom run they no longer exist and the original lookup raised
# KeyError; in that case recover the date from DATE instead.
if {'YEAR', 'MN', 'DT'}.issubset(df.columns):
    date_parts = df[['YEAR', 'MN', 'DT']].rename(columns={'YEAR': 'year', 'MN': 'month', 'DT': 'day'})
    df['Date'] = pd.to_datetime(date_parts)
else:
    if df.index.name == 'DATE':
        # Turn the DATE index into a regular column so the date is still
        # present after the later merge, which does not keep the index.
        df = df.reset_index()
    if 'DATE' not in df.columns:
        raise KeyError("df has neither YEAR/MN/DT columns nor a DATE index/column")
    df['Date'] = pd.to_datetime(df['DATE']).dt.normalize()

# Group by Date to check daily heatwave occurrences (since multiple entries exist per day)
daily_heatwave = df.groupby('Date')['Heatwave'].max().reset_index()

In [None]:
# Flag days that belong to a run of at least two consecutive heatwave days.
is_hot = daily_heatwave['Heatwave'].eq(1)

# daily_heatwave has one row per date present in the data (ordered by Date,
# since groupby sorts its keys by default), so the previous ROW may be more
# than one day earlier. Comparing rows alone would treat two flagged days on
# either side of a gap as consecutive; require a gap of exactly one day.
follows_previous_day = daily_heatwave['Date'].diff().eq(pd.Timedelta(days=1))

# True on a flagged day whose calendar-previous day is also flagged.
consecutive_days = is_hot & is_hot.shift(1, fill_value=False) & follows_previous_day

# A day is part of a spell if it continues one or is the day a spell starts on.
in_spell = consecutive_days | consecutive_days.shift(-1, fill_value=False)
daily_heatwave['heatwave_days'] = in_spell.astype('int64')

In [None]:
# Left-join the per-day heatwave_days flag onto every row via Date, then
# discard the temporary Date join key.
spell_flags = daily_heatwave[['Date', 'heatwave_days']]
df_days = df.merge(spell_flags, how='left', on='Date').drop(columns=['Date'])


In [None]:
# Display the frame with the heatwave_days column added.
df_days

In [None]:
# Save the frame with the heatwave_days flag, without the row index.
# The path is built with os.path.join so it does not depend on "\" being the
# path separator.
df_days.to_csv(os.path.join("data", "labelled", "santacruz_labelled_days.csv"), index=False)