In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Flatten, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split

In [8]:
# Load the labelled daily Santacruz data. The relative path uses forward
# slashes so it resolves on Windows, Linux and macOS alike (a backslash is an
# ordinary filename character outside Windows); this matches the separator
# used for the output path further down in this notebook.
df = pd.read_csv("data/labelled/santacruz_labelled_days.csv")

In [9]:
# Display the loaded frame (pandas elides the middle rows in the rendered output).
df

Unnamed: 0,DATE,DPT,WBT,DBT,Normal_Temp,Heatwave,Heatwave_Days
0,2010-01-01,16.0875,19.775,25.725000,30.4,0.0,0
1,2010-01-02,18.2000,21.375,26.975000,30.4,0.0,0
2,2010-01-03,19.7375,22.025,26.300000,30.4,0.0,0
3,2010-01-04,18.0250,20.450,24.575000,30.4,0.0,0
4,2010-01-05,18.5500,20.575,24.175000,30.4,0.0,0
...,...,...,...,...,...,...,...
5474,2024-12-27,13.4625,27.400,21.847186,31.9,0.0,0
5475,2024-12-28,13.4625,27.400,21.261927,31.9,0.0,0
5476,2024-12-29,13.4625,27.400,21.481941,31.9,0.0,0
5477,2024-12-30,13.4625,27.400,21.161551,31.9,0.0,0


In [10]:
# Count missing values per column.
df.isna().sum()

DATE             0
DPT              0
WBT              0
DBT              0
Normal_Temp      0
Heatwave         0
Heatwave_Days    0
dtype: int64

In [11]:
# Class balance of the label column; kept in heatwave_counts and displayed.
heatwave_counts = df['Heatwave_Days'].value_counts()
heatwave_counts

Heatwave_Days
0    5417
1      62
Name: count, dtype: int64

In [12]:
# NOTE(review): not referenced again in the visible cells; `target` in the
# following cell holds the same single-column list — confirm this is still needed.
target_col = ['Heatwave_Days']

In [13]:
# Input columns fed to the model and the single label column.
features = ['DPT', 'WBT', 'DBT', 'Normal_Temp']
target = ['Heatwave_Days']

# Rescale the feature columns in place with min-max scaling.
# NOTE(review): the scaler is fitted on the full frame before any train/test
# split is made, so rows later used for evaluation contribute to the scaling
# range.
scaler = MinMaxScaler()
scaler.fit(df[features])
df[features] = scaler.transform(df[features])

In [14]:
# Convert to sequences for LSTM: each sample is a window of `sequence_length`
# consecutive feature rows, labelled with the target of the row that
# immediately follows the window.
sequence_length = 10

# Pull the columns out of the DataFrame once; slicing the NumPy arrays inside
# the loop avoids re-selecting df[features] / df[target] on every iteration.
feature_values = df[features].values
target_values = df[target].values

X, y = [], []
for i in range(len(df) - sequence_length):
    X.append(feature_values[i:i+sequence_length])
    # `target` is a one-element list, so each label is a length-1 row and the
    # stacked `y` keeps its trailing axis of size 1.
    y.append(target_values[i+sequence_length])
X, y = np.array(X), np.array(y)

In [15]:
# Inspect the label array (one single-element row per sequence, as the output shows).
y

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int64)

In [16]:
# Bookkeeping for the per-fold evaluation reported in a later cell.
accuracies = []
fold_no = 1

# Stratified 5-fold splitter, shuffled with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): the loop body only assigns the split arrays, so each iteration
# overwrites the previous one and the names below end up holding the last
# split produced by the splitter; no model is trained inside the loop.
for train_idx, test_idx in kfold.split(X, y):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

In [17]:
# CNN + LSTM binary classifier over windows of `sequence_length` time steps
# with `len(features)` channels per step.
# The input shape is declared with an explicit Input object as the first
# entry instead of `input_shape=` on Conv1D; recent Keras versions emit a
# UserWarning when `input_shape` is passed to a layer in a Sequential model.
model = Sequential([
    tf.keras.Input(shape=(sequence_length, len(features))),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(50, return_sequences=True),  # keep the full sequence for the next LSTM
    LSTM(50),
    Dense(50, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # single sigmoid unit for the binary label
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
# NOTE(review): the label counts shown earlier are 5417 negatives vs 62
# positives, so a model that always predicts 0 already scores about 0.989
# accuracy; accuracy alone may not reflect heatwave detection quality.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [19]:
# Train on the current split for 20 epochs in mini-batches of 32.
# NOTE(review): X_test / y_test serve as validation data here and are also the
# arrays passed to model.evaluate in the following cell.
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    verbose=1,
    validation_data=(X_test, y_test),
)

Epoch 1/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.9769 - loss: 0.2475 - val_accuracy: 0.9890 - val_loss: 0.0640
Epoch 2/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9916 - loss: 0.0573 - val_accuracy: 0.9890 - val_loss: 0.0643
Epoch 3/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9867 - loss: 0.0808 - val_accuracy: 0.9890 - val_loss: 0.0622
Epoch 4/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9891 - loss: 0.0663 - val_accuracy: 0.9890 - val_loss: 0.0598
Epoch 5/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9878 - loss: 0.0739 - val_accuracy: 0.9890 - val_loss: 0.0603
Epoch 6/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9894 - loss: 0.0660 - val_accuracy: 0.9890 - val_loss: 0.0597
Epoch 7/20
[1m137/137[0m 

<keras.src.callbacks.history.History at 0x2849b322820>

In [20]:
# Score the trained model on X_test / y_test and record the accuracy.
loss, accuracy = model.evaluate(X_test, y_test)
accuracies.append(accuracy)
print(f"Fold {fold_no} Accuracy: {accuracy:.4f}")
fold_no += 1

# Print average accuracy
# NOTE(review): this cell sits outside the fold loop, so after a single run
# `accuracies` holds one value and the mean equals that fold's accuracy
# (the two numbers in the output below are identical).
print(f"Average Cross-Validation Accuracy: {np.mean(accuracies):.4f}")


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9886 - loss: 0.0572
Fold 1 Accuracy: 0.9890
Average Cross-Validation Accuracy: 0.9890


In [22]:
# Persist the trained model to a .h5 file in the working directory.
model.save("lstm_cnn_model.h5")  # You can also use .keras format
print("model saved")



model saved


In [24]:
# Load the unlabelled Santacruz data. The relative path uses forward slashes
# so it resolves on Windows, Linux and macOS alike (a backslash is an ordinary
# filename character outside Windows).
# NOTE: this rebinds `df`; the frame loaded and scaled earlier in the notebook
# is no longer reachable under this name.
df = pd.read_csv("data/unlabelled/santacruz_combined_normal.csv")


In [25]:
# Normalise header whitespace so the column lookups below match exactly.
df.columns = df.columns.str.strip()

# Make sure a month column 'MN' exists. When it is missing, derive it from the
# 'DT' column: parse it to datetimes and take the month. The guard and the
# body must name the same column, and `.dt` is only available on the parsed
# (datetime) column.
# NOTE(review): assumes 'DT' holds full dates that pd.to_datetime can parse
# (e.g. YYYY-MM-DD) — confirm against the CSV.
if 'MN' not in df.columns:
    if 'DT' in df.columns:
        df['DT'] = pd.to_datetime(df['DT'])
        df['MN'] = df['DT'].dt.month
    else:
        raise ValueError("Month or Date column not found.")

In [28]:
# Per-row heatwave threshold: the row's Normal_Temp plus a fixed 4.5 margin.
# NOTE(review): 4.5 is presumably a departure-from-normal criterion in the
# same units as DBT — confirm the source of this constant.
df['Threshold'] = df['Normal_Temp'] + 4.5


In [29]:
def is_heatwave(dbt, threshold):
    """Return 1 when `dbt` meets either heatwave condition, otherwise 0.

    A reading qualifies if it is at least 37.0 or at least `threshold`.
    """
    # NOTE(review): the two conditions are alternatives (OR), so either one
    # alone flags the reading — confirm this matches the intended criterion.
    if dbt >= 37.0:
        return 1
    if dbt >= threshold:
        return 1
    return 0

# Label every row by passing its DBT and Threshold values to is_heatwave.
df['Heatwave'] = [
    is_heatwave(dbt, threshold)
    for dbt, threshold in zip(df['DBT'], df['Threshold'])
]


In [30]:
# Flag rows that belong to a run of at least two adjacent heatwave rows.
# NOTE(review): adjacency is by row position, so this presumably expects one
# row per day in chronological order — confirm against the input file.
df['Heatwave_Days'] = 0

heatwave_today = df['Heatwave'] == 1
heatwave_yesterday = df['Heatwave'].shift(1) == 1

# True on the second and later rows of a run.
consecutive_days = heatwave_today & heatwave_yesterday

# OR-ing with the mask shifted one row back also marks the first row of a run.
in_heatwave_run = consecutive_days | consecutive_days.shift(-1, fill_value=False)
df.loc[in_heatwave_run, 'Heatwave_Days'] = 1

In [32]:
# Write the labelled frame to CSV, omitting the index column.
output_path = r'data/labelled/santacruz_with_heatwaves.csv'
df.to_csv(output_path, index=False)

In [33]:
# Class balance of the newly computed Heatwave_Days label; stored and displayed.
heatwave_counts_final = df['Heatwave_Days'].value_counts()
heatwave_counts_final

Heatwave_Days
0    43530
1       95
Name: count, dtype: int64