In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
import xgboost as xgb
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [3]:
df = pd.read_csv(r"data\labelled\santacruz_labelled_days.csv")

In [5]:
df

Unnamed: 0,DATE,DPT,WBT,DBT,Normal_Temp,Heatwave,Heatwave_Days
0,2010-01-01,16.0875,19.775,25.725000,30.4,0.0,0
1,2010-01-02,18.2000,21.375,26.975000,30.4,0.0,0
2,2010-01-03,19.7375,22.025,26.300000,30.4,0.0,0
3,2010-01-04,18.0250,20.450,24.575000,30.4,0.0,0
4,2010-01-05,18.5500,20.575,24.175000,30.4,0.0,0
...,...,...,...,...,...,...,...
5474,2024-12-27,13.4625,27.400,21.847186,31.9,0.0,0
5475,2024-12-28,13.4625,27.400,21.261927,31.9,0.0,0
5476,2024-12-29,13.4625,27.400,21.481941,31.9,0.0,0
5477,2024-12-30,13.4625,27.400,21.161551,31.9,0.0,0


In [6]:
scaler = StandardScaler()
numeric_cols = ['DPT', 'WBT', 'DBT', 'Normal_Temp']
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [7]:
X = df.drop(columns=['Heatwave_Days'])
y = df['Heatwave_Days']

In [8]:
ts_split = TimeSeriesSplit(n_splits=5)

In [9]:
xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
for train_idx, val_idx in ts_split.split(X):
    xgb_model.fit(X.iloc[train_idx], y.iloc[train_idx])

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:DATE: object

In [11]:
xgb_features = xgb_model.apply(X)
X_combined = np.hstack((X.values, xgb_features))

In [12]:
X_lstm = X_combined.reshape((X_combined.shape[0], 1, X_combined.shape[1]))


In [13]:
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(1, X_combined.shape[1])),
    Dropout(0.2),
    LSTM(25, return_sequences=False),
    Dense(1, activation='sigmoid')
])

  super().__init__(**kwargs)


In [14]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [15]:
train_size = int(0.8 * len(X_lstm))
X_train, X_test = X_lstm[:train_size], X_lstm[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

In [16]:
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/20
[1m882/882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.9822 - loss: 0.1903 - val_accuracy: 0.9671 - val_loss: 0.1502
Epoch 2/20
[1m882/882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9808 - loss: 0.0963 - val_accuracy: 0.9671 - val_loss: 0.1528
Epoch 3/20
[1m882/882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9833 - loss: 0.0863 - val_accuracy: 0.9671 - val_loss: 0.1511
Epoch 4/20
[1m882/882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9820 - loss: 0.0904 - val_accuracy: 0.9671 - val_loss: 0.1542
Epoch 5/20
[1m882/882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9823 - loss: 0.0894 - val_accuracy: 0.9671 - val_loss: 0.1507
Epoch 6/20
[1m882/882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9827 - loss: 0.0882 - val_accuracy: 0.9671 - val_loss: 0.1513
Epoch 7/20
[1m882/882[0m 

<keras.src.callbacks.history.History at 0x1ecba568850>

In [17]:
eval_results = model.evaluate(X_test, y_test)
print("Test Loss:", eval_results[0])
print("Test Accuracy:", eval_results[1])

[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9586 - loss: 0.1869
Test Loss: 0.15181483328342438
Test Accuracy: 0.9671201705932617
