In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [59]:
# Load 'engineered_wildfire_data.csv' into df; the path is relative to the notebook's working directory.
df = pd.read_csv('engineered_wildfire_data.csv')

In [37]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450342 entries, 0 to 450341
Data columns (total 32 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   temp        450342 non-null  float64
 1   rh          450342 non-null  float64
 2   ws          450342 non-null  float64
 3   wd          450342 non-null  int64  
 4   pcp         450342 non-null  float64
 5   ffmc        450342 non-null  float64
 6   dmc         450342 non-null  float64
 7   dc          450342 non-null  float64
 8   isi         450342 non-null  float64
 9   bui         450342 non-null  float64
 10  fwi         450342 non-null  float64
 11  ros         450339 non-null  float64
 12  sfc         450339 non-null  float64
 13  tfc         450339 non-null  float64
 14  bfc         239132 non-null  float64
 15  hfi         450339 non-null  float64
 16  cfb         450339 non-null  float64
 17  pcuring     425990 non-null  float64
 18  greenup     425990 non-null  float64
 19  el

In [38]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,temp,rh,ws,wd,pcp,ffmc,dmc,dc,isi,bui,...,tfc0,sfc0,year,month,day,lat_sin,lat_cos,lon_sin,lon_cos,year_month
0,-1.006741,2.094934,-0.649149,320,0.43,82.976,30.078,161.161,2.68,41.018,...,0.35,0.35,2020,6,2,0.883899,0.467678,-0.311904,-0.950114,2020-6
1,0.423696,0.99279,-0.700308,145,1.237,68.466,0.0,294.02,0.977,0.0,...,0.1,0.1,2020,6,11,0.87989,0.475177,-0.268096,-0.963392,2020-6
2,0.226618,1.122454,-0.86797,30,0.591,88.685,55.743,202.448,5.536,66.032,...,1.36,1.36,2020,6,20,0.736971,0.675925,-0.939322,0.343037,2020-6
3,1.366794,-1.665321,-0.84315,271,0.0,98.652,290.568,841.23,22.181,311.848,...,0.35,0.35,2020,6,22,0.535709,0.844403,-0.934801,-0.355172,2020-6
4,1.170139,0.020311,-0.000958,50,0.001,91.66,18.664,102.62,10.972,25.66,...,0.35,0.35,2020,6,13,0.551529,0.834155,-0.999729,-0.023267,2020-6


In [60]:
# Number of missing values in each column.
df.isna().sum()

temp               0
rh                 0
ws                 0
wd                 0
pcp                0
ffmc               0
dmc                0
dc                 0
isi                0
bui                0
fwi                0
ros                3
sfc                3
tfc                3
bfc           211210
hfi                3
cfb                3
pcuring        24352
greenup        24352
elev               0
sfl                0
cfl                0
tfc0               3
sfc0               3
year               0
month              0
day                0
lat_sin            0
lat_cos            0
lon_sin            0
lon_cos            0
year_month         0
dtype: int64

In [61]:
# Put the rows in chronological order (ascending is the default).
df = df.sort_values(by=['year', 'month', 'day'])

In [62]:
# Forward-fill gaps from the previous row in the current (date-sorted) order.
# `fillna(method='ffill')` is deprecated in recent pandas; `ffill()` is the
# supported spelling. Rebinding `df` instead of using `inplace=True` keeps the
# cell safe to re-run.
# NOTE(review): leading rows that are NaN have nothing to fill from and stay NaN.
df = df.ffill()

In [63]:
# Predictor columns read straight from the dataset, grouped by theme.
primary_features = (
    # Weather: temperature, relative humidity, wind speed, wind direction
    ['temp', 'rh', 'ws', 'wd']
    # Fine Fuel Moisture Code, Duff Moisture Code, Drought Code,
    # Initial Spread Index, Buildup Index, Fire Weather Index
    + ['ffmc', 'dmc', 'dc', 'isi', 'bui', 'fwi']
    # Rate of Spread, Surface Fuel Consumption, Total Fuel Consumption,
    # Head Fire Intensity
    + ['ros', 'sfc', 'tfc', 'hfi']
    # Percent Curing (if available), Elevation,
    # Surface Fuel Load (if this represents available surface fuels),
    # Crown Fuel Load
    + ['pcuring', 'elev', 'sfl', 'cfl']
)

In [64]:
# Calendar columns already present in the dataset.
# 'year_month' is left out: it is likely redundant with 'year' and 'month'.
time_features = ['year', 'month', 'day']

# Columns that are computed from the raw data rather than read from the file.
derived_features = [
    # One-step lags: the value of each variable at the previous time step
    *(f'{source}_lag_1' for source in ('temp', 'ws', 'rh', 'ffmc', 'isi', 'fwi')),
    'drought_index',  # composite drought index, e.g. (dc + dmc) / 2
    'day_of_year',    # calculated from year, month, day
    'season',         # derived from month (e.g., Spring, Summer, Fall, Winter)
]

In [65]:
# Full predictor list: raw columns, then calendar columns, then derived columns.
all_features = [*primary_features, *time_features, *derived_features]

In [76]:
from transformers import AutoformerConfig, AutoformerModel

# Partition the feature names by the role they play for the model, so the
# counts passed to the config are derived from names instead of magic offsets.
calendar_features = time_features + ['day_of_year', 'season']  # year, month, day, day_of_year, season
static_real_features = ['elev']  # Only elevation is static
dynamic_real_features = [
    name for name in all_features
    if name not in calendar_features and name not in static_real_features
]

config = AutoformerConfig(
    prediction_length=24,  # Adjust based on your forecasting needs
    context_length=72,  # Typically 3x prediction_length
    # `input_size` is the dimensionality of the *target* series, not the number
    # of covariates. The only target used in this notebook is 'cfb', so it is 1.
    input_size=1,
    lags_sequence=[1, 2, 3, 4, 5, 6, 7],
    num_time_features=len(calendar_features),
    num_static_real_features=len(static_real_features),
    num_dynamic_real_features=len(dynamic_real_features),
    d_model=64,
    encoder_layers=2,
    decoder_layers=2,
    encoder_attention_heads=2,
    decoder_attention_heads=2,
    encoder_ffn_dim=128,
    decoder_ffn_dim=128,
)

# NOTE(review): 'season' is categorical and is one-hot encoded during data
# preparation, so the number of encoded columns differs from the counts above —
# confirm how the covariates are meant to be fed to the model.
model = AutoformerModel(config)

In [67]:
def derive_features(df):
    """Return a date-indexed copy of ``df`` with calendar, lag and drought columns added.

    The input frame is not modified, so the function can be called repeatedly
    on the same object (re-running the notebook cell is safe).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'year', 'month', 'day', 'temp', 'ws', 'rh',
        'ffmc', 'isi', 'fwi', 'dc' and 'dmc'.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by 'date' (built from year/month/day) with these
        columns appended: 'day_of_year', 'season', '<feature>_lag_1' for
        temp/ws/rh/ffmc/isi/fwi, and 'drought_index'.
    """
    # Assemble the dates directly from the integer columns (no string parsing);
    # assign() works on a copy, so the caller's frame is left untouched.
    out = df.assign(date=pd.to_datetime(df[['year', 'month', 'day']])).set_index('date')

    # Create day of year
    out['day_of_year'] = out.index.dayofyear

    # Create season: months 1-3 Winter, 4-6 Spring, 7-9 Summer, 10-12 Fall
    out['season'] = pd.cut(
        out['month'],
        bins=[0, 3, 6, 9, 12],
        labels=['Winter', 'Spring', 'Summer', 'Fall'],
    )

    # Create lag features: the value found on the previous row.
    # NOTE(review): if several rows share a date (e.g. different locations),
    # the "previous row" is not the previous day at the same place — confirm
    # whether the shift should be done per location.
    for feature in ['temp', 'ws', 'rh', 'ffmc', 'isi', 'fwi']:
        out[f'{feature}_lag_1'] = out[feature].shift(1)

    # Create drought index
    out['drought_index'] = (out['dc'] + out['dmc']) / 2

    return out

# Add the derived columns; df is rebound to the returned frame, which is indexed by 'date'.
df = derive_features(df)

In [72]:
# Modelling frame: every predictor column followed by the target 'cfb'.
final_df = df[[*all_features, 'cfb']]

In [75]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

def prepare_data(df, target_column='cfb', test_size=0.2, sequence_length=30):
    """Split, scale and window a date-indexed frame into supervised sequences.

    Steps: sort by index, drop incomplete rows, take an unshuffled
    (chronological) train/test split, fit the scaler / one-hot encoder on the
    training part only, then slide a fixed-length window over each part.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictor columns plus ``target_column``. Numeric columns are
        standardised; a 'season' column, when present, is one-hot encoded.
    target_column : str, default 'cfb'
        Name of the column to predict.
    test_size : float, default 0.2
        Passed to ``train_test_split``; the test part is the tail of the data.
    sequence_length : int, default 30
        Number of consecutive rows in each input window.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray
        ``X_*`` has shape ``(n_rows - sequence_length, sequence_length,
        n_encoded_features)``; ``y_*[i]`` is the target of the row that
        immediately follows window ``i``.
    preprocessor : sklearn.compose.ColumnTransformer
        The transformer fitted on the training features.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    # Stable sort: rows that share an index value keep their incoming order.
    df = df.sort_index(kind='stable')

    # Rows with missing values (e.g. the first row of every lag column) would
    # put NaN into the windows and from there into the loss, so drop them.
    df = df.dropna()

    # Separate the target column from the features
    features = df.drop(columns=[target_column])
    target = df[target_column]

    # Chronological split: no shuffling, the test set is the tail
    train_features, test_features, train_target, test_target = train_test_split(
        features, target, test_size=test_size, shuffle=False
    )

    # Identify categorical and numerical features. Select every numeric dtype:
    # listing only float64/int64 silently drops columns stored as e.g. int32.
    categorical_features = [name for name in ['season'] if name in features.columns]
    numeric_features = train_features.select_dtypes(include='number').columns.tolist()

    # sparse_threshold=0 forces a dense array regardless of how sparse the
    # one-hot block makes the combined output.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ],
        sparse_threshold=0,
    )

    # Fit on the training part only, then apply the same transform to the test part
    train_matrix = np.asarray(preprocessor.fit_transform(train_features), dtype=float)
    test_matrix = np.asarray(preprocessor.transform(test_features), dtype=float)

    def create_sequences(matrix, targets, sequence_length):
        """Window ``matrix`` row-wise; pair each window with the next row's target."""
        n_windows = len(matrix) - sequence_length
        if n_windows <= 0:
            return np.empty((0, sequence_length, matrix.shape[1])), np.empty((0,))
        # sliding_window_view yields (n_rows - L + 1, n_features, L): drop the
        # last window (it has no following row) and put time on axis 1.
        windows = np.lib.stride_tricks.sliding_window_view(matrix, sequence_length, axis=0)
        X = np.ascontiguousarray(windows[:n_windows].transpose(0, 2, 1))
        y = np.array(targets[sequence_length:], dtype=float)
        return X, y

    # Create sequences
    X_train, y_train = create_sequences(train_matrix, train_target.to_numpy(), sequence_length)
    X_test, y_test = create_sequences(test_matrix, test_target.to_numpy(), sequence_length)

    # The original returned the undefined name `scaler` (NameError); the fitted
    # preprocessor is the object that holds the scaling parameters.
    return X_train, y_train, X_test, y_test, preprocessor

# Build the windowed train/test arrays from final_df with the function defaults
# (target 'cfb', test_size 0.2, sequence_length 30); the fifth returned object is bound to `scaler`.
X_train, y_train, X_test, y_test, scaler = prepare_data(final_df)

In [77]:
from torch.utils.data import DataLoader, TensorDataset
import torch

# Wrap each (inputs, labels) pair as float32 tensors in a TensorDataset.
train_dataset, test_dataset = (
    TensorDataset(
        torch.tensor(inputs, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.float32),
    )
    for inputs, labels in ((X_train, y_train), (X_test, y_test))
)

# Batches of 32: training batches are shuffled, the held-out loader keeps its order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [None]:
# # Train the model (manual training loop, left commented out; this cell was never executed)
# from torch.optim import Adam
# from torch.nn import MSELoss
# import torch

# optimizer = Adam(model.parameters())
# loss_function = MSELoss()

# num_epochs = 100
# batch_size = 32

# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0
#     for i in range(0, len(X_train), batch_size):
#         batch_X = torch.FloatTensor(X_train[i:i+batch_size])
#         batch_y = torch.FloatTensor(y_train[i:i+batch_size])
        
#         optimizer.zero_grad()
#         outputs = model(batch_X)
#         loss = loss_function(outputs, batch_y.unsqueeze(1))
#         loss.backward()
#         optimizer.step()
        
#         total_loss += loss.item()
    
#     print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(X_train)}")

In [78]:
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
import pytorch_lightning as pl

# Define callbacks
# Both callbacks monitor the metric named 'val_loss' and treat lower as better:
# early stopping uses patience=5, the checkpoint callback keeps the single best model.
# NOTE(review): nothing visible in this notebook logs a metric called 'val_loss' — confirm
# that the module being trained logs it, otherwise the callbacks have nothing to monitor.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=True, mode='min')
checkpoint_callback = ModelCheckpoint(monitor='val_loss', save_top_k=1, mode='min')

# Initialize the PyTorch Lightning trainer
trainer = pl.Trainer(
    max_epochs=50,
    callbacks=[early_stopping, checkpoint_callback]
)

# Train the model
# NOTE(review): this call fails with "TypeError: `model` must be a `LightningModule` ..."
# (see the cell output) because `model` is a plain AutoformerModel. It has to be wrapped
# in a pl.LightningModule (training/validation steps and optimizer setup) before
# trainer.fit can accept it.
trainer.fit(model, train_loader, val_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\annma\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


TypeError: `model` must be a `LightningModule` or `torch._dynamo.OptimizedModule`, got `AutoformerModel`