In [9]:
import pandas as pd
from pmdarima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pykalman import KalmanFilter
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

In [10]:
# Read the three CSV inputs: training rows, test rows, and the sample submission file.
train, test, sub = map(
    pd.read_csv,
    ('../data/Train.csv', '../data/Test.csv', '../data/SampleSubmission.csv'),
)

In [11]:
# // Date variables
#
# Each ID is split on '_': the first piece is converted to a numeric area id and
# the second piece is parsed as a timestamp. The split is done once, vectorised,
# instead of running a Python lambda over every row twice.

# train['burn_area'] = train.burn_area ** float(1/1024)
id_parts = train['ID'].str.split('_')
train['area'] = pd.to_numeric(id_parts.str[0])
train['date'] = pd.to_datetime(id_parts.str[1])
# Entries of burn_area that cannot be parsed as numbers become NaN instead of raising.
train['burn_area'] = pd.to_numeric(train['burn_area'], errors='coerce')

# Calendar features taken from the parsed date.
# Gotcha: the 'day' column stores the weekday (.dt.weekday); the day of the
# month is stored separately in 'dayofmonth'.
for column, attribute in [
    ('month', 'month'),
    ('year', 'year'),
    ('day', 'weekday'),
    ('quarter', 'quarter'),
    ('dayofmonth', 'day'),
    ('dayofyear', 'dayofyear'),
]:
    train[column] = getattr(train['date'].dt, attribute)
train['weekofyear'] = train['date'].dt.isocalendar().week
# train.set_index('date', inplace=True)

# // Feature engineering experiments (currently disabled)


# train['interaction_vpd_landcover1'] = train['climate_vpd'] * train['landcover_1']
# train['interaction_tmmx_vs'] = train['climate_tmmx'] * train['climate_vs']
# train['lag_climate_pr'] = train.groupby('area')['climate_pr'].shift(1)
# train['lag_climate_vpd'] = train.groupby('area')['climate_vpd'].shift(1)
# spatial_features = train[['lat', 'lon', 'elevation']].dropna()
# kmeans = KMeans(n_clusters=5, random_state=42).fit(spatial_features)
# train['spatial_cluster'] = kmeans.predict(train[['lat', 'lon', 'elevation']])
# train['burn_risk_index'] = (train['climate_vpd'] + train['climate_pdsi'] + train['climate_tmmx'] - train['climate_pr'])


# // TODO: add lag features to turn this into a supervised time-series problem



In [12]:
# // Date variables
#
# Same ID parsing as for the training frame: the piece before the first '_' is
# converted to a numeric area id and the piece after it is parsed as a timestamp.
# The split is done once, vectorised, instead of a per-row lambda applied twice.

# NOTE(review): the next commented line refers to `train`, not `test` — looks like a copy-paste leftover.
# train['burn_area'] = train.burn_area ** float(1/2)
id_parts = test['ID'].str.split('_')
test['area'] = pd.to_numeric(id_parts.str[0])
test['date'] = pd.to_datetime(id_parts.str[1])

# Calendar features taken from the parsed date.
# Gotcha: the 'day' column stores the weekday (.dt.weekday); the day of the
# month is stored separately in 'dayofmonth'.
for column, attribute in [
    ('month', 'month'),
    ('year', 'year'),
    ('day', 'weekday'),
    ('quarter', 'quarter'),
    ('dayofmonth', 'day'),
    ('dayofyear', 'dayofyear'),
]:
    test[column] = getattr(test['date'].dt, attribute)
test['weekofyear'] = test['date'].dt.isocalendar().week
# train.set_index('date', inplace=True)

# // Feature engineering experiments (currently disabled)


# test['interaction_vpd_landcover1'] = test['climate_vpd'] * test['landcover_1']
# test['interaction_tmmx_vs'] = test['climate_tmmx'] * test['climate_vs']
# test['lag_climate_pr'] = test.groupby('area')['climate_pr'].shift(1)
# test['lag_climate_vpd'] = test.groupby('area')['climate_vpd'].shift(1)
# test['spatial_cluster'] = kmeans.predict(test[['lat', 'lon', 'elevation']])
# test['burn_risk_index'] = (test['climate_vpd'] + test['climate_pdsi'] + test['climate_tmmx'] - test['climate_pr'])


# // TODO: add lag features to turn this into a supervised time-series problem



In [13]:
# 
# train['date'] = pd.to_datetime(train['ID'].apply(lambda x: x.split('_')[1]))
# train['month'] = train.date.dt.month
# train['year'] = train.date.dt.year
# train['area'] = pd.to_numeric(train['ID'].apply(lambda x: x.split('_')[0]))
# Move the parsed timestamp into the index of the training frame so rows can be
# selected by date. The guard keeps the cell re-runnable: once 'date' has been
# moved into the index it is no longer a column, and an unguarded set_index
# would raise KeyError on a second execution.
if 'date' in train.columns:
    train.set_index('date', inplace=True)

# # Apply same procedure on test set
# test['date'] = pd.to_datetime(test['ID'].apply(lambda x: x.split('_')[1]))
# test['area'] = pd.to_numeric(test['ID'].apply(lambda x: x.split('_')[0]))
if 'date' in test.columns:
    test.set_index('date', inplace=True)

# Chronological split of the training frame: rows dated before the split date
# go to X_train, rows on or after it go to y_test. Both are full frames (all
# columns), despite the X_/y_ prefixes. Timestamps are compared directly rather
# than through formatted strings.
split_date = pd.Timestamp('2013-01-01')
X_train = train.loc[train.index < split_date]
y_test = train.loc[train.index >= split_date]

In [14]:
# from itertools import product
# import numpy as np

# # Define the parameter grid
# p = d = q = range(0, 3)
# pdq = list(product(p, d, q))
# seasonal_pdq = [(x[0], x[1], x[2], 12) for x in pdq]

# best_aic = np.inf
# best_params = None

# for param in pdq:
#     for param_seasonal in seasonal_pdq:
#         try:
#             mod = SARIMAX(train['burn_area'],
#                           order=param,
#                           seasonal_order=param_seasonal,
#                           enforce_stationarity=False,
#                           enforce_invertibility=False)
#             results = mod.fit()
#             if results.aic < best_aic:
#                 best_aic = results.aic
#                 best_params = (param, param_seasonal)
#         except:
#             continue

# print(f'Best SARIMA parameters: {best_params}')


In [15]:

# Per-area forecast for the test period.
#
# For every area present in `train`, a SARIMAX model with fixed orders is fitted
# on that area's burn_area series and forecast over as many steps as the area
# has test rows. If anything in that path raises, a stepwise seasonal auto_arima
# model is fitted instead. Either forecast is then passed through a default
# pykalman KalmanFilter, and the result is stored as an (ID, burn_area) frame
# in `preds`, one frame per area.


def _smooth_forecast(predicted_values, index):
    """Run a KalmanFilter over a 1-D forecast and return it as a Series.

    Parameters
    ----------
    predicted_values : array-like
        Forecast values, one per step (a pandas Series or a plain array).
    index : pandas.Index
        Index to attach to the filtered values; must have one entry per step.

    Returns
    -------
    pandas.Series
        Filtered state means named 'burn_area', indexed by `index`.
    """
    values = pd.Series(predicted_values).to_numpy()
    # The filter is started from the first forecast value.
    kf = KalmanFilter(initial_state_mean=values[0], n_dim_obs=1)
    filtered_means, _ = kf.filter(values)
    return pd.Series(filtered_means.flatten(), index=index, name='burn_area')


def _prediction_frame(area, smoothed):
    """Build the (ID, burn_area) frame for one area from a date-indexed Series."""
    return pd.DataFrame(
        {
            'ID': str(area) + '_' + smoothed.index.strftime('%Y-%m-%d').map(str),
            'burn_area': smoothed.values
        }
    )


preds = list()

for area in train['area'].unique():
    train_area = train.loc[train.area == area]
    test_area = test.loc[test.area == area]
    forecast_months = len(test_area)

    if forecast_months == 0:
        # Nothing to forecast; both model paths would fail on a zero-step horizon.
        print(f"Skipping area {area}: no rows in the test set")
        continue

    try:
        sarima_fit = SARIMAX(
            train_area['burn_area'],
            order=(2, 0, 0),
            seasonal_order=(2, 0, 1, 12)
        ).fit()
        sarimax_pred = sarima_fit.get_forecast(steps=forecast_months, freq='MS')
        predicted_values = sarimax_pred.predicted_mean
        predicted_data_kf = _smooth_forecast(predicted_values, test_area.index)

    except Exception as e:
        # Report why the primary model failed instead of swallowing the error.
        print(f"SARIMAX failed for area {area}: {e}. Falling back to auto_arima")
        model_arima = auto_arima(
            train_area['burn_area'],
            m=12,
            seasonal=True,
            start_p=0,
            start_q=0,
            max_order=5,
            test='adf',
            error_action='ignore',
            suppress_warnings=True,
            stepwise=True, trace=True
        )
        predicted_values = model_arima.predict(n_periods=forecast_months)
        predicted_data_kf = _smooth_forecast(predicted_values, test_area.index)

    predicted_data_df = _prediction_frame(area, predicted_data_kf)
    preds.append(predicted_data_df)

In [17]:
# Hold-out validation of the per-area pipeline.
#
# For every area in `train`, the rows before HOLDOUT_START are used for fitting
# and the rows from HOLDOUT_START onwards are used for scoring:
#   1. auto_arima picks the (seasonal) orders; on failure the default orders are used.
#   2. SARIMAX is fitted with those orders and forecast over the hold-out length.
#   3. If SARIMAX fails, the auto_arima model itself produces the forecast.
#   4. If that fails as well, an all-zero forecast is used.
# The forecast is passed through a default pykalman KalmanFilter and compared to
# the actual burn_area with RMSE. The reported score is the mean of the per-area RMSEs.
import numpy as np
from sklearn.metrics import mean_squared_error

# First date of the hold-out window.
HOLDOUT_START = pd.Timestamp('2011-01-01')
# Orders used when auto_arima tuning raises.
DEFAULT_ORDER = (2, 0, 0)
DEFAULT_SEASONAL_ORDER = (2, 0, 1, 12)


def _fit_auto_arima(series):
    """Run the stepwise seasonal (m=12) auto_arima search on `series` and return the model."""
    return auto_arima(series, m=12, seasonal=True,
                      start_p=0, start_q=0, max_order=5, test='adf', error_action='ignore',
                      suppress_warnings=True, stepwise=True, trace=False)


def _kalman_filter_forecast(predicted_values):
    """Run a KalmanFilter over a 1-D forecast and return the filtered means as a flat array."""
    values = pd.Series(predicted_values).to_numpy()
    # The filter is started from the first forecast value.
    kf = KalmanFilter(initial_state_mean=values[0], n_dim_obs=1)
    filtered_means, _ = kf.filter(values)
    return filtered_means.flatten()


# Per-area RMSE values and per-area (ID, burn_area) hold-out forecast frames.
rmse_values = []
predictions = []

# Loop through each area ID
for real_id in train['area'].unique():
    train_area = train.loc[train.area == real_id]

    # Train Test Split (chronological)
    X_train_area = train_area.loc[train_area.index < HOLDOUT_START]
    y_test_area = train_area.loc[train_area.index >= HOLDOUT_START]
    forecast_months = len(y_test_area)

    if forecast_months == 0 or X_train_area.empty:
        print(f"Skipping real_id {real_id}: no rows on one side of the {HOLDOUT_START.date()} split")
        continue

    # Step 1: order selection. The fitted model is kept so the fallback in
    # step 3 does not have to repeat the same search on the same data.
    tuned_model = None
    try:
        tuned_model = _fit_auto_arima(X_train_area['burn_area'])
        non_seasonal_order = tuned_model.order
        best_seasonal_order = tuned_model.seasonal_order
    except Exception as tune_error:
        print(f"Failed to tune real_id {real_id}: {tune_error}. Falling back to default seasonal and non-seasonal order")
        non_seasonal_order = DEFAULT_ORDER
        best_seasonal_order = DEFAULT_SEASONAL_ORDER

    try:
        # Step 2: SARIMAX with the selected orders, fitted on the training part only.
        sarima_fit = SARIMAX(X_train_area['burn_area'],
                             order=non_seasonal_order,
                             seasonal_order=best_seasonal_order).fit()
        sarimax_pred = sarima_fit.get_forecast(steps=forecast_months, freq='MS')
        predicted_values_kf = _kalman_filter_forecast(sarimax_pred.predicted_mean)

    except Exception as sarimax_error:
        print(f"Failed to model real_id {real_id}: {sarimax_error}. Generating ARIMA forecast")
        try:
            # Step 3: forecast directly from the auto_arima model.
            model_arima = tuned_model if tuned_model is not None else _fit_auto_arima(X_train_area['burn_area'])
            predicted_values_kf = _kalman_filter_forecast(model_arima.predict(n_periods=forecast_months))

        except Exception as arima_error:
            print(f"Failed to model real_id {real_id}: {arima_error}. Falling back to zero prediction.")
            # Step 4: simple zero prediction
            predicted_values_kf = np.zeros(forecast_months)

    # One row per hold-out date, labelled with this area's id and the hold-out
    # dates (not with names left over from another cell).
    predicted_data_df = pd.DataFrame(
        {
            'ID': str(real_id) + '_' + y_test_area.index.strftime('%Y-%m-%d'),
            'burn_area': predicted_values_kf
        }
    )
    predictions.append(predicted_data_df)

    # Score only the numeric target column, and only where the actual value is
    # present: mean_squared_error rejects NaN inputs.
    actual = y_test_area['burn_area'].to_numpy()
    has_actual = pd.notna(actual)
    if not has_actual.any():
        print(f"Skipping RMSE for real_id {real_id}: no non-missing actual values in the hold-out window")
        continue
    rmse_area = np.sqrt(mean_squared_error(actual[has_actual], predicted_values_kf[has_actual]))
    rmse_values.append(rmse_area)

# Calculate the overall RMSE by averaging the RMSE values of all areas
if rmse_values:
    overall_rmse = np.mean(rmse_values)
    print('Overall Test RMSE:', overall_rmse)
else:
    print('No valid RMSE values were calculated')

In [None]:
# Combine the per-area frames in `predictions` into one column and write the submission.
#
# NOTE(review): `predictions` is filled by the hold-out validation loop, while the
# cell that combines `preds` writes to the same CSV path. Confirm which of the two
# lists is meant to be submitted before running this cell.
#
# Row order of the combined frame: row 0 of every frame, then row 1 of every
# frame, and so on (frames in list order). This assumes the sample submission is
# ordered the same way — TODO confirm against SampleSubmission.csv.
if not predictions:
    raise ValueError("`predictions` is empty: run the validation loop before building a submission")

# keys=... adds the frame position as outer index level (level 0); the reset
# row position is the inner level (level 1). Sorting by (row, frame) interleaves
# the frames without hard-coding the horizon or concatenating inside a loop.
stacked = pd.concat(
    [frame.reset_index(drop=True) for frame in predictions],
    keys=range(len(predictions)),
)
df_combine = stacked.sort_index(level=[1, 0]).reset_index(drop=True)

# A silent length mismatch would leave NaN or misplaced rows in the submission.
if len(df_combine) != len(sub):
    raise ValueError(
        f"Combined predictions have {len(df_combine)} rows but the submission template has {len(sub)}"
    )

# Values are assigned by position and limited to the [0, 1] range.
sub['burn_area'] = df_combine['burn_area'].clip(0, 1).to_numpy()
sub.to_csv('../submissions/submit_02.csv', index=False)

In [16]:
# Combine the per-area test forecasts in `preds` into one column and write the submission.
#
# Row order of the combined frame: row 0 of every area frame, then row 1 of
# every area frame, and so on (areas in the order they were appended to `preds`).
# This assumes the sample submission is ordered the same way — TODO confirm
# against SampleSubmission.csv.
if not preds:
    raise ValueError("`preds` is empty: run the forecasting loop before building a submission")

# keys=... adds the frame position as outer index level (level 0); the reset
# row position is the inner level (level 1). Sorting by (row, frame) interleaves
# the frames without hard-coding the horizon or concatenating inside a loop.
stacked = pd.concat(
    [frame.reset_index(drop=True) for frame in preds],
    keys=range(len(preds)),
)
df_combine = stacked.sort_index(level=[1, 0]).reset_index(drop=True)

# A silent length mismatch would leave NaN or misplaced rows in the submission.
if len(df_combine) != len(sub):
    raise ValueError(
        f"Combined predictions have {len(df_combine)} rows but the submission template has {len(sub)}"
    )

# Values are assigned by position and limited to the [0, 1] range.
sub['burn_area'] = df_combine['burn_area'].clip(0, 1).to_numpy()
sub.to_csv('../submissions/submit_02.csv', index=False)