In [9]:
import warnings

import numpy as np
import pandas as pd
from pmdarima import auto_arima
from pykalman import KalmanFilter
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX
# Install a blanket "ignore" filter: every Python warning raised after this
# point is suppressed for the rest of the session.
# NOTE(review): presumably meant to quiet noisy model-fitting warnings, but it
# hides all other warnings as well — confirm this is intended.
warnings.filterwarnings('ignore')

In [10]:
# Read the three competition CSVs in one pass: the training history, the rows
# to forecast, and the submission template that predictions are written into.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Train.csv',
        '../data/Test.csv',
        '../data/SampleSubmission.csv',
    )
)

In [19]:
# Derive a datetime index and a numeric area id from the "<area>_<date>" IDs.
def _split_id(id_column):
    """Split ``"<area>_<date>"`` IDs into their two components.

    Parameters
    ----------
    id_column : pd.Series
        String IDs whose first ``'_'``-separated piece is the area and whose
        second piece is a date.

    Returns
    -------
    tuple of (pd.Series, pd.Series)
        The parsed dates and the numeric area ids, both aligned with
        ``id_column``.
    """
    id_parts = id_column.str.split('_', expand=True)
    return pd.to_datetime(id_parts[1]), pd.to_numeric(id_parts[0])


train_dates, train_areas = _split_id(train['ID'])
train = train.assign(
    date=train_dates,
    month=train_dates.dt.month,
    year=train_dates.dt.year,
    area=train_areas,
).set_index('date')

# Apply same procedure on test set (no month/year columns are added there)
test_dates, test_areas = _split_id(test['ID'])
test = test.assign(date=test_dates, area=test_areas).set_index('date')

# Logically splitting data: rows before 2013 vs. rows from 2013 onwards.
# Both pieces are full frames (all columns) despite the X_/y_ naming.
# Compare the DatetimeIndex to a Timestamp directly rather than to strings.
SPLIT_DATE = pd.Timestamp('2013-01-01')
X_train = train.loc[train.index < SPLIT_DATE]
y_test = train.loc[train.index >= SPLIT_DATE]

In [15]:

# Forecast the test horizon for every area and write the first submission.
#
# Per area: fit a fixed-order seasonal SARIMAX on the area's burn_area history;
# if that raises, fall back to a stepwise auto_arima search. Whichever forecast
# is produced is then run through a pykalman KalmanFilter (seeded with the
# first forecast value, otherwise default parameters).
preds = list()

for area in train['area'].unique():
    train_area = train.loc[train.area == area]
    test_area = test.loc[test.area == area]
    forecast_months = len(test_area)
    if forecast_months == 0:
        # Nothing to forecast for an area that has no rows in the test set.
        continue

    try:
        sarima_fit = SARIMAX(
            train_area['burn_area'],
            order=(2, 0, 0),
            seasonal_order=(2, 0, 1, 12)
        ).fit()
        sarimax_pred = sarima_fit.get_forecast(steps=forecast_months, freq='MS')
        predicted_values = sarimax_pred.predicted_mean

    except Exception as e:
        # Report the failure instead of swallowing it, then fall back.
        print(f"SARIMAX failed for area {area}: {e}. Falling back to auto_arima")
        model_arima = auto_arima(
            train_area['burn_area'],
            m=12,
            seasonal=True,
            start_p=0,
            start_q=0,
            max_order=5,
            test='adf',
            error_action='ignore',
            suppress_warnings=True,
            stepwise=True,
            trace=False  # keep the search quiet; one line per failure is printed above
        )
        predicted_values = model_arima.predict(n_periods=forecast_months)

    # Shared post-processing for both branches. np.asarray accepts a Series or
    # an ndarray, so it does not matter which one the forecaster returned.
    forecast_path = np.asarray(predicted_values, dtype=float)
    kf = KalmanFilter(initial_state_mean=forecast_path[0], n_dim_obs=1)
    predicted_values_kf, _ = kf.filter(forecast_path)

    predicted_data_df = pd.DataFrame(
        {
            # Reuse the test set's own IDs so they match the submission file
            # exactly instead of being rebuilt from area and date.
            'ID': test_area['ID'].to_numpy(),
            'burn_area': predicted_values_kf.flatten()
        }
    )
    preds.append(predicted_data_df)

# Combine all areas once and align with the submission template by ID, so the
# result depends neither on row order nor on a hard-coded number of months.
# Assumes `sub` has an 'ID' column using the same IDs as `test` — TODO confirm.
if preds:
    df_combine = pd.concat(preds, ignore_index=True)
else:
    df_combine = pd.DataFrame(columns=['ID', 'burn_area'])

sub['burn_area'] = sub['ID'].map(df_combine.set_index('ID')['burn_area'])

missing_rows = int(sub['burn_area'].isna().sum())
if missing_rows:
    raise ValueError(
        f"{missing_rows} submission rows have a missing or NaN forecast; "
        "check that every submission ID has a matching test row and model output"
    )

sub['burn_area'] = sub['burn_area'].clip(0, 1)
sub.to_csv('../submissions/submit_1.csv', index=False)

In [17]:
import numpy as np
from sklearn.metrics import mean_squared_error
# Tune SARIMA orders per area, score them on a hold-out, write submission 2.
#
# For each area:
#   1. auto_arima searches for orders on the pre-2011 history (fixed defaults
#      are used when the search fails);
#   2. a model with those orders, fit on the pre-2011 history, forecasts the
#      hold-out months and is scored with RMSE;
#   3. a model with the same orders is refit on the area's full history to
#      forecast the test months that go into the submission.
HOLDOUT_START = pd.Timestamp('2011-01-01')


def _kalman_filter_path(values):
    """Filter a 1-D forecast path with a pykalman ``KalmanFilter``.

    The filter is seeded with the first forecast value as its initial state
    mean and otherwise uses default parameters.

    Parameters
    ----------
    values : array-like
        Forecast values in time order (Series or ndarray).

    Returns
    -------
    np.ndarray
        The filtered state means as a flat array of the same length.
    """
    path = np.asarray(values, dtype=float)
    kf = KalmanFilter(initial_state_mean=path[0], n_dim_obs=1)
    filtered_means, _ = kf.filter(path)
    return filtered_means.flatten()


def _forecast_burn_area(history, steps, order, seasonal_order, label):
    """Forecast ``steps`` periods beyond ``history`` with graceful fallbacks.

    Tries a SARIMAX model with the given orders first; if that raises, tries a
    stepwise ``auto_arima`` search; if that raises too, returns zeros. Model
    forecasts are passed through ``_kalman_filter_path``.

    Parameters
    ----------
    history : pd.Series
        burn_area values to fit on.
    steps : int
        Number of periods to forecast.
    order, seasonal_order : tuple
        SARIMAX ``order`` and ``seasonal_order``.
    label : object
        Area identifier used in the failure messages.

    Returns
    -------
    np.ndarray
        ``steps`` forecast values.
    """
    if steps == 0:
        return np.zeros(0)

    try:
        sarima_fit = SARIMAX(history, order=order, seasonal_order=seasonal_order).fit()
        sarimax_pred = sarima_fit.get_forecast(steps=steps, freq='MS')
        return _kalman_filter_path(sarimax_pred.predicted_mean)
    except Exception as e:
        print(f"Failed to model real_id {label}: {e}. Generating ARIMA forecast")

    try:
        model_arima = auto_arima(history, m=12, seasonal=True,
                                 start_p=0, start_q=0, max_order=5, test='adf', error_action='ignore',
                                 suppress_warnings=True, stepwise=True, trace=False)
        return _kalman_filter_path(model_arima.predict(n_periods=steps))
    except Exception as e:
        print(f"ARIMA fallback failed for real_id {label}: {e}. Using zeros")
        return np.zeros(steps)


rmse_values = []
predictions = []

for real_id in train['area'].unique():
    train_area = train.loc[train.area == real_id]
    test_area = test.loc[test.area == real_id]
    X_train_area = train_area.loc[train_area.index < HOLDOUT_START]
    y_test_area = train_area.loc[train_area.index >= HOLDOUT_START]

    try:
        model_arima = auto_arima(X_train_area['burn_area'], m=12, seasonal=True,
                                 start_p=0, start_q=0, max_order=5, test='adf', error_action='ignore',
                                 suppress_warnings=True, stepwise=True, trace=False)
        non_seasonal_order = model_arima.order
        best_seasonal_order = model_arima.seasonal_order

    except Exception as e:
        print(f"Failed to tune real_id {real_id}: {e}. "
              "Falling back to default seasonal and non-seasonal order")
        non_seasonal_order = (2, 0, 0)
        best_seasonal_order = (2, 0, 1, 12)

    # Hold-out score: fit on pre-2011 data, compare against the held-out months.
    if len(y_test_area):
        holdout_forecast = _forecast_burn_area(
            X_train_area['burn_area'], len(y_test_area),
            non_seasonal_order, best_seasonal_order, real_id,
        )
        rmse_area = np.sqrt(mean_squared_error(y_test_area['burn_area'], holdout_forecast))
        rmse_values.append(rmse_area)

    # Submission forecast: refit on the full history so the test months are
    # predicted from the most recent data, using this area's own forecast.
    if len(test_area):
        test_forecast = _forecast_burn_area(
            train_area['burn_area'], len(test_area),
            non_seasonal_order, best_seasonal_order, real_id,
        )
        predictions.append(
            pd.DataFrame(
                {
                    'ID': test_area['ID'].to_numpy(),
                    'burn_area': test_forecast
                }
            )
        )

if rmse_values:
    # This is the mean of the per-area RMSE values, not a pooled RMSE.
    overall_rmse = np.mean(rmse_values)
    print('Overall Test RMSE:', overall_rmse)
else:
    print('No valid RMSE values were calculated')


# Combine all areas once and align with the submission template by ID.
# Assumes `sub` has an 'ID' column using the same IDs as `test` — TODO confirm.
if predictions:
    df_combine = pd.concat(predictions, ignore_index=True)
else:
    df_combine = pd.DataFrame(columns=['ID', 'burn_area'])

sub['burn_area'] = sub['ID'].map(df_combine.set_index('ID')['burn_area'])

missing_rows = int(sub['burn_area'].isna().sum())
if missing_rows:
    raise ValueError(
        f"{missing_rows} submission rows have a missing or NaN forecast; "
        "check that every submission ID has a matching test row and model output"
    )

sub['burn_area'] = sub['burn_area'].clip(0, 1)
sub.to_csv('../submissions/submit_2.csv', index=False)

Failed to tune real_id 102: All lag values up to 'maxlag' produced singular matrices. Consider using a longer series, a different lag term or a different test.. Falling back to default seasonal and non-seasonal order
Failed to tune real_id 368: All lag values up to 'maxlag' produced singular matrices. Consider using a longer series, a different lag term or a different test.. Falling back to default seasonal and non-seasonal order
Failed to tune real_id 377: Encountered exception in stationarity test ('adf'). This can occur in seasonal settings when a large enough `m` coupled with a large enough `D` difference the training array into too few samples for OLS (input contains 108 samples). Try fitting on a larger training size (raised from LinAlgError: Singular matrix). Falling back to default seasonal and non-seasonal order
Overall Test RMSE: 0.014202937143650576


In [None]:
# Score the fixed default SARIMA orders on a hold-out, write submission 3.
#
# For each area, a model fit on the pre-2011 history forecasts the hold-out
# months and is scored with RMSE; a model with the same orders is then refit on
# the area's full history to forecast the test months for the submission.
non_seasonal_order = (2, 0, 0)
best_seasonal_order = (2, 0, 1, 12)
HOLDOUT_START = pd.Timestamp('2011-01-01')


def _kalman_filter_path(values):
    """Filter a 1-D forecast path with a pykalman ``KalmanFilter``.

    The filter is seeded with the first forecast value as its initial state
    mean and otherwise uses default parameters.

    Parameters
    ----------
    values : array-like
        Forecast values in time order (Series or ndarray).

    Returns
    -------
    np.ndarray
        The filtered state means as a flat array of the same length.
    """
    path = np.asarray(values, dtype=float)
    kf = KalmanFilter(initial_state_mean=path[0], n_dim_obs=1)
    filtered_means, _ = kf.filter(path)
    return filtered_means.flatten()


def _forecast_burn_area(history, steps, order, seasonal_order, label):
    """Forecast ``steps`` periods beyond ``history`` with graceful fallbacks.

    Tries a SARIMAX model with the given orders first; if that raises, tries a
    stepwise ``auto_arima`` search; if that raises too, returns zeros. Model
    forecasts are passed through ``_kalman_filter_path``.

    Parameters
    ----------
    history : pd.Series
        burn_area values to fit on.
    steps : int
        Number of periods to forecast.
    order, seasonal_order : tuple
        SARIMAX ``order`` and ``seasonal_order``.
    label : object
        Area identifier used in the failure messages.

    Returns
    -------
    np.ndarray
        ``steps`` forecast values.
    """
    if steps == 0:
        return np.zeros(0)

    try:
        sarima_fit = SARIMAX(history, order=order, seasonal_order=seasonal_order).fit()
        sarimax_pred = sarima_fit.get_forecast(steps=steps, freq='MS')
        return _kalman_filter_path(sarimax_pred.predicted_mean)
    except Exception as e:
        print(f"Failed to model real_id {label}: {e}. Generating ARIMA forecast")

    try:
        model_arima = auto_arima(history, m=12, seasonal=True,
                                 start_p=0, start_q=0, max_order=5, test='adf', error_action='ignore',
                                 suppress_warnings=True, stepwise=True, trace=False)
        return _kalman_filter_path(model_arima.predict(n_periods=steps))
    except Exception as e:
        print(f"ARIMA fallback failed for real_id {label}: {e}. Using zeros")
        return np.zeros(steps)


rmse_values = []
# Per-area submission frames (ID, burn_area) collected by the loop below.
predicted_values = []

# The area id lives in the derived 'area' column; this notebook never creates
# a 'real_id' column.
for real_id in train['area'].unique():
    train_area = train.loc[train.area == real_id]
    test_area = test.loc[test.area == real_id]
    X_train_area = train_area.loc[train_area.index < HOLDOUT_START]
    y_test_area = train_area.loc[train_area.index >= HOLDOUT_START]

    # Hold-out score: fit on pre-2011 data, compare against the held-out months.
    if len(y_test_area):
        holdout_forecast = _forecast_burn_area(
            X_train_area['burn_area'], len(y_test_area),
            non_seasonal_order, best_seasonal_order, real_id,
        )
        rmse_area = np.sqrt(mean_squared_error(y_test_area['burn_area'], holdout_forecast))
        rmse_values.append(rmse_area)

    # Submission forecast: refit on the full history so the test months are
    # predicted from the most recent data, using this area's own forecast.
    if len(test_area):
        test_forecast = _forecast_burn_area(
            train_area['burn_area'], len(test_area),
            non_seasonal_order, best_seasonal_order, real_id,
        )
        predicted_values.append(
            pd.DataFrame(
                {
                    'ID': test_area['ID'].to_numpy(),
                    'burn_area': test_forecast
                }
            )
        )

if rmse_values:
    # This is the mean of the per-area RMSE values, not a pooled RMSE.
    overall_rmse = np.mean(rmse_values)
    print('Overall Test RMSE:', overall_rmse)
else:
    print('No valid RMSE values were calculated')


# Combine all areas once and align with the submission template by ID.
# Assumes `sub` has an 'ID' column using the same IDs as `test` — TODO confirm.
if predicted_values:
    df_combine = pd.concat(predicted_values, ignore_index=True)
else:
    df_combine = pd.DataFrame(columns=['ID', 'burn_area'])

sub['burn_area'] = sub['ID'].map(df_combine.set_index('ID')['burn_area'])

missing_rows = int(sub['burn_area'].isna().sum())
if missing_rows:
    raise ValueError(
        f"{missing_rows} submission rows have a missing or NaN forecast; "
        "check that every submission ID has a matching test row and model output"
    )

sub['burn_area'] = sub['burn_area'].clip(0, 1)
sub.to_csv('../submissions/submit_3.csv', index=False)