In [10]:
import pandas as pd
from pmdarima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pykalman import KalmanFilter
from sklearn.cluster import KMeans
import warnings
# Blanket filter: suppresses every warning category for the rest of the
# kernel session, including any emitted by the model-fitting cells below.
warnings.filterwarnings('ignore')

In [11]:
# Read the three competition CSVs (training rows, test rows, and the
# submission template) in a single pass.
train, test, sub = map(
    pd.read_csv,
    (
        '../data/Train.csv',
        '../data/Test.csv',
        '../data/SampleSubmission.csv',
    ),
)

In [12]:
def _parse_id(ids):
    """Split ``<area>_<date>`` identifiers into their two components.

    Parameters
    ----------
    ids : pd.Series
        String identifiers whose first ``'_'``-separated field is numeric
        (the area) and whose second field is a parseable date.

    Returns
    -------
    tuple of (pd.Series, pd.Series)
        The parsed dates (datetime64) and the numeric area codes, both
        aligned with the index of ``ids``.
    """
    # Vectorised .str accessors replace the per-row Python lambdas.
    parts = ids.str.split('_')
    return pd.to_datetime(parts.str[1]), pd.to_numeric(parts.str[0])


# Derive date / month / year / area from the ID and index the frame by date.
# Keyword order in assign() keeps the original column order (month, year, area).
train_dates, train_areas = _parse_id(train['ID'])
train = train.assign(
    date=train_dates,
    month=train_dates.dt.month,
    year=train_dates.dt.year,
    area=train_areas,
).set_index('date')

# Apply same procedure on test set (no month/year columns are added there)
test_dates, test_areas = _parse_id(test['ID'])
test = test.assign(date=test_dates, area=test_areas).set_index('date')

# Logically splitting data: rows before 2013-01-01 vs. rows from that date on.
# Compare datetimes directly instead of formatting every index entry to text.
split_date = pd.Timestamp('2013-01-01')
X_train = train.loc[train.index < split_date]
y_test = train.loc[train.index >= split_date]

In [14]:

def _smooth_forecast(point_forecast, area, test_area):
    """Kalman-filter a point forecast and package it as a submission frame.

    Parameters
    ----------
    point_forecast : array-like
        One forecast value per row of ``test_area``. Accepts a pandas Series
        or a plain array.
    area : scalar
        Area identifier; ``str(area)`` becomes the prefix of each ``ID``.
    test_area : pd.DataFrame
        Test rows for ``area``. Its index is formatted with
        ``strftime('%Y-%m-%d')`` to build the ``ID`` suffix.

    Returns
    -------
    pd.DataFrame
        Columns ``ID`` and ``burn_area`` on a default ``RangeIndex`` (row
        position == forecast step), which the submission cell relies on.
    """
    # Normalise to a 1-D array so both Series and ndarray inputs work.
    forecast_values = pd.Series(point_forecast).to_numpy()
    # pykalman defaults are used apart from seeding the state with the
    # first forecast value.
    kf = KalmanFilter(initial_state_mean=forecast_values[0], n_dim_obs=1)
    filtered_means, _ = kf.filter(forecast_values)
    return pd.DataFrame(
        {
            'ID': str(area) + '_' + test_area.index.strftime('%Y-%m-%d').map(str),
            'burn_area': filtered_means.flatten(),
        }
    )


# One forecast frame per area, in the order areas first appear in `train`.
preds = []

for area in train['area'].unique():
    train_area = train.loc[train.area == area]
    test_area = test.loc[test.area == area]
    forecast_months = len(test_area)
    if forecast_months == 0:
        # Nothing to predict for this area; a 0-step forecast would only
        # trigger the (expensive) fallback and then fail again.
        continue

    try:
        sarima_fit = SARIMAX(
            train_area['burn_area'],
            order=(2, 0, 0),
            seasonal_order=(2, 0, 1, 12)
        ).fit()
        # NOTE(review): `freq='MS'` is forwarded through **kwargs -- confirm
        # the installed statsmodels accepts it; if it raises, every area
        # ends up on the auto_arima path below.
        sarimax_pred = sarima_fit.get_forecast(steps=forecast_months, freq='MS')
        predicted_data_df = _smooth_forecast(
            sarimax_pred.predicted_mean, area, test_area
        )

    except Exception as e:
        # Best-effort fallback is kept, but the reason is reported instead of
        # being silently discarded.
        print(f'SARIMAX failed for area {area} ({e!r}); falling back to auto_arima')
        model_arima = auto_arima(
            train_area['burn_area'],
            m=12,
            seasonal=True,
            start_p=0,
            start_q=0,
            max_order=5,
            test='adf',
            error_action='ignore',
            suppress_warnings=True,
            stepwise=True, trace=True
        )
        # The return type of predict() (Series vs. ndarray) is presumably
        # version-dependent; _smooth_forecast handles either.
        predicted_values = model_arima.predict(n_periods=forecast_months)
        predicted_data_df = _smooth_forecast(predicted_values, area, test_area)

    preds.append(predicted_data_df)
    


# Interleave the per-area forecasts month-major: step 0 of every area (in
# `preds` order), then step 1 of every area, and so on.
#
# The previous nested loop masked `preds[i]` with `preds[n].index == n`
# (wrong list element), which only worked while every frame had identical
# length and there were at least 48 areas; it also hard-coded 48 steps and
# re-concatenated the growing frame on every iteration.
if preds:
    stacked = pd.concat(
        preds,
        keys=range(len(preds)),
        names=['area_pos', 'month_pos'],
    ).reset_index()
    df_combine = (
        stacked
        .sort_values(['month_pos', 'area_pos'])
        .loc[:, ['ID', 'burn_area']]
        .reset_index(drop=True)
    )
else:
    # No forecasts were produced: keep the empty two-column frame.
    df_combine = pd.DataFrame(columns=['ID', 'burn_area'])

# Values are assigned by row position (index alignment), not by ID.
# NOTE(review): this assumes `sub` is ordered by month and then by area in
# the same order as `preds` -- confirm against SampleSubmission.csv.
sub['burn_area'] = df_combine['burn_area']
# Restrict predictions to the [0, 1] interval before writing.
sub['burn_area'] = sub['burn_area'].clip(0, 1)
sub.to_csv('../submissions/submit_7.csv', index=False)