In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data_dir = "/home/angps/Documents/Thesis/Data/"
image_dir = "/home/angps/Documents/Thesis/Report/Images/"

df_atleast_50_cts = pd.read_csv(data_dir + 'data_>=50cts.csv')
full_df = pd.read_csv(data_dir + 'data.csv')

# **1. Subsetting data with at least 50 counts**

In [2]:
df_atleast_50_cts.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,195,196,197,198,199,200,201,202,203,204
0,0,1,5,3,1,1,1,4,6,2,...,7,0,1,1,2,1,2,1,1,1
1,1,2,3,0,3,0,1,2,4,2,...,0,0,2,0,0,0,0,0,1,0
2,0,0,2,5,1,0,0,4,4,0,...,2,1,0,0,1,0,1,1,0,0
3,0,2,7,5,4,0,4,4,4,2,...,2,0,0,1,3,0,5,1,0,0
4,0,0,1,6,0,0,1,3,3,1,...,0,1,0,0,1,0,3,0,0,0


# **2. Fit ARIMA model as baseline**

In [3]:
from statsmodels.tools.eval_measures import rmse, aic
from pmdarima.arima import auto_arima

# TRAIN TEST SPLIT:  Use 198 training data and 6 training data
train_df = df_atleast_50_cts.iloc[:, 0:198]
test_df = df_atleast_50_cts.iloc[:, 198:]
train_full = full_df.iloc[:, 0:198]
test_full = full_df.iloc[:, 198:]



In [4]:
def fit_arima(train, test):
    model = auto_arima(train, trace=False, error_action='ignore', suppress_warnings=True)  # Fit auto arima
    model.fit(train)
    forecast = model.predict(n_periods=len(test))  # Predict
    return forecast

def compute_errors(pred, act):
    err = np.square(np.subtract(pred, act)).sum()
    return err

## **2.1 ARIMA baseline for locations > 50 non-zero counts**

In [5]:
loss = []
fitted = []
MSE = 0
for i in range(len(train_df)):
    train = train_df.iloc[i,:]
    test = test_df.iloc[i,:]
    forecast = fit_arima(train, test)
    loss.extend(forecast-test.values)
    forecast_err = compute_errors(forecast, test)
    MSE += forecast_err
print("MSE for baseline ARIMA model on subset of dataset: " + str(round(MSE/6, 3)))

MSE for baseline ARIMA model on subset of dataset: 47.599


## **2.2 ARIMA baseline for all locations**

In [7]:
import warnings
warnings.filterwarnings('ignore')

loss = []
MSE = 0
for i in range(len(train_full)):
    train = train_full.iloc[i,:]
    test = test_full.iloc[i,:]
    forecast = fit_arima(train, test)
    forecast_err = compute_errors(forecast, test)
    MSE += forecast_err
print("MSE for baseline ARIMA model on full dataset: " + str(round(MSE/6, 3)))

MSE for baseline ARIMA model on full dataset: 73.29
