### Starting Over
I've gone over our analysis and picked up a couple of things:

1. We've identified that there appears to be some interaction affect focused on Saturdays.  This will mean covariance with our Saturday/Seasonal variable
2. Ideally, we shouldn't explicitly include Saturday - a properly fit time series model *should* account for it

In [20]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
from scipy import stats
from datetime import datetime
import matplotlib.pyplot as plt
import statsmodels.api as sm
import patsy

from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import pmdarima as pm
import seaborn as sns
from scipy.stats import zscore

crimes = ['TNOs', 'Drugs', 'Robbery','SexualOffences', 'Theft', 'VAP']


def calc_auto(patsy_string):
    """takes a Patsy string in the given format, returns a dataframe for all crime types"""

    result_df = pd.DataFrame()
    crimes = ['TNOs', 'Drugs', 'Robbery','SexualOffences', 'Theft', 'VAP']
    for crime in crimes:
        string_arima = crime + patsy_string
        y, X = patsy.dmatrices(string_arima, data=combined, return_type="dataframe")
        auto = pm.auto_arima(y,exogenous=X)
        #result_df[crime+"_params"] = auto.to_dict()['params'].round(3)
        try:
            X = X.drop(columns="C(Saturday)[False]")
        except:
            pass
        for value in auto.to_dict()['params'].keys():
            result_df.loc[value, crime+"_params"] = auto.to_dict()['params'][value]
        result_df[crime+"_p"] = auto.to_dict()['pvalues'].round(3)
        predictions = auto.predict_in_sample(exogenous=X)
        mse = mean_squared_error(y, predictions)
        result_df.loc["Order", crime+"_fit"] = str(auto.to_dict()['order'])
        result_df.loc["SeasonOrder", crime+"_fit"] = str(auto.to_dict()['seasonal_order'])
        result_df.loc["AIC", crime+"_fit"] = auto.aic()
        result_df.loc["BIC", crime+"_fit"] = auto.bic()
        result_df.loc["MSE", crime+"_fit"] = mse
        result_df.loc["RMSE", crime+"_fit"] = str(np.sqrt(mse))

    result_df.to_clipboard()
    return result_df

def calc_weekend(patsy_string):
    """takes a Patsy string in the given format, returns a dataframe for all crime types, with a weekend season"""

    result_df = pd.DataFrame()
    crimes = ['TNOs', 'Drugs', 'Robbery','SexualOffences', 'Theft', 'VAP']
    for crime in crimes:
        string_arima = crime + patsy_string
        y, X = patsy.dmatrices(string_arima, data=combined, return_type="dataframe")
        auto = pm.auto_arima(y,exogenous=X, m=2)
        #result_df[crime+"_params"] = auto.to_dict()['params'].round(3)
        try:
            X = X.drop(columns="C(Saturday)[False]")
        except:
            pass
        for value in auto.to_dict()['params'].keys():
            result_df.loc[value, crime+"_params"] = auto.to_dict()['params'][value]
        result_df[crime+"_p"] = auto.to_dict()['pvalues'].round(3)
        predictions = auto.predict_in_sample(exogenous=X)
        mse = mean_squared_error(y, predictions)
        result_df.loc["Order", crime+"_fit"] = str(auto.to_dict()['order'])
        result_df.loc["SeasonOrder", crime+"_fit"] = str(auto.to_dict()['seasonal_order'])
        result_df.loc["AIC", crime+"_fit"] = auto.aic()
        result_df.loc["BIC", crime+"_fit"] = auto.bic()
        result_df.loc["MSE", crime+"_fit"] = mse
        result_df.loc["RMSE", crime+"_fit"] = str(np.sqrt(mse))

    result_df.to_clipboard()
    return result_df


def calc_arima(patsy_string):
    """takes a Patsy string in the given format, returns a dataframe for all crime types"""

    result_df = pd.DataFrame()
    crimes = ['TNOs', 'Drugs', 'Robbery','SexualOffences', 'Theft', 'VAP']
    for crime in crimes:
        string_arima = crime + patsy_string
        y, X = patsy.dmatrices(string_arima, data=combined, return_type="dataframe")
        try:
            X = X.drop(columns=["C(Saturday)[False]"])
        except:
            pass
        model = ARIMA(y, order=(0,1,1), exog=X, dates=dates)
        results = model.fit()        #result_df[crime+"_params"] = auto.to_dict()['params'].round(3)
        for value in results.params.to_dict().keys():
            result_df.loc[value, crime+"_params"] = results.params.to_dict()[value]
        result_df[crime+"_p"] = results.pvalues.round(3)
        mse = results.mse
        result_df.loc["AIC", crime+"_fit"] = results.aic
        result_df.loc["BIC", crime+"_fit"] = results.bic
        result_df.loc["MSE", crime+"_fit"] = mse
        result_df.loc["RMSE", crime+"_fit"] = str(np.sqrt(mse))

    result_df.to_clipboard()
    return result_df





In [2]:
df = pd.read_csv("weekend_complete.csv").drop(columns=["date", "date.1"])
df["date"] = pd.to_datetime(df["date_new"])
week_mask = "Fri Sat"
dates = pd.bdate_range("2018-01-26", "2018-07-28", freq="C", weekmask=week_mask)

dateframe = pd.DataFrame(index=dates)

dateframe["DayOfWeek"] = dateframe.index.weekday.astype("category")

dateframe["date"] = dateframe.index

combined = dateframe.merge(df, how="left", on="date")
combined.index = dateframe.index

combined.rename(columns={"MSC Numbers":"OfficerDosage", "Sexual Offences":"SexualOffences", "Theft and Handling":"Theft"}, inplace=True)

combined["Saturday"] = combined["DayOfWeek"] == 5

for col in ['OfficerDosage',
       'TNOs', 'Drugs', 'Robbery',
       'SexualOffences', 'Theft', 'VAP', 'AvTemp', 'Precip']:
    new_name = "z_" + col
    combined[new_name] = zscore(combined[col])

combined

Unnamed: 0,DayOfWeek,date,date_new,month,year,OfficerDosage,TNOs,isWeekend,Average of Day of Week,Drugs,...,Saturday,z_OfficerDosage,z_TNOs,z_Drugs,z_Robbery,z_SexualOffences,z_Theft,z_VAP,z_AvTemp,z_Precip
2018-01-26,4,2018-01-26,2018-1-26,1,2018,0,1,1,5,0,...,False,-1.463967,-1.953031,-0.604165,0.714231,-0.390434,-1.578752,-1.41348,-0.860979,-0.497098
2018-01-27,5,2018-01-27,2018-1-27,1,2018,0,2,1,6,0,...,True,-1.463967,-1.761974,-0.604165,-0.962659,-0.390434,-1.310662,-0.911322,-0.265527,-0.228963
2018-02-02,4,2018-02-02,2018-2-2,2,2018,9,8,1,5,0,...,False,0.787577,-0.615629,-0.604165,0.714231,-0.390434,-0.506392,-0.409165,-1.067402,-0.497098
2018-02-03,5,2018-02-03,2018-2-3,2,2018,4,16,1,6,0,...,True,-0.463281,0.91283,-0.604165,2.39112,-0.390434,1.370238,-0.409165,-1.353219,0.932955
2018-02-09,4,2018-02-09,2018-2-9,2,2018,10,12,1,5,0,...,False,1.037749,0.1486,-0.604165,0.714231,-0.390434,0.297878,-0.409165,-1.45643,1.796945
2018-02-10,5,2018-02-10,2018-2-10,2,2018,6,11,1,6,0,...,True,0.037062,-0.042457,-0.604165,0.714231,1.717911,0.029788,-0.409165,-0.511647,0.605234
2018-02-16,4,2018-02-16,2018-2-16,2,2018,5,8,1,5,1,...,False,-0.213109,-0.615629,1.112935,-0.962659,-0.390434,0.029788,-1.41348,-1.130917,-0.497098
2018-02-17,5,2018-02-17,2018-2-17,2,2018,5,26,1,6,0,...,True,-0.213109,2.823404,-0.604165,0.714231,3.826257,2.710687,1.599464,-1.059463,-0.497098
2018-02-23,4,2018-02-23,2018-2-23,2,2018,11,6,1,5,1,...,False,1.287921,-0.997744,1.112935,-0.962659,-0.390434,-1.042572,0.092992,-1.631096,-0.497098
2018-02-24,5,2018-02-24,2018-2-24,2,2018,3,7,1,6,0,...,True,-0.713452,-0.806687,-0.604165,0.714231,-0.390434,-0.506392,-0.409165,-1.496127,-0.497098


I've rebuilt the data on first principles here.

#### Auto ARIMA and Saturday Effect
We begin by testing our Saturday model and our auto-ARIMA fit.  Let's compare this to the ARIMA we had earlier.

In [3]:
y, X = patsy.dmatrices("TNOs ~ 0 + OfficerDosage + C(Saturday) + AvTemp", data=combined, return_type="dataframe")

X

Unnamed: 0,C(Saturday)[False],C(Saturday)[True],OfficerDosage,AvTemp
2018-01-26,1.0,0.0,0.0,7.35
2018-01-27,0.0,1.0,0.0,11.1
2018-02-02,1.0,0.0,9.0,6.05
2018-02-03,0.0,1.0,4.0,4.25
2018-02-09,1.0,0.0,10.0,3.6
2018-02-10,0.0,1.0,6.0,9.55
2018-02-16,1.0,0.0,5.0,5.65
2018-02-17,0.0,1.0,5.0,6.1
2018-02-23,1.0,0.0,11.0,2.5
2018-02-24,0.0,1.0,3.0,3.35


In [4]:
# 0,1,1 ARIMA model
model = ARIMA(y, order=(0,1,1), exog=X.drop(columns=["C(Saturday)[False]"]), dates=dates)
model_fit = model.fit()
model_fit.summary()



0,1,2,3
Dep. Variable:,TNOs,No. Observations:,54.0
Model:,"ARIMA(0, 1, 1)",Log Likelihood,-160.082
Date:,"Mon, 05 Jul 2021",AIC,330.165
Time:,17:46:26,BIC,340.016
Sample:,0,HQIC,333.953
,- 54,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
C(Saturday)[True],4.6247,1.788,2.586,0.010,1.120,8.130
OfficerDosage,0.0925,0.262,0.353,0.724,-0.421,0.605
AvTemp,-0.2783,0.176,-1.582,0.114,-0.623,0.067
ma.L1,-0.8850,0.082,-10.767,0.000,-1.046,-0.724
sigma2,23.9058,4.680,5.108,0.000,14.733,33.079

0,1,2,3
Ljung-Box (L1) (Q):,0.12,Jarque-Bera (JB):,0.37
Prob(Q):,0.73,Prob(JB):,0.83
Heteroskedasticity (H):,0.57,Skew:,0.19
Prob(H) (two-sided):,0.24,Kurtosis:,3.14


In [5]:
predictions = model_fit.predict(typ='levels')
mse = mean_squared_error(y, predictions)
print("Mean Squared Error:" + str(mse))
print("RMSE:" + str(np.sqrt(mse)))

Mean Squared Error:24.457131800701738
RMSE:4.945415230362537


In [6]:
y, X = patsy.dmatrices("TNOs ~ 0 + OfficerDosage*C(Saturday) + AvTemp", data=combined, return_type="dataframe")

# 0,1,1 ARIMA model
model = ARIMA(y, order=(0,1,1), exog=X.drop(columns=["C(Saturday)[False]"]), dates=dates)
model_fit = model.fit()
model_fit.summary()



0,1,2,3
Dep. Variable:,TNOs,No. Observations:,54.0
Model:,"ARIMA(0, 1, 1)",Log Likelihood,-159.771
Date:,"Mon, 05 Jul 2021",AIC,331.542
Time:,17:46:26,BIC,343.364
Sample:,0,HQIC,336.088
,- 54,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
C(Saturday)[True],6.2465,3.807,1.641,0.101,-1.215,13.708
OfficerDosage,0.1889,0.357,0.529,0.597,-0.511,0.889
OfficerDosage:C(Saturday)[T.True],-0.3244,0.554,-0.586,0.558,-1.410,0.761
AvTemp,-0.2742,0.187,-1.464,0.143,-0.641,0.093
ma.L1,-0.8884,0.088,-10.123,0.000,-1.060,-0.716
sigma2,23.6138,4.745,4.977,0.000,14.315,32.913

0,1,2,3
Ljung-Box (L1) (Q):,0.1,Jarque-Bera (JB):,0.6
Prob(Q):,0.75,Prob(JB):,0.74
Heteroskedasticity (H):,0.53,Skew:,0.26
Prob(H) (two-sided):,0.18,Kurtosis:,3.07


In [7]:
y, X = patsy.dmatrices("TNOs ~ 0 + OfficerDosage:C(Saturday) + AvTemp", data=combined, return_type="dataframe")

# 0,1,1 ARIMA model
model = ARIMA(y, order=(0,1,1), exog=X, dates=dates)
model_fit = model.fit()
model_fit.summary()



0,1,2,3
Dep. Variable:,TNOs,No. Observations:,54.0
Model:,"ARIMA(0, 1, 1)",Log Likelihood,-162.651
Date:,"Mon, 05 Jul 2021",AIC,335.302
Time:,17:46:26,BIC,345.153
Sample:,0,HQIC,339.09
,- 54,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
OfficerDosage:C(Saturday)[False],-0.2111,0.222,-0.950,0.342,-0.647,0.225
OfficerDosage:C(Saturday)[True],0.2781,0.342,0.813,0.416,-0.392,0.948
AvTemp,-0.3111,0.192,-1.620,0.105,-0.688,0.065
ma.L1,-0.8689,0.081,-10.707,0.000,-1.028,-0.710
sigma2,26.3995,5.103,5.173,0.000,16.397,36.402

0,1,2,3
Ljung-Box (L1) (Q):,0.46,Jarque-Bera (JB):,0.8
Prob(Q):,0.5,Prob(JB):,0.67
Heteroskedasticity (H):,0.61,Skew:,0.29
Prob(H) (two-sided):,0.3,Kurtosis:,3.12


Notice how in the above 2 models, by adding and removing our Saturday model and the interaction terms, the model fit fluctuates, but none become significant.

I suspect this is because all 3 - Saturday, Officer numbers, and interaction of the two - are covarying and conveying pretty similar information to the model


In [8]:
y, X = patsy.dmatrices("TNOs ~ 0 + OfficerDosage + C(Saturday) + AvTemp", data=combined, return_type="dataframe")

model = ARIMA(y, order=(0,1,1), exog=X.drop(columns=["C(Saturday)[False]"]), dates=dates)
model_fit = model.fit()
model_fit.summary()



0,1,2,3
Dep. Variable:,TNOs,No. Observations:,54.0
Model:,"ARIMA(0, 1, 1)",Log Likelihood,-160.082
Date:,"Mon, 05 Jul 2021",AIC,330.165
Time:,17:46:26,BIC,340.016
Sample:,0,HQIC,333.953
,- 54,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
C(Saturday)[True],4.6247,1.788,2.586,0.010,1.120,8.130
OfficerDosage,0.0925,0.262,0.353,0.724,-0.421,0.605
AvTemp,-0.2783,0.176,-1.582,0.114,-0.623,0.067
ma.L1,-0.8850,0.082,-10.767,0.000,-1.046,-0.724
sigma2,23.9058,4.680,5.108,0.000,14.733,33.079

0,1,2,3
Ljung-Box (L1) (Q):,0.12,Jarque-Bera (JB):,0.37
Prob(Q):,0.73,Prob(JB):,0.83
Heteroskedasticity (H):,0.57,Skew:,0.19
Prob(H) (two-sided):,0.24,Kurtosis:,3.14


In [9]:
predictions = model_fit.predict(typ='levels')
mse = mean_squared_error(y, predictions)
print("Mean Squared Error:" + str(mse))
print("RMSE:" + str(np.sqrt(mse)))

Mean Squared Error:24.457131800701738
RMSE:4.945415230362537


So our initial model selected, above, with an AIC of 330 - a 0,1,1 ARIMA with Temperature, Dosage and a Saturday boolean - continues to be the best performing, according to AIC and BIC.

Let's quickly do the repeat with all other crime types.

In [10]:
calc_arima(" ~ 0 + OfficerDosage + C(Saturday) + AvTemp")



Unnamed: 0,TNOs_params,TNOs_p,TNOs_fit,Drugs_params,Drugs_p,Drugs_fit,Robbery_params,Robbery_p,Robbery_fit,SexualOffences_params,SexualOffences_p,SexualOffences_fit,Theft_params,Theft_p,Theft_fit,VAP_params,VAP_p,VAP_fit
C(Saturday)[True],4.624698,0.01,,0.159731,0.359,,0.23294,0.229,,0.269997,0.095,,2.970185,0.015,,1.075691,0.168,
OfficerDosage,0.092455,0.724,,0.011471,0.714,,0.026759,0.191,,0.028759,0.28,,0.043184,0.823,,0.062953,0.513,
AvTemp,-0.278276,0.114,,-2.5e-05,0.999,,-0.022766,0.198,,-0.000405,0.985,,-0.121395,0.138,,0.012496,0.815,
ma.L1,-0.885037,0.0,,-0.999268,0.85,,-0.999835,0.977,,-0.998922,0.872,,-0.999626,0.925,,-0.999884,0.982,
sigma2,23.905801,0.0,,0.341018,0.85,,0.329639,0.977,,0.214124,0.871,,11.647709,0.925,,3.81963,0.982,
AIC,,,330.164731,,,107.354828,,,105.571273,,,82.649621,,,294.498496,,,235.417166
BIC,,,340.016191,,,117.206287,,,115.422733,,,92.50108,,,304.349956,,,245.268625
MSE,,,24.457132,,,0.3523,,,0.381267,,,0.227365,,,12.446416,,,3.946884
RMSE,,,4.945415230362537,,,0.5935489304984939,,,0.6174681160756514,,,0.4768277131231525,,,3.5279477931393672,,,1.9866765427344084


Let's now compare to our auto-arima, seasonal and unseasonal.
We'll then try with and without the Saturday and the interaction.


In [11]:
y, X = patsy.dmatrices("TNOs ~ 0 + OfficerDosage + C(Saturday) + AvTemp", data=combined, return_type="dataframe")


auto = pm.auto_arima(y,exogenous=X.drop(columns=["C(Saturday)[False]"]), d=1, seasonal=False,
                     suppress_warnings=True, error_action="ignore", max_p=6,
                     max_order=None)


auto.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,54.0
Model:,"SARIMAX(0, 1, 1)",Log Likelihood,-160.082
Date:,"Mon, 05 Jul 2021",AIC,330.165
Time:,17:46:32,BIC,340.016
Sample:,0,HQIC,333.953
,- 54,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
C(Saturday)[True],4.6247,1.788,2.586,0.010,1.120,8.130
OfficerDosage,0.0925,0.262,0.353,0.724,-0.421,0.605
AvTemp,-0.2783,0.176,-1.582,0.114,-0.623,0.067
ma.L1,-0.8850,0.082,-10.767,0.000,-1.046,-0.724
sigma2,23.9058,4.680,5.108,0.000,14.733,33.079

0,1,2,3
Ljung-Box (L1) (Q):,0.12,Jarque-Bera (JB):,0.37
Prob(Q):,0.73,Prob(JB):,0.83
Heteroskedasticity (H):,0.57,Skew:,0.19
Prob(H) (two-sided):,0.24,Kurtosis:,3.14


Our auto-arima has selected the same model we did (which is a win).  Let's also try a few variations.

In [12]:
y, X = patsy.dmatrices("TNOs ~ 0 + OfficerDosage + C(Saturday) + AvTemp", data=combined, return_type="dataframe")


auto = pm.auto_arima(y,exogenous=X.drop(columns=["C(Saturday)[False]"]))


auto.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,54.0
Model:,SARIMAX,Log Likelihood,-160.636
Date:,"Mon, 05 Jul 2021",AIC,331.273
Time:,17:46:35,BIC,341.217
Sample:,0,HQIC,335.108
,- 54,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,9.0111,2.157,4.178,0.000,4.784,13.238
C(Saturday)[True],4.8401,1.654,2.927,0.003,1.599,8.081
OfficerDosage,0.1390,0.212,0.656,0.512,-0.276,0.554
AvTemp,-0.0800,0.094,-0.849,0.396,-0.265,0.105
sigma2,22.4570,4.328,5.189,0.000,13.975,30.939

0,1,2,3
Ljung-Box (L1) (Q):,0.03,Jarque-Bera (JB):,0.49
Prob(Q):,0.86,Prob(JB):,0.78
Heteroskedasticity (H):,0.54,Skew:,0.21
Prob(H) (two-sided):,0.2,Kurtosis:,3.18


In [13]:
y, X = patsy.dmatrices("TNOs ~ 0 + OfficerDosage + C(Saturday) + AvTemp", data=combined, return_type="dataframe")


arima = pm.auto_arima(y, exogenous=X.drop(columns=["C(Saturday)[False]"]), error_action='ignore', trace=True,
                      suppress_warnings=True, maxiter=30,
                      seasonal=False)

arima.summary()

Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=348.004, Time=0.47 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=343.927, Time=0.04 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=345.493, Time=0.05 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=345.341, Time=0.06 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=inf, Time=0.14 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=331.273, Time=0.11 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=333.241, Time=0.19 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=333.227, Time=0.12 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=335.218, Time=0.23 sec

Best model:  ARIMA(0,0,0)(0,0,0)[0] intercept
Total fit time: 1.441 seconds


0,1,2,3
Dep. Variable:,y,No. Observations:,54.0
Model:,SARIMAX,Log Likelihood,-160.636
Date:,"Mon, 05 Jul 2021",AIC,331.273
Time:,17:46:36,BIC,341.217
Sample:,0,HQIC,335.108
,- 54,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,9.0111,2.157,4.178,0.000,4.784,13.238
C(Saturday)[True],4.8401,1.654,2.927,0.003,1.599,8.081
OfficerDosage,0.1390,0.212,0.656,0.512,-0.276,0.554
AvTemp,-0.0800,0.094,-0.849,0.396,-0.265,0.105
sigma2,22.4570,4.328,5.189,0.000,13.975,30.939

0,1,2,3
Ljung-Box (L1) (Q):,0.03,Jarque-Bera (JB):,0.49
Prob(Q):,0.86,Prob(JB):,0.78
Heteroskedasticity (H):,0.54,Skew:,0.21
Prob(H) (two-sided):,0.2,Kurtosis:,3.18


If you don't specify anything, the default picks a very simple model, but that actually doesn't improve our fit loads either.

In [34]:
y, X = patsy.dmatrices("TNOs ~ 0 + OfficerDosage + C(Saturday) + AvTemp", data=combined, return_type="dataframe")

auto = pm.auto_arima(y,exogenous=X.drop(columns=["C(Saturday)[False]"]), d=1, seasonal=True,
                     suppress_warnings=True, error_action="ignore", max_p=6,
                     max_order=None)

auto.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,54.0
Model:,"SARIMAX(0, 1, 1)",Log Likelihood,-160.082
Date:,"Mon, 05 Jul 2021",AIC,330.165
Time:,07:24:31,BIC,340.016
Sample:,0,HQIC,333.953
,- 54,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
C(Saturday)[True],4.6247,1.788,2.586,0.010,1.120,8.130
OfficerDosage,0.0925,0.262,0.353,0.724,-0.421,0.605
AvTemp,-0.2783,0.176,-1.582,0.114,-0.623,0.067
ma.L1,-0.8850,0.082,-10.767,0.000,-1.046,-0.724
sigma2,23.9058,4.680,5.108,0.000,14.733,33.079

0,1,2,3
Ljung-Box (L1) (Q):,0.12,Jarque-Bera (JB):,0.37
Prob(Q):,0.73,Prob(JB):,0.83
Heteroskedasticity (H):,0.57,Skew:,0.19
Prob(H) (two-sided):,0.24,Kurtosis:,3.14


Notice how even with an explicitly seasonal model, a seasonal component is *not* added - it sticks to our original model.  Could this be because we're explicitly adding it as an exogenous variable?
Or did we not specify the M?

In [16]:

y, X = patsy.dmatrices("TNOs ~ 0 + OfficerDosage + AvTemp", data=combined, return_type="dataframe")

auto = pm.auto_arima(y,exogenous=X, seasonal=True,
                     suppress_warnings=True, error_action="ignore", max_p=6, m=2,
                     max_order=None)

auto.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,54.0
Model:,SARIMAX,Log Likelihood,-165.287
Date:,"Mon, 05 Jul 2021",AIC,338.574
Time:,17:47:40,BIC,346.53
Sample:,0,HQIC,341.643
,- 54,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,13.2138,1.516,8.714,0.000,10.242,16.186
OfficerDosage,-0.1805,0.219,-0.826,0.409,-0.609,0.248
AvTemp,-0.0732,0.100,-0.730,0.465,-0.270,0.123
sigma2,26.6787,5.129,5.202,0.000,16.626,36.731

0,1,2,3
Ljung-Box (L1) (Q):,0.37,Jarque-Bera (JB):,1.78
Prob(Q):,0.54,Prob(JB):,0.41
Heteroskedasticity (H):,0.5,Skew:,0.42
Prob(H) (two-sided):,0.15,Kurtosis:,3.31


In [37]:
y, X = patsy.dmatrices("TNOs ~ 0 + OfficerDosage + AvTemp", data=combined, return_type="dataframe")

auto = pm.auto_arima(y,exogenous=X, d=1, seasonal=True,
                     suppress_warnings=True, error_action="ignore", max_p=6,
                     max_order=None)

auto.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,54.0
Model:,"SARIMAX(0, 1, 1)",Log Likelihood,-164.443
Date:,"Mon, 05 Jul 2021",AIC,336.885
Time:,07:26:18,BIC,344.767
Sample:,0,HQIC,339.916
,- 54,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
OfficerDosage,-0.2443,0.241,-1.012,0.311,-0.717,0.229
AvTemp,-0.3367,0.194,-1.736,0.082,-0.717,0.043
ma.L1,-0.8632,0.094,-9.177,0.000,-1.048,-0.679
sigma2,28.2673,4.876,5.797,0.000,18.710,37.824

0,1,2,3
Ljung-Box (L1) (Q):,1.41,Jarque-Bera (JB):,3.03
Prob(Q):,0.23,Prob(JB):,0.22
Heteroskedasticity (H):,0.51,Skew:,0.58
Prob(H) (two-sided):,0.16,Kurtosis:,3.15


Apparently not...even removing Saturday, no seasonal component is added. What if we remove the binary, but keep the interaction?

In [39]:
y, X = patsy.dmatrices("TNOs ~ 0 + OfficerDosage:C(Saturday) + AvTemp", data=combined, return_type="dataframe")

auto = pm.auto_arima(y,exogenous=X, d=1, seasonal=True,
                     suppress_warnings=True, error_action="ignore", max_p=6,
                     max_order=None)

auto.summary()


0,1,2,3
Dep. Variable:,y,No. Observations:,54.0
Model:,"SARIMAX(4, 1, 0)",Log Likelihood,-162.33
Date:,"Mon, 05 Jul 2021",AIC,340.66
Time:,07:28:03,BIC,356.422
Sample:,0,HQIC,346.722
,- 54,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
OfficerDosage:C(Saturday)[False],-0.3295,0.214,-1.539,0.124,-0.749,0.090
OfficerDosage:C(Saturday)[True],0.2456,0.375,0.654,0.513,-0.490,0.981
AvTemp,-0.3539,0.238,-1.487,0.137,-0.820,0.113
ar.L1,-0.8200,0.160,-5.140,0.000,-1.133,-0.507
ar.L2,-0.6990,0.167,-4.177,0.000,-1.027,-0.371
ar.L3,-0.4683,0.166,-2.825,0.005,-0.793,-0.143
ar.L4,-0.3521,0.158,-2.234,0.025,-0.661,-0.043
sigma2,26.1648,5.757,4.545,0.000,14.881,37.449

0,1,2,3
Ljung-Box (L1) (Q):,0.04,Jarque-Bera (JB):,0.35
Prob(Q):,0.85,Prob(JB):,0.84
Heteroskedasticity (H):,0.56,Skew:,0.11
Prob(H) (two-sided):,0.23,Kurtosis:,2.66


Well, that's weird - it's added 4 auto regressive terms.  Maybe this is essentially monthly seasonality?  That said, AIC and BIC both worse.

I am not entirely convinced by auto-arima...it should be minimising AIC, but it's not.  Let's try adding our own seasonal component.

In [48]:
y, X = patsy.dmatrices("TNOs ~ 0 + C(Saturday) + OfficerDosage + AvTemp", data=combined, return_type="dataframe")

model = ARIMA(y, order=(0,1,1), exog=X.drop(columns=["C(Saturday)[False]"]), dates=dates,
              seasonal_order=(0, 1, 1, 2))
model_fit = model.fit()
model_fit.summary()





0,1,2,3
Dep. Variable:,TNOs,No. Observations:,54.0
Model:,"ARIMA(0, 1, 1)x(0, 1, 1, 2)",Log Likelihood,-157.205
Date:,"Mon, 05 Jul 2021",AIC,326.411
Time:,07:41:59,BIC,338.002
Sample:,0,HQIC,330.84
,- 54,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
C(Saturday)[True],0.0156,6.27e+04,2.49e-07,1.000,-1.23e+05,1.23e+05
OfficerDosage,0.0141,0.283,0.050,0.960,-0.540,0.568
AvTemp,-0.5998,0.215,-2.784,0.005,-1.022,-0.178
ma.L1,-0.9999,169.403,-0.006,0.995,-333.024,331.024
ma.S.L2,-0.9999,355.876,-0.003,0.998,-698.504,696.504
sigma2,20.8909,6529.705,0.003,0.997,-1.28e+04,1.28e+04

0,1,2,3
Ljung-Box (L1) (Q):,0.06,Jarque-Bera (JB):,0.2
Prob(Q):,0.81,Prob(JB):,0.91
Heteroskedasticity (H):,0.5,Skew:,-0.14
Prob(H) (two-sided):,0.17,Kurtosis:,3.09


By adding a seasonal component, while it isn't significant, our Saturday boolean is rendered totally useless.  Can we remove it, or play with interactions instead?


In [51]:
y, X = patsy.dmatrices("TNOs ~ 0 + OfficerDosage + AvTemp", data=combined, return_type="dataframe")

model = ARIMA(y, order=(0,1,1), exog=X, dates=dates,
              seasonal_order=(0, 1, 1, 2))
model_fit = model.fit()
model_fit.summary()



0,1,2,3
Dep. Variable:,TNOs,No. Observations:,54.0
Model:,"ARIMA(0, 1, 1)x(0, 1, 1, 2)",Log Likelihood,-157.205
Date:,"Mon, 05 Jul 2021",AIC,324.411
Time:,07:45:33,BIC,334.07
Sample:,0,HQIC,328.102
,- 54,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
OfficerDosage,0.0141,0.281,0.050,0.960,-0.537,0.566
AvTemp,-0.5998,0.208,-2.880,0.004,-1.008,-0.192
ma.L1,-0.9995,25.052,-0.040,0.968,-50.101,48.102
ma.S.L2,-0.9995,52.327,-0.019,0.985,-103.559,101.560
sigma2,20.9048,961.469,0.022,0.983,-1863.540,1905.349

0,1,2,3
Ljung-Box (L1) (Q):,0.06,Jarque-Bera (JB):,0.2
Prob(Q):,0.81,Prob(JB):,0.91
Heteroskedasticity (H):,0.5,Skew:,-0.14
Prob(H) (two-sided):,0.17,Kurtosis:,3.09


In [53]:
predictions = model_fit.predict(typ='levels')
mse = mean_squared_error(y, predictions)
print("Mean Squared Error:" + str(mse))
print("RMSE:" + str(np.sqrt(mse)))

Mean Squared Error:29.282604876935764
RMSE:5.411340395589226


In [56]:
y, X = patsy.dmatrices("TNOs ~ 0 + C(Saturday):OfficerDosage + AvTemp", data=combined, return_type="dataframe")

model = ARIMA(y, order=(0,1,1), exog=X, dates=dates,
              seasonal_order=(0, 1, 1, 2))
model_fit = model.fit()
model_fit.summary()



0,1,2,3
Dep. Variable:,TNOs,No. Observations:,54.0
Model:,"ARIMA(0, 1, 1)x(0, 1, 1, 2)",Log Likelihood,-156.816
Date:,"Mon, 05 Jul 2021",AIC,325.631
Time:,07:47:23,BIC,337.222
Sample:,0,HQIC,330.061
,- 54,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
C(Saturday)[False]:OfficerDosage,0.1168,0.330,0.354,0.723,-0.529,0.763
C(Saturday)[True]:OfficerDosage,-0.2380,0.397,-0.599,0.549,-1.016,0.540
AvTemp,-0.6009,0.223,-2.698,0.007,-1.037,-0.164
ma.L1,-0.9996,34.274,-0.029,0.977,-68.175,66.176
ma.S.L2,-0.9997,79.149,-0.013,0.990,-156.129,154.130
sigma2,20.5841,1512.706,0.014,0.989,-2944.264,2985.432

0,1,2,3
Ljung-Box (L1) (Q):,0.07,Jarque-Bera (JB):,0.18
Prob(Q):,0.79,Prob(JB):,0.92
Heteroskedasticity (H):,0.44,Skew:,-0.12
Prob(H) (two-sided):,0.1,Kurtosis:,3.15


In [57]:
predictions = model_fit.predict(typ='levels')
mse = mean_squared_error(y, predictions)
print("Mean Squared Error:" + str(mse))
print("RMSE:" + str(np.sqrt(mse)))

Mean Squared Error:29.471207735112987
RMSE:5.428739055721226


The above 2 models are interesting.  We have lost the Saturday variable, while improving the AIC, which suggests better use of our time series models...but strangely, our RMSE is higher.  Which is weird.

Let's now run an auto-arima on all the crime types, with and without Saturday

In [76]:
calc_auto(" ~ 0 + OfficerDosage + AvTemp + Precip")


Unnamed: 0,TNOs_params,TNOs_p,TNOs_fit,Drugs_params,Drugs_p,Drugs_fit,Robbery_params,Robbery_p,Robbery_fit,SexualOffences_params,SexualOffences_p,SexualOffences_fit,Theft_params,Theft_p,Theft_fit,VAP_params,VAP_p,VAP_fit
intercept,12.954242,0.0,,0.364662,0.275,,0.846144,0.0,,,,,8.043879,0.0,,2.498569,0.0,
OfficerDosage,-0.253629,0.285,,0.023081,0.309,,-0.016363,0.321,,0.024818,0.256,,-0.219813,0.176,,-0.013057,0.849,
AvTemp,-0.052634,0.621,,-0.002363,0.862,,-0.020247,0.0,,0.004526,0.665,,-0.098424,0.22,,0.02042,0.512,
Precip,0.254376,0.444,,-0.028384,0.453,,0.034619,0.206,,-0.022397,0.664,,0.232787,0.503,,0.088064,0.245,
sigma2,26.053307,0.0,,0.315718,0.0,,0.255829,0.001,,0.22012,0.0,,12.494416,0.0,,3.720168,0.0,
Order,,,"(0, 0, 0)",,,"(1, 0, 0)",,,"(0, 0, 3)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 1)"
SeasonOrder,,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)"
AIC,,,339.294395,,,103.050623,,,96.524446,,,79.510474,,,299.610751,,,236.266593
BIC,,,349.239315,,,114.984527,,,112.436319,,,87.46641,,,309.555672,,,248.200497
MSE,,,26.053895,,,0.315807,,,0.261347,,,0.220114,,,12.494456,,,3.729848


In [77]:
calc_auto(" ~ 0 + OfficerDosage + AvTemp")


Unnamed: 0,TNOs_params,TNOs_p,TNOs_fit,Drugs_params,Drugs_p,Drugs_fit,Robbery_params,Robbery_p,Robbery_fit,SexualOffences_params,SexualOffences_p,SexualOffences_fit,Theft_params,Theft_p,Theft_fit,VAP_params,VAP_p,VAP_fit
intercept,13.213758,0.0,,0.335855,0.272,,1.542766,0.0,,,,,8.280988,0.0,,2.588048,0.0,
OfficerDosage,-0.1805,0.409,,0.014133,0.56,,0.001832,0.87,,0.017207,0.422,,-0.152867,0.337,,0.008304,0.899,
AvTemp,-0.073228,0.465,,-0.000437,0.973,,-0.022598,0.003,,0.005327,0.611,,-0.117246,0.095,,0.014881,0.658,
sigma2,26.678727,0.0,,0.323952,0.0,,0.25743,0.0,,0.225036,0.0,,13.017937,0.0,,3.788312,0.0,
Order,,,"(0, 0, 0)",,,"(1, 0, 0)",,,"(3, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 1)"
SeasonOrder,,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)"
AIC,,,338.574234,,,102.43298,,,94.309836,,,78.702568,,,299.826318,,,235.215567
BIC,,,346.53017,,,112.377901,,,108.232725,,,84.669521,,,307.782255,,,245.160487
MSE,,,26.678767,,,0.324027,,,0.258182,,,0.225027,,,13.017753,,,3.794478
RMSE,,,5.165149265348742,,,0.5692339215991741,,,0.5081162047715103,,,0.4743705060998726,,,3.608012287330364,,,1.94794192487178


In [78]:
calc_auto(" ~ 0 + OfficerDosage*C(Saturday) + AvTemp + Precip")


Unnamed: 0,TNOs_params,TNOs_p,TNOs_fit,Drugs_params,Drugs_p,Drugs_fit,Robbery_params,Robbery_p,Robbery_fit,SexualOffences_params,SexualOffences_p,SexualOffences_fit,Theft_params,Theft_p,Theft_fit,VAP_params,VAP_p,VAP_fit
C(Saturday)[False],8.234467,0.003,,0.402554,0.354,,0.600152,0.008,,0.062996,0.936,,5.14937,0.01,,1.772405,0.005,
C(Saturday)[True],14.226888,0.0,,0.185626,0.557,,0.922537,0.0,,0.00144,0.998,,8.825351,0.0,,2.978031,0.0,
OfficerDosage,0.142235,0.656,,-0.005634,0.9,,0.015076,0.457,,0.01429,0.821,,0.02304,0.93,,0.039916,0.681,
OfficerDosage:C(Saturday)[T.True],-0.234031,0.6,,0.075953,0.216,,-0.031714,0.475,,0.066877,0.502,,-0.144034,0.683,,-0.038759,0.817,
AvTemp,-0.060754,0.55,,-0.001776,0.912,,-0.02011,0.0,,-0.002137,0.901,,-0.103395,0.155,,0.0148,0.695,
Precip,0.223453,0.501,,-0.017198,0.643,,0.019733,0.4,,-0.017078,0.77,,0.213745,0.421,,0.052935,0.612,
sigma2,21.729977,0.0,,0.315124,0.0,,0.246038,0.0,,0.193944,0.002,,10.869675,0.0,,3.035384,0.079,
Order,,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(2, 0, 1)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(1, 0, 2)"
SeasonOrder,,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)"
AIC,,,333.494892,,,104.885589,,,97.886543,,,78.67643,,,296.088276,,,235.660378


In [79]:
calc_auto(" ~ 0 + OfficerDosage:C(Saturday) + AvTemp + Precip")



Unnamed: 0,TNOs_params,TNOs_p,TNOs_fit,Drugs_params,Drugs_p,Drugs_fit,Robbery_params,Robbery_p,Robbery_fit,SexualOffences_params,SexualOffences_p,SexualOffences_fit,Theft_params,Theft_p,Theft_fit,VAP_params,VAP_p,VAP_fit
intercept,11.795635,0.0,,,,,1.854377,0.0,,,,,7.334017,0.0,,5.17284,0.008,
OfficerDosage:C(Saturday)[False],-0.250694,0.287,,0.026961,0.175,,-0.010041,0.392,,0.019465,0.452,,-0.218006,0.162,,-0.050021,0.347,
OfficerDosage:C(Saturday)[True],0.315676,0.285,,0.092621,0.0,,0.004329,0.877,,0.079115,0.013,,0.128908,0.541,,0.135187,0.223,
AvTemp,-0.0543,0.593,,0.005966,0.525,,-0.020682,0.0,,-0.001115,0.927,,-0.099435,0.198,,0.019292,0.637,
Precip,0.316361,0.328,,-0.017423,0.601,,0.028688,0.202,,-0.017586,0.752,,0.270755,0.409,,0.078233,0.353,
sigma2,23.901563,0.0,,0.306735,0.0,,0.238136,0.0,,0.194258,0.0,,11.686736,0.0,,2.825682,0.003,
Order,,,"(0, 0, 0)",,,"(1, 0, 0)",,,"(4, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(3, 0, 1)"
SeasonOrder,,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)",,,"(0, 0, 0, 0)"
AIC,,,336.638035,,,101.499338,,,96.944209,,,74.763929,,,298.002295,,,232.33408
BIC,,,348.571939,,,113.433242,,,116.834049,,,84.708849,,,309.9362,,,252.223921


It looks like, by and large, our ARIMA is NOT differencing - it's doing the KPSS test and not finding it.  Let's run it ourselves.





In [80]:
from statsmodels.tsa.stattools import kpss
def kpss_test(series, **kw):
    statistic, p_value, n_lags, critical_values = kpss(series, **kw)
    # Format Output
    print(f'KPSS Statistic: {statistic}')
    print(f'p-value: {p_value}')
    print(f'num lags: {n_lags}')
    print('Critial Values:')
    for key, value in critical_values.items():
        print(f'   {key} : {value}')
    print(f'Result: The series is {"not " if p_value < 0.05 else ""}stationary')


KPSS Statistic: 0.11801524349714164
p-value: 0.1
num lags: 11
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739
Result: The series is stationary


look-up table. The actual p-value is greater than the p-value returned.



In [85]:
for crime in crimes:
    print(crime)
    kpss_test(combined[crime])

TNOs
KPSS Statistic: 0.11801524349714164
p-value: 0.1
num lags: 11
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739
Result: The series is stationary
Drugs
KPSS Statistic: 0.1390808993304087
p-value: 0.1
num lags: 11
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739
Result: The series is stationary
Robbery
KPSS Statistic: 0.4066690904884283
p-value: 0.07428056444464297
num lags: 11
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739
Result: The series is stationary
SexualOffences
KPSS Statistic: 0.13086209565082815
p-value: 0.1
num lags: 11
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739
Result: The series is stationary
Theft
KPSS Statistic: 0.12687686487212524
p-value: 0.1
num lags: 11
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739
Result: The series is stationary
VAP
KPSS Statistic: 0.1913309982486865
p-value: 0.1
num lags: 11
Critial Values:
   10% : 0.347


look-up table. The actual p-value is greater than the p-value returned.

look-up table. The actual p-value is greater than the p-value returned.

look-up table. The actual p-value is greater than the p-value returned.

look-up table. The actual p-value is greater than the p-value returned.

look-up table. The actual p-value is greater than the p-value returned.



My conclusion from all this is that by removing weekdays from my dataset, we may have killed off seasonality, and thus made ARIMA largely pointless.

Let's start by just picking out what pdq orders auto arima picks out with and without an explicitly 2 period season, with no exogenous vars.


In [30]:
result_df = pd.DataFrame()
crimes = ['TNOs', 'Drugs', 'Robbery', 'SexualOffences', 'Theft', 'VAP']

#check fit for
for crime in crimes:
    auto = pm.auto_arima(combined[crime])
    predictions = auto.predict_in_sample()
    mse = mean_squared_error(combined[crime], predictions)
    result_df.loc["Order", crime+"_fit"] = str(auto.to_dict()['order'])
    result_df.loc["SeasonOrder", crime+"_fit"] = str(auto.to_dict()['seasonal_order'])
    result_df.loc["AIC", crime+"_fit"] = auto.aic()
    result_df.loc["BIC", crime+"_fit"] = auto.bic()
    result_df.loc["MSE", crime+"_fit"] = mse
    result_df.loc["RMSE", crime+"_fit"] = str(np.sqrt(mse))

result_df

Unnamed: 0,TNOs_fit,Drugs_fit,Robbery_fit,SexualOffences_fit,Theft_fit,VAP_fit
Order,"(0, 0, 0)","(0, 0, 0)","(0, 0, 0)","(0, 0, 0)","(0, 0, 0)","(0, 0, 1)"
SeasonOrder,"(0, 0, 0, 0)","(0, 0, 0, 0)","(0, 0, 0, 0)","(0, 0, 0, 0)","(0, 0, 0, 0)","(0, 0, 0, 0)"
AIC,336.004951,98.856578,101.415794,76.687767,299.420091,231.416577
BIC,339.982919,102.834547,105.393762,80.665735,303.398059,237.383529
MSE,27.395062,0.339163,0.355624,0.224966,13.91358,3.809016
RMSE,5.234029205917278,0.5823772293894959,0.5963423032631339,0.47430549906909997,3.730091184798782,1.9516700588748692


In [29]:
result_df = pd.DataFrame()
crimes = ['TNOs', 'Drugs', 'Robbery', 'SexualOffences', 'Theft', 'VAP']

#check fit for
for crime in crimes:
    auto = pm.auto_arima(combined[crime], m=2)
    predictions = auto.predict_in_sample()
    mse = mean_squared_error(combined[crime], predictions)
    result_df.loc["Order", crime+"_fit"] = str(auto.to_dict()['order'])
    result_df.loc["SeasonOrder", crime+"_fit"] = str(auto.to_dict()['seasonal_order'])
    result_df.loc["AIC", crime+"_fit"] = auto.aic()
    result_df.loc["BIC", crime+"_fit"] = auto.bic()
    result_df.loc["MSE", crime+"_fit"] = mse
    result_df.loc["RMSE", crime+"_fit"] = str(np.sqrt(mse))

result_df

Unnamed: 0,TNOs_fit,Drugs_fit,Robbery_fit,SexualOffences_fit,Theft_fit,VAP_fit
Order,"(1, 0, 1)","(0, 0, 0)","(0, 0, 0)","(0, 0, 0)","(0, 0, 0)","(0, 0, 1)"
SeasonOrder,"(1, 0, 1, 2)","(0, 0, 0, 2)","(1, 0, 0, 2)","(0, 0, 0, 2)","(0, 0, 0, 2)","(0, 0, 0, 2)"
AIC,333.86079,98.856578,99.044457,76.687767,299.420091,231.416577
BIC,345.794694,102.834547,105.011409,80.665735,303.398059,237.383529
MSE,23.161353,0.339163,0.327721,0.224966,13.91358,3.809016
RMSE,4.812624365610694,0.5823772293894959,0.5724689727002016,0.47430549906909997,3.730091184798782,1.9516700588748692


In [21]:
calc_weekend(" ~ 0 + OfficerDosage + AvTemp")

Unnamed: 0,TNOs_params,TNOs_p,TNOs_fit,Drugs_params,Drugs_p,Drugs_fit,Robbery_params,Robbery_p,Robbery_fit,SexualOffences_params,SexualOffences_p,SexualOffences_fit,Theft_params,Theft_p,Theft_fit,VAP_params,VAP_p,VAP_fit
intercept,13.213758,0.0,,0.335855,0.272,,1.043601,0.0,,,,,8.280988,0.0,,2.588048,0.0,
OfficerDosage,-0.1805,0.409,,0.014133,0.56,,0.009423,0.439,,0.017207,0.422,,-0.152867,0.337,,0.008304,0.899,
AvTemp,-0.073228,0.465,,-0.000437,0.973,,-0.020896,0.049,,0.005327,0.611,,-0.117246,0.095,,0.014881,0.658,
sigma2,26.678727,0.0,,0.323952,0.0,,0.297168,0.0,,0.225036,0.0,,13.017937,0.0,,3.788312,0.0,
Order,,,"(0, 0, 0)",,,"(1, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 1)"
SeasonOrder,,,"(0, 0, 0, 2)",,,"(0, 0, 0, 2)",,,"(1, 0, 0, 2)",,,"(0, 0, 0, 2)",,,"(0, 0, 0, 2)",,,"(0, 0, 0, 2)"
AIC,,,338.574234,,,102.43298,,,97.944586,,,78.702568,,,299.826318,,,235.215567
BIC,,,346.53017,,,112.377901,,,107.889506,,,84.669521,,,307.782255,,,245.160487
MSE,,,26.678767,,,0.324027,,,0.298043,,,0.225027,,,13.017753,,,3.794478
RMSE,,,5.165149265348742,,,0.5692339215991741,,,0.5459330175360142,,,0.4743705060998726,,,3.608012287330364,,,1.94794192487178


In [22]:

calc_weekend(" ~ 0 + OfficerDosage*C(Saturday) + AvTemp")

Unnamed: 0,TNOs_params,TNOs_p,TNOs_fit,Drugs_params,Drugs_p,Drugs_fit,Robbery_params,Robbery_p,Robbery_fit,SexualOffences_params,SexualOffences_p,SexualOffences_fit,Theft_params,Theft_p,Theft_fit,VAP_params,VAP_p,VAP_fit
C(Saturday)[False],8.207011,0.002,,0.404674,0.354,,0.360685,0.08,,0.065095,0.939,,5.123133,0.012,,1.4654,0.104,
C(Saturday)[True],14.723606,0.0,,0.147402,0.617,,0.940085,0.0,,-0.036522,0.958,,9.300552,0.0,,3.068561,0.0,
OfficerDosage,0.237345,0.395,,-0.01295,0.772,,0.049905,0.009,,0.007024,0.926,,0.113969,0.646,,0.114312,0.252,
OfficerDosage:C(Saturday)[T.True],-0.337322,0.415,,0.083905,0.167,,-0.067177,0.105,,0.07478,0.501,,-0.242915,0.474,,-0.070219,0.71,
AvTemp,-0.078379,0.414,,-0.000422,0.979,,-0.019327,0.017,,-0.000792,0.961,,-0.12023,0.074,,0.007962,0.595,
sigma2,22.186747,0.0,,0.317828,0.0,,0.265532,0.0,,0.196621,0.002,,11.287645,0.0,,2.556262,0.155,
Order,,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 1)"
SeasonOrder,,,"(0, 0, 0, 2)",,,"(0, 0, 0, 2)",,,"(0, 0, 1, 2)",,,"(0, 0, 0, 2)",,,"(0, 0, 0, 2)",,,"(2, 0, 1, 2)"
AIC,,,332.618017,,,103.347153,,,96.019685,,,77.414099,,,296.125719,,,229.705965
BIC,,,344.551921,,,115.281057,,,109.942573,,,89.348003,,,308.059624,,,249.595805


In [23]:

calc_weekend(" ~ 0 + OfficerDosage:C(Saturday) + AvTemp")

Unnamed: 0,TNOs_params,TNOs_p,TNOs_fit,Drugs_params,Drugs_p,Drugs_fit,Robbery_params,Robbery_p,Robbery_fit,SexualOffences_params,SexualOffences_p,SexualOffences_fit,Theft_params,Theft_p,Theft_fit,VAP_params,VAP_p,VAP_fit
intercept,12.212777,0.0,,,,,0.737061,0.0,,,,,7.691164,0.0,,2.463179,0.002,
OfficerDosage:C(Saturday)[False],-0.161537,0.442,,0.02026,0.331,,0.009666,0.456,,0.013372,0.579,,-0.141701,0.363,,-0.003929,0.953,
OfficerDosage:C(Saturday)[True],0.355811,0.234,,0.089201,0.001,,0.030864,0.342,,0.074919,0.012,,0.163298,0.457,,0.110234,0.441,
AvTemp,-0.079305,0.402,,0.006427,0.492,,-0.020535,0.04,,-0.00067,0.955,,-0.120851,0.077,,0.012618,0.761,
sigma2,24.851874,0.0,,0.309859,0.0,,0.293121,0.0,,0.197269,0.0,,12.382717,0.0,,3.867789,0.0,
Order,,,"(0, 0, 0)",,,"(1, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 0)"
SeasonOrder,,,"(0, 0, 0, 2)",,,"(0, 0, 0, 2)",,,"(0, 0, 1, 2)",,,"(0, 0, 0, 2)",,,"(0, 0, 0, 2)",,,"(0, 0, 0, 2)"
AIC,,,336.743046,,,100.040975,,,99.215808,,,73.591975,,,299.12577,,,236.290722
BIC,,,346.687967,,,109.985895,,,111.149712,,,81.547911,,,309.07069,,,246.235642
MSE,,,24.851549,,,0.30986,,,0.294211,,,0.197264,,,12.382744,,,3.867823


In [27]:
calc_weekend(" ~ 0 + OfficerDosage")


Unnamed: 0,TNOs_params,TNOs_p,TNOs_fit,Drugs_params,Drugs_p,Drugs_fit,Robbery_params,Robbery_p,Robbery_fit,SexualOffences_params,SexualOffences_p,SexualOffences_fit,Theft_params,Theft_p,Theft_fit,VAP_params,VAP_p,VAP_fit
intercept,9.570915,0.055,,0.328827,0.134,,0.654896,0.0,,,,,6.756313,0.0,,2.78539,0.0,
OfficerDosage,0.089604,0.675,,0.01415,0.559,,0.011112,0.394,,0.025069,0.06,,-0.148236,0.382,,0.006996,0.914,
ar.L1,-0.999125,0.0,,-0.22795,0.3,,,,,,,,,,,,,
ma.L1,0.918743,0.0,,,,,,,,,,,,,,-0.212799,0.119,
ar.S.L2,0.556259,0.011,,,,,-0.283339,0.054,,,,,,,,,,
ma.S.L2,-0.906353,0.0,,,,,,,,,,,,,,,,
sigma2,21.022877,0.0,,0.323957,0.0,,0.324497,0.0,,0.22769,0.0,,13.562442,0.0,,3.801681,0.0,
Order,,,"(1, 0, 1)",,,"(1, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 1)"
SeasonOrder,,,"(1, 0, 1, 2)",,,"(0, 0, 0, 2)",,,"(1, 0, 0, 2)",,,"(0, 0, 0, 2)",,,"(0, 0, 0, 2)",,,"(0, 0, 0, 2)"
AIC,,,335.000117,,,100.434584,,,100.637781,,,77.336563,,,300.040195,,,233.405687


In [28]:
calc_weekend(" ~ 0 + AvTemp")




Unnamed: 0,TNOs_params,TNOs_p,TNOs_fit,Drugs_params,Drugs_p,Drugs_fit,Robbery_params,Robbery_p,Robbery_fit,SexualOffences_params,SexualOffences_p,SexualOffences_fit,Theft_params,Theft_p,Theft_fit,VAP_params,VAP_p,VAP_fit
intercept,0.181951,0.901,,0.349602,0.171,,0.834137,0.0,,,,,7.355405,0.0,,2.638663,0.0,
AvTemp,-0.092542,0.445,,0.000176,0.991,,-0.020587,0.033,,0.011615,0.105,,-0.114822,0.099,,0.014723,0.662,
ar.S.L2,0.985425,0.0,,,,,,,,,,,,,,,,
ma.S.L2,-0.933573,0.002,,,,,-0.323665,0.04,,,,,,,,,,
sigma2,24.725164,0.0,,0.339156,0.0,,0.298378,0.0,,0.23188,0.0,,13.390834,0.0,,3.789326,0.0,
Order,,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 0)",,,"(0, 0, 1)"
SeasonOrder,,,"(1, 0, 1, 2)",,,"(0, 0, 0, 2)",,,"(0, 0, 1, 2)",,,"(0, 0, 0, 2)",,,"(0, 0, 0, 2)",,,"(0, 0, 0, 2)"
AIC,,,337.505879,,,100.85633,,,96.157824,,,78.321617,,,299.352402,,,233.230861
BIC,,,347.450799,,,106.823282,,,104.11376,,,82.299585,,,305.319354,,,241.186797
MSE,,,25.497353,,,0.339162,,,0.299441,,,0.231876,,,13.390893,,,3.795726
