In [5]:
import json
import psycopg2
import pandas as pd

#this is a .json with my credentials, see also update-department-boundaries...
#for a more convenient approach using your pg_conf file
#with open('../config.json') as f:
#    conf = json.load(f)


In [6]:
#conn_str = "host={host} dbname={database} user={user} password={passw} port={port}".format(**conf)

In [7]:
# Open the database connection through the connection-service entry named
# "firecares-austin", so no host or credentials are written into the notebook.
# NOTE(review): presumably resolved from the local pg_service.conf -- confirm.
# NOTE(review): no visible cell closes this connection.
conn = psycopg2.connect('service=firecares-austin')

In [8]:
# Per-department comparison of observed vs. model-predicted residential structure fires.
# Output columns:
#   department_id   - id shared by the risk-model and fire-department tables
#   yravg_fires     - mean yearly count from firestation_nfirsstatistic
#                     (metric 'residential_structure_fires', year >= 2010, level 0,
#                     non-null counts only)
#   predicted_fires - risk_model_fires from firestation_firedepartmentriskmodels (level 0)
#   residuals       - observed average minus predicted
#   normedresiduals - |observed - predicted| / observed
#   name            - fire department name
# nullif() makes a zero observed average yield a NULL normalized residual instead of a
# division-by-zero error that would abort the whole query. "nulls last" keeps any such
# rows after the valid ones, so the ascending order of valid rows is unchanged.
bigquery = """
select
    r.department_id,
    round(s.yravg, 2) as yravg_fires,
    round(r.risk_model_fires::numeric, 2) as predicted_fires,
    round((s.yravg - r.risk_model_fires)::numeric, 2) as residuals,
    round((abs(s.yravg - r.risk_model_fires) / nullif(s.yravg, 0))::numeric, 2) as normedresiduals,
    fd.name
from firestation_firedepartmentriskmodels r
    inner join (
        select fire_department_id, avg(count) as yravg
        from firestation_nfirsstatistic
        where metric = 'residential_structure_fires'
          and count is not null
          and year >= 2010
          and level = 0
        group by fire_department_id
    ) s on r.department_id = s.fire_department_id
    inner join firestation_firedepartment fd on r.department_id = fd.id
where r.level = 0
  and r.risk_model_fires is not null
order by normedresiduals asc nulls last
"""

In [9]:
# Run the residuals query over the open connection and load the rows into a DataFrame.
df = pd.read_sql(bigquery,con=conn)

In [10]:
# Preview the first rows; the query orders them by normedresiduals ascending,
# so these are the best-fitting departments.
df.head()

Unnamed: 0,department_id,yravg_fires,predicted_fires,residuals,normedresiduals,name
0,73695,83.33,83.06,0.27,0.0,Anniston Fire Department
1,85843,52.43,52.21,0.22,0.0,Klamath County Fire District 1
2,77786,108.71,108.23,0.48,0.0,City of Murfreesboro Fire & Rescue Department ...
3,94133,293.57,292.11,1.46,0.0,Saint Petersburg Fire & Rescue Station
4,77489,554.71,555.84,-1.12,0.0,Cincinnati Fire Department


In [7]:
# Summary statistics of the raw residuals (observed yearly average minus predicted fires).
# NOTE(review): the note originally on this cell said that 90% of fire departments
# reporting more than 100 incidents have predicted values within about 600 incidents of
# the true value. This cell neither filters on incident count nor computes a 90%
# interval, so confirm that figure separately.
df["residuals"].describe()

count     2578.000000
mean       -37.936955
std        604.690780
min     -29973.210000
25%        -22.060000
50%         -9.265000
75%         -0.142500
max       1623.010000
Name: residuals, dtype: float64

In [9]:
# Summary statistics of the normalized residuals (|observed - predicted| / observed).
df["normedresiduals"].describe()

count    2578.000000
mean        1.175710
std         7.670356
min         0.000000
25%         0.320000
50%         0.730000
75%         1.060000
max       371.040000
Name: normedresiduals, dtype: float64

In [12]:
# Departments whose residual lies strictly between -1000 and 1000 and whose observed
# yearly average is above 10 fires.
df.loc[df["residuals"].abs().lt(1000) & df["yravg_fires"].gt(10)].head()

Unnamed: 0,department_id,yravg_fires,predicted_fires,residuals,normedresiduals,name
0,73695,83.33,83.06,0.27,0.0,Anniston Fire Department
1,85843,52.43,52.21,0.22,0.0,Klamath County Fire District 1
2,77786,108.71,108.23,0.48,0.0,City of Murfreesboro Fire & Rescue Department ...
3,94133,293.57,292.11,1.46,0.0,Saint Petersburg Fire & Rescue Station
4,77489,554.71,555.84,-1.12,0.0,Cincinnati Fire Department


In [14]:
# Departments whose prediction is off by less than half of the observed yearly average.
df.loc[df["normedresiduals"].abs().lt(0.5)].head()

Unnamed: 0,department_id,yravg_fires,predicted_fires,residuals,normedresiduals,name
0,73695,83.33,83.06,0.27,0.0,Anniston Fire Department
1,85843,52.43,52.21,0.22,0.0,Klamath County Fire District 1
2,77786,108.71,108.23,0.48,0.0,City of Murfreesboro Fire & Rescue Department ...
3,94133,293.57,292.11,1.46,0.0,Saint Petersburg Fire & Rescue Station
4,77489,554.71,555.84,-1.12,0.0,Cincinnati Fire Department


In [51]:
# Empirical cumulative proportion of departments at or below each normalized residual.
# The column is ranked directly instead of reading each row's index label through a
# row-wise apply, so the result no longer depends on df keeping its original index and
# sort order, and an empty frame is handled. Proportions run from 1/n up to 1 (the
# previous form ran from 0 to (n-1)/n). Ties are ranked in row order; rows with a
# missing normalized residual keep NaN and are excluded from the denominator.
cumulator = df["normedresiduals"].rank(method="first") / df["normedresiduals"].count()
df = df.assign(cumulativefreq=cumulator)

In [33]:
import holoviews as hv
# Load the matplotlib plotting backend for HoloViews; the plots below use
# matplotlib-specific options such as fig_size.
hv.extension('matplotlib')

In [46]:
import numpy as np

# Histogram of normalized residuals up to 3 (the long tail beyond that is dropped),
# binned over [0, 3] with np.histogram's default number of bins.
out = hv.Histogram(
    np.histogram(
        df.loc[df["normedresiduals"] <= 3, "normedresiduals"],
        range=(0, 3),
    )
)

In [47]:
# Title, axis labels and font sizes for the residual histogram.
histogram_fonts = {'title': 16, 'xlabel': 16, 'ylabel': 16, 'xticks': 14, 'yticks': 14}
out = out.opts(
    title='Fire Departments by Model Residual Error',
    xlabel='Normalized Residual Error',
    ylabel='# of FDs',
    fontsize=histogram_fonts,
    fig_size=200,
)

In [48]:
# Write the styled histogram to a PNG; the path is relative to the working directory.
hv.save(out,'FDresidualhistogram.png')

In [54]:
# Two-column table for the cumulative curve: the normalized residual and, declared as
# the value dimension, the cumulative proportion of departments.
cdf_columns = ['normedresiduals', 'cumulativefreq']
residtable = hv.Table(df[cdf_columns], vdims='cumulativefreq')

In [61]:
# Cumulative distribution of normalized residuals; the x-axis is clipped to [0, 5] to
# keep the long tail from flattening the curve.
cdf_options = dict(
    title='Cumulative Normalized Model Residuals',
    xlabel='Normalized Model Residuals',
    ylabel='Cumulative Proportion of FDs',
    xlim=(0, 5),
    fig_size=125,
)
hv.Curve(residtable).opts(**cdf_options)