# Smoke Alarm install model
### this notebook represents the current smoke alarm install model

In [132]:
import pandas as pd
import os
import sys
import missingno as msno
from pathlib import Path
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np

In [133]:
 path = Path.cwd().parent.parent


In [134]:
# to allow for all variables to be displayed in jupyter
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [135]:
def StandardizeColumnNames(df):
    """
    Standardizes column names
    """
    df.columns = map(str.lower, df.columns)
    df.columns = df.columns.str.replace(', ', '_')
    df.columns = df.columns.str.replace('-', '_')
    df.columns = df.columns.str.replace('/', '_')
    df.columns = df.columns.str.replace('(', '_')
    df.columns = df.columns.str.replace(')', '_')
    df.columns = df.columns.str.replace(' ', '_')
    #print(df.columns)
    return df

### Data

In [136]:
input_loc =  path /'Data'/ 'Master Project Data'
output_loc = path /'Data'/ 'processed'

In [137]:
arc_path = input_loc / 'ARC Preparedness Data.csv'
arc = pd.read_csv(arc_path, 
                  dtype = {'GEOID': str, 'Zip': str})
arc = StandardizeColumnNames(arc)
arc.dropna(inplace = True)
# trim geoid leading saftey marks 
arc['geoid'] = arc['geoid'].str[2:]
arc.head()

Unnamed: 0,geoid,census_block_group_y,census_block_group_x,city,state,zip,county,in_home_visit_date,smoke_alarms_installed__9_volt_10_year_dhh_,10_year_and_9_volt_alarms_installed,dhh_alarms_installed,pre_existing_alarms,pre_existing_alarms_tested_and_working,batteries_replaced,fire_escape_plans_made,fire_safety_checklists_completed,additional_hazard_education_conducted,additional_hazard_type,people_served,youth_served,seniors_served,veterans_military_members_and_military_family_members_served,individuals_with_disabilities_access_or_functional_needs_served
12,10010205002,32.470418,-86.424166,PRATTVILLE,AL,36066,Autauga,9/9/2016,1,1,0,0.0,0.0,0,1,1,0,,2,0,0,0,0
18,10010208012,32.455173,-86.534591,PRATTVILLE,AL,36067,Autauga,9/30/2019,2,2,0,0.0,0.0,0,1,1,1,Tornadoes,5,0,0,1,0
23,10010208021,32.524822,-86.573009,PRATTVILLE,AL,36067,Autauga,5/4/2019,3,2,1,0.0,0.0,0,1,1,1,Other,1,0,0,0,0
24,10010208021,32.524822,-86.573009,PRATTVILLE,AL,36067,Autauga,9/27/2019,1,1,0,0.0,0.0,0,1,1,1,Tornadoes,1,0,0,0,0
27,10010208023,32.543983,-86.4921,PRATTVILLE,AL,36067,Autauga,12/2/2017,3,3,0,0.0,0.0,0,1,1,0,,1,0,0,0,0


## EDA  

- remove all houses that don't have a previous smoke detector record 
- Determine the median number of house visist
- Visualize visit distribution 
- use ACS data to determine % of blocks visited 
- determine % blocks visited with >15 visits 

In [138]:
#block level
counts = arc['geoid'].value_counts()
counts_median = counts.median()
counts.describe()


count    87725.000000
mean         6.132619
std         15.250533
min          1.000000
25%          1.000000
50%          2.000000
75%          6.000000
max       1863.000000
Name: geoid, dtype: float64

## EDA- Geograpic Level
 repeat block Level analysis at various levels

In [139]:
# county
county_counts =  arc['geoid'].str[:7].value_counts()
print('County Level')
print(county_counts.describe())
# state 
state_counts =  arc['geoid'].str[:2].value_counts()
print('\n State Level')
print(state_counts.describe())

County Level
count    3605.000000
mean      149.232732
std       331.121218
min         1.000000
25%        10.000000
50%        47.000000
75%       148.000000
max      6579.000000
Name: geoid, dtype: float64

 State Level
count       52.000000
mean     10345.846154
std      12099.256714
min          3.000000
25%       2130.500000
50%       7135.500000
75%      12123.000000
max      51826.000000
Name: geoid, dtype: float64


In [140]:
print(arc['pre_existing_alarms'].describe())
print(arc['pre_existing_alarms_tested_and_working'].describe())

count    537984.000000
mean          1.507301
std           1.552540
min           0.000000
25%           0.000000
50%           1.000000
75%           2.000000
max           9.000000
Name: pre_existing_alarms, dtype: float64
count    537984.000000
mean          0.829638
std           1.336587
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           9.000000
Name: pre_existing_alarms_tested_and_working, dtype: float64


## Confidence Interval Motivation 

A commonly used formula for a binomial confidence interval relies on approximating the distribution of error about a binomially-distributed observation, ${\displaystyle {\hat {p}}}$, with a normal distribution. This approximation is based on the central limit theorem and is unreliable when the sample size is small or the success probability is close to 0 or 1.

Using the normal approximation, the success probability p is estimated as

${\displaystyle {\hat {p}}\pm z{\sqrt {\frac {{\hat {p}}\left(1-{\hat {p}}\right)}{n}}},}$

Source https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval

In [141]:
def CreateConfidenceIntervals(num_surveys,percentage):
# this function takes the cleaned data and adds a confidence interval 

    z =	1.960 # corresponds to 95% confidence interval
    
    CI =  z * np.sqrt(
                     (percentage * (100 - percentage) ) / 
                      num_surveys  )

    return CI


### Feature Engineering
 Create Binary variables out of the smoke_alarms_present and smoke_alarms_tested_and_working variables 
 
 We'll then create a new dataset with the aggregated number and percantage of working smoke detectors in each census geography

In [142]:
def CreateDetectorDataFrame(df,geo_level):
# This function takes the arc data  into a dataset containing the percentage 
# and number of smoke detectors by census geography
#
# Inputs 
# arc-  the arc dataset
#
# geo_level- String var indcating what census geography to aggregate on. current levels are:
# State,County,Block,State
#
# The resultant dataset will have the following values:
#
#   num_surveys - total number of surveys conducted
#
#   detectors_found -   houses with at least one smoke detector in the home
#
#   detectors_workding - houses with at least one tested and working smoke detector in the home
#
#   Note: for variables the suffixes 
#       _total- indicates raw counts 
#        _prc  - indicates percentage: (_total / num_surveys * 100)
#
   
    # dict with relevant length of GEOID for tract geography
    geo_level_dict = {'State':2,'County':5,'Tract':11,'Block':12}
    
    df['geoid'] = df['geoid'].str[: geo_level_dict[geo_level]]
    
    ## binarize pre_existing_alarms and _tested_and_working
    #  values will now be: 0 if no detectors present and 1 if any number were present
    df['pre_existing_alarms'].where(df['pre_existing_alarms'] < 1, other = 1, inplace = True) 
    df['pre_existing_alarms_tested_and_working'].where(
                                                        df['pre_existing_alarms_tested_and_working'] < 1,
                                                            other = 1, 
                                                            inplace = True)

    ## create detectors dataset
    # This happens by grouping data both on pre_existing alarms and then _tested_and working alarms 
    # and then merging the two into the final dataset

    detectors =  df.groupby('geoid')['pre_existing_alarms'].agg({np.size ,
                                                                  np.sum,
                                                                  lambda x: np.sum(x)/np.size(x)* 100 })

    detectors.rename({'size':'num_surveys','sum':'detectors_found_total','<lambda_0>':'detectors_found_prc'},
                     axis =1,
                     inplace = True)

    detectors['detectors_found_prc'] = detectors['detectors_found_prc'].round(2)
    
  
    
    d2 =  df.groupby('geoid')['pre_existing_alarms_tested_and_working'].agg({np.sum, 
                                                                              lambda x: np.sum(x)/np.size(x)* 100 })
    d2.columns = ['detectors_working_total','detectors_working_prc']
    
    d2['detectors_working_prc'] = d2['detectors_working_prc'].round(2)
    

    detectors = detectors.merge(d2,how = 'left', on ='geoid')

    detectors['detectors_found_CI'] = CreateConfidenceIntervals(detectors['num_surveys'].values,
                                                                detectors['detectors_found_prc'].values )
                                                                
    detectors['detectors_working_CI'] = CreateConfidenceIntervals(detectors['num_surveys'].values,
                                                                detectors['detectors_working_prc'].values )  
    
    
    
    
    
    # rearrange columns 
    column_order = ['num_surveys',	
                    'detectors_found_total',
                    'detectors_found_prc', 
                    'detectors_found_CI',
                    'detectors_working_total',
                    'detectors_working_prc',
                    'detectors_working_CI']
    
    detectors = detectors[column_order]


    return detectors

In [145]:
arc_state = CreateDetectorDataFrame(arc.copy(),'State')
arc_state.describe()

Unnamed: 0,num_surveys,detectors_found_total,detectors_found_prc,detectors_found_CI,detectors_working_total,detectors_working_prc,detectors_working_CI
count,52.0,52.0,52.0,52.0,52.0,52.0,52.0
mean,10345.846154,6921.692308,67.455577,2.386592,4133.326923,39.017692,1.454156
std,12099.256714,8422.896543,10.651345,7.258363,5520.523777,11.267276,0.963322
min,3.0,2.0,38.53,0.384964,0.0,0.0,0.0
25%,2130.5,1649.0,59.42,0.818981,940.0,31.7925,0.80342
50%,7135.5,4122.0,67.025,1.128645,2272.0,38.555,1.096192
75%,12123.0,7902.75,77.3075,1.679617,4533.5,47.905,2.008385
max,51826.0,34691.0,85.57,53.34311,26754.0,66.24,4.536868


In [146]:
arc_county = CreateDetectorDataFrame(arc.copy(),'County')
arc_county.describe()

Unnamed: 0,num_surveys,detectors_found_total,detectors_found_prc,detectors_found_CI,detectors_working_total,detectors_working_prc,detectors_working_CI
count,2655.0,2655.0,2655.0,2655.0,2655.0,2655.0,2655.0
mean,202.630508,135.566102,63.680746,12.448032,80.954049,36.63232,12.752196
std,587.39427,408.184611,23.90261,12.592041,273.065209,22.784949,13.077502
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,13.0,7.0,50.0,4.359599,3.0,21.58,4.691414
50%,56.0,33.0,66.84,8.873665,17.0,34.86,8.868717
75%,167.0,110.0,80.0,16.287997,61.0,50.0,15.825149
max,12981.0,8841.0,100.0,69.296465,7257.0,100.0,69.296465


In [147]:
arc_tract = CreateDetectorDataFrame(arc.copy(),'Tract')
arc_tract.describe()

Unnamed: 0,num_surveys,detectors_found_total,detectors_found_prc,detectors_found_CI,detectors_working_total,detectors_working_prc,detectors_working_CI
count,43097.0,43097.0,43097.0,43097.0,43097.0,43097.0,43097.0
mean,12.483096,8.351579,67.079811,17.525276,4.987192,38.56296,19.247532
std,33.851003,25.877164,33.067725,20.152982,19.321604,33.653914,20.693728
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,1.0,50.0,0.0,0.0,0.0,0.0
50%,4.0,3.0,75.0,12.163118,1.0,33.33,14.779572
75%,13.0,8.0,100.0,29.822836,4.0,60.0,33.465899
max,4298.0,3394.0,100.0,69.296465,2765.0,100.0,69.296465


In [148]:
arc_block = CreateDetectorDataFrame(arc.copy(),'Block')
arc_block.describe()

Unnamed: 0,num_surveys,detectors_found_total,detectors_found_prc,detectors_found_CI,detectors_working_total,detectors_working_prc,detectors_working_CI
count,87725.0,87725.0,87725.0,87725.0,87725.0,87725.0,87725.0
mean,6.132619,4.102913,66.982308,15.831853,2.450077,38.402308,17.390355
std,15.250533,11.647201,36.939341,22.119806,8.602772,37.814582,22.795874
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,50.0,0.0,0.0,0.0,0.0
50%,2.0,1.0,80.0,0.0,1.0,33.33,0.0
75%,6.0,4.0,100.0,30.006249,2.0,66.67,33.548007
max,1863.0,1546.0,100.0,69.296465,1325.0,100.0,69.296465


In [149]:
arc

Unnamed: 0,geoid,census_block_group_y,census_block_group_x,city,state,zip,county,in_home_visit_date,smoke_alarms_installed__9_volt_10_year_dhh_,10_year_and_9_volt_alarms_installed,dhh_alarms_installed,pre_existing_alarms,pre_existing_alarms_tested_and_working,batteries_replaced,fire_escape_plans_made,fire_safety_checklists_completed,additional_hazard_education_conducted,additional_hazard_type,people_served,youth_served,seniors_served,veterans_military_members_and_military_family_members_served,individuals_with_disabilities_access_or_functional_needs_served
12,010010205002,32.470418,-86.424166,PRATTVILLE,AL,36066,Autauga,9/9/2016,1,1,0,0.0,0.0,0,1,1,0,,2,0,0,0,0
18,010010208012,32.455173,-86.534591,PRATTVILLE,AL,36067,Autauga,9/30/2019,2,2,0,0.0,0.0,0,1,1,1,Tornadoes,5,0,0,1,0
23,010010208021,32.524822,-86.573009,PRATTVILLE,AL,36067,Autauga,5/4/2019,3,2,1,0.0,0.0,0,1,1,1,Other,1,0,0,0,0
24,010010208021,32.524822,-86.573009,PRATTVILLE,AL,36067,Autauga,9/27/2019,1,1,0,0.0,0.0,0,1,1,1,Tornadoes,1,0,0,0,0
27,010010208023,32.543983,-86.492100,PRATTVILLE,AL,36067,Autauga,12/2/2017,3,3,0,0.0,0.0,0,1,1,0,,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862041,560459513003,43.843551,-104.260072,NEWCASTLE,WY,82701,Weston,3/4/2017,2,2,0,5.0,3.0,0,1,1,1,Wildfires,1,0,0,0,0
862042,560459513003,43.843551,-104.260072,NEWCASTLE,WY,82701,Weston,3/4/2017,3,3,0,0.0,0.0,0,1,1,1,Wildfires,4,2,0,0,0
862043,560459513003,43.843551,-104.260072,NEWCASTLE,WY,82701,Weston,3/4/2017,3,3,0,5.0,0.0,0,1,1,1,Wildfires,2,0,1,0,1
862044,560459513003,43.843551,-104.260072,NEWCASTLE,WY,82701,Weston,3/4/2017,4,4,0,1.0,0.0,0,1,1,1,Wildfires,3,0,0,0,0
