In [1]:
# Import and configuration
from functools import reduce
import numpy as np
import os
import pandas as pd

from config import *

In [4]:
# Read the cleaned incidents CSV.
# NOTE(review): incidents_clean_data_path is presumably supplied by the `config` star-import - confirm.
raw_data = pd.read_csv(incidents_clean_data_path)
# Remember the original size so we can report how many rows get discarded below.
total_count = raw_data.shape[0]
# Incidents without any characteristics are of no use for this analysis:
# drop them and renumber the remaining rows from zero.
raw_data = raw_data.dropna(subset=['Incident Info']).reset_index(drop=True)
print(f'There were {total_count - len(raw_data.index)} out of {total_count} incidents with no characteristics')
raw_data.head()

There were 326 out of 239677 incidents with no characteristics


Unnamed: 0,Year,State Name,Number of Killed,Number of Injured,Incident Info,Latitude,Longitude,Participant Age,Participant Age Group,Participant Gender,Participant Relationship,Participant Status,Participant Type,State Abbr
0,2013,Pennsylvania,0,4,Shot - Wounded/Injured||Mass Shooting (4+ vict...,40.3467,-79.8559,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||3::Male||4::Female,,0::Arrested||1::Injured||2::Injured||3::Injure...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,PA
1,2013,California,1,3,"Shot - Wounded/Injured||Shot - Dead (murder, a...",33.909,-118.333,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male,,0::Killed||1::Injured||2::Injured||3::Injured,0::Victim||1::Victim||2::Victim||3::Victim||4:...,CA
2,2013,Ohio,1,3,"Shot - Wounded/Injured||Shot - Dead (murder, a...",41.4455,-82.1377,0::25||1::31||2::33||3::34||4::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||2::Male||3::Male||4::Male,,"0::Injured, Unharmed, Arrested||1::Unharmed, A...",0::Subject-Suspect||1::Subject-Suspect||2::Vic...,OH
3,2013,Colorado,4,0,"Shot - Dead (murder, accidental, suicide)||Off...",39.6518,-104.802,0::29||1::33||2::56||3::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Female||1::Male||2::Male||3::Male,,0::Killed||1::Killed||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,CO
4,2013,North Carolina,2,2,"Shot - Wounded/Injured||Shot - Dead (murder, a...",36.114,-79.9569,0::18||1::46||2::14||3::47,0::Adult 18+||1::Adult 18+||2::Teen 12-17||3::...,0::Female||1::Male||2::Male||3::Female,3::Family,0::Injured||1::Injured||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,NC


In [7]:
# Several columns are not used in this analysis, so let's drop them.
# `drop` returns a new frame; `raw_data` itself is left untouched.
clean_data = raw_data.drop(
    [
        'State Name',
        'Participant Age',
        'Participant Age Group',
        'Participant Gender',
        'Participant Relationship',
        'Participant Status',
        'Participant Type',
    ],
    axis='columns',
)
clean_data.head()

Unnamed: 0,Year,Number of Killed,Number of Injured,Incident Info,Latitude,Longitude,State Abbr
0,2013,0,4,Shot - Wounded/Injured||Mass Shooting (4+ vict...,40.3467,-79.8559,PA
1,2013,1,3,"Shot - Wounded/Injured||Shot - Dead (murder, a...",33.909,-118.333,CA
2,2013,1,3,"Shot - Wounded/Injured||Shot - Dead (murder, a...",41.4455,-82.1377,OH
3,2013,4,0,"Shot - Dead (murder, accidental, suicide)||Off...",39.6518,-104.802,CO
4,2013,2,2,"Shot - Wounded/Injured||Shot - Dead (murder, a...",36.114,-79.9569,NC


In [16]:
# First let's split the incident characteristics field and keep the values in sets (for faster operations).
# Splitting on a single '|' turns every '||' separator into an empty string between
# two labels; those empty strings are removed by subtracting `empty_value_set`.
empty_value_set = {''}
# `assign` works on a copy, so `clean_data` keeps its original string column.
split_inc_info_data = clean_data.assign(**{
    'Incident Info': clean_data['Incident Info'].apply(
        lambda info: set(info.split('|')) - empty_value_set
    )
})
split_inc_info_data.head()

Unnamed: 0,Year,Number of Killed,Number of Injured,Incident Info,Latitude,Longitude,State Abbr
0,2013,0,4,{Possession (gun(s) found during commission of...,40.3467,-79.8559,PA
1,2013,1,3,{Mass Shooting (4+ victims injured or killed e...,33.909,-118.333,CA
2,2013,1,3,"{Shot - Wounded/Injured, Shot - Dead (murder, ...",41.4455,-82.1377,OH
3,2013,4,0,"{Drug involvement, Shot - Dead (murder, accide...",39.6518,-104.802,CO
4,2013,2,2,"{Domestic Violence, Suicide^, Shot - Dead (mur...",36.114,-79.9569,NC


In [17]:
# Let's first take a look at all existing categories of the incidents
# (one incident can belong to multiple categories).
# `set().union(*sets)` merges every per-incident set in one call: unlike a pairwise
# `reduce` it does not build a fresh intermediate set per row, and it yields an
# empty set (instead of raising TypeError) when there are no incidents at all.
distinct_categories = set().union(*split_inc_info_data['Incident Info'])
print(f'There are {len(distinct_categories)} unique categories')
distinct_categories

There are 109 unique categories


{'ATF/LE Confiscation/Raid/Arrest',
 'Accidental Shooting',
 'Accidental Shooting - Death',
 'Accidental Shooting - Injury',
 'Accidental Shooting at a Business',
 'Accidental/Negligent Discharge',
 'Animal shot/killed',
 'Armed robbery with injury/death and/or evidence of DGU found',
 'Assault weapon (AR-15, AK-47, and ALL variants defined by law enforcement)',
 'Attempted Murder/Suicide (one variable unsuccessful)',
 'BB/Pellet/Replica gun',
 'Bar/club incident - in or around establishment',
 'Brandishing/flourishing/open carry/lost/found',
 'Car-jacking',
 'Child Involved Incident',
 'Child injured (not child shooter)',
 'Child injured by child',
 'Child injured self',
 'Child killed (not child shooter)',
 'Child killed by child',
 'Child killed self',
 'Child picked up & fired gun',
 'Child with gun - no shots fired',
 'Cleaning gun',
 'Concealed Carry License - Perpetrator',
 'Concealed Carry License - Victim',
 'Criminal act with stolen gun',
 'Defensive Use',
 'Defensive Use - C

In [19]:
# Now let's take a look at the statistics for each category.
#
# Every incident is visited exactly once and its casualty numbers are added to the
# tally of each category it belongs to. This replaces re-filtering the whole frame
# once per category, which costs (number of categories x number of incidents)
# Python-level membership checks.
category_totals = {}  # category -> [number of incidents, total killed, total injured]
for incident_categories, killed, injured in zip(
    split_inc_info_data['Incident Info'],
    # fillna(0) mirrors the NaN-skipping behaviour of `Series.sum()`.
    split_inc_info_data['Number of Killed'].fillna(0),
    split_inc_info_data['Number of Injured'].fillna(0),
):
    for category in incident_categories:
        totals = category_totals.setdefault(category, [0, 0, 0])
        totals[0] += 1
        totals[1] += killed
        totals[2] += injured

# One row per distinct category, in the iteration order of `distinct_categories`;
# a category that matches no incident gets all-zero statistics.
ordered_categories = list(distinct_categories)
no_incidents = (0, 0, 0)
category_related_data = pd.DataFrame({
    'Category': ordered_categories,
    'Number of Incidents': [category_totals.get(c, no_incidents)[0] for c in ordered_categories],
    'Total Killed': [category_totals.get(c, no_incidents)[1] for c in ordered_categories],
    'Total Injured': [category_totals.get(c, no_incidents)[2] for c in ordered_categories],
})
category_related_data.head()

Unnamed: 0,Category,Number of Incidents,Total Killed,Total Injured
0,NAV,2,0,1
1,Home Invasion - Resident killed,964,1145,370
2,Self-Inflicted (not suicide or suicide attempt...,1887,333,1699
3,Child picked up & fired gun,607,195,393
4,Shootout (where VENN diagram of shooters and v...,2331,1085,2256
