In [1]:
import pandas as pd
from plotly.offline import init_notebook_mode, iplot
import numpy as np

import helper

# Set up plotly's offline notebook mode once, before any cell draws a figure.
# NOTE(review): connected=True presumably loads plotly.js from the network
# rather than embedding it in the notebook - confirm against the plotly docs.
init_notebook_mode(connected=True)

In [2]:
# Read the gun-violence incident data through the project helper.
# `path` is relative to the notebook's working directory.
path = "gun-violence-data.csv"
df = helper.load_data(path)

In [3]:
# List every column available in the loaded frame
print(df.columns)

Index(['incident_id', 'date', 'state', 'city_or_county', 'address', 'n_killed',
       'n_injured', 'incident_url', 'source_url',
       'incident_url_fields_missing', 'congressional_district', 'gun_stolen',
       'gun_type', 'incident_characteristics', 'latitude',
       'location_description', 'longitude', 'n_guns_involved', 'notes',
       'participant_age', 'participant_age_group', 'participant_gender',
       'participant_name', 'participant_relationship', 'participant_status',
       'participant_type', 'sources', 'state_house_district',
       'state_senate_district', 'year', 'month', 'monthday', 'weekday',
       'loss'],
      dtype='object')


In [4]:
# Pie charts of how incidents split by number killed, injured, and both combined

incident_count = len(df)
print("Total number of incidents = {}".format(incident_count))

# (column to bucket, chart title) for each pie chart, drawn in this order.
pie_specs = [
    ('n_killed', 'Number of People Killed'),
    ('n_injured', 'Number of People Injured'),
    ('loss', 'Number of People Killed/Injured'),
]

for spec in pie_specs:
    column, title = spec
    # NOTE(review): the 3 is passed straight to the helper; presumably a bucket
    # count or cut-off - confirm in helper.get_bucketed_data.
    labels, values = helper.get_bucketed_data(df, column, 3)
    helper.plot_pie(labels, values, title)

Total number of incidents = 239677


In [5]:
# Show the five incidents with the largest 'loss' value

print("\n\nThe five most serious incidents (in terms of killed+injured)".upper())

# Order all incidents by 'loss', largest first; the original frame is left untouched.
df1 = df.sort_values(by='loss', ascending=False)

# Only the identifying and casualty columns are shown in the summary table.
summary_columns = ['year', 'state', 'city_or_county', 'n_killed', 'n_injured']
df1[summary_columns].head(5)



THE FIVE MOST SERIOUS INCIDENTS (IN TERMS OF KILLED+INJURED)


Unnamed: 0,year,state,city_or_county,n_killed,n_injured
130448,2016,Florida,Orlando,50,53
217151,2017,Texas,Sutherland Springs,27,20
101531,2015,California,San Bernardino,16,19
232745,2018,Florida,Pompano Beach (Parkland),17,17
70511,2015,Texas,Waco,9,18


In [6]:
# Draw one age histogram per participant role: suspects first, then victims

for target_type in ('suspect', 'victim'):
    age_groups = helper.get_age_distribution(
        df['participant_type'],
        df['participant_age'],
        target_type,
    )
    # Axis settings are rebuilt for each figure; ages are shown on a 0-100 range.
    axis_options = {'range': [0, 100]}
    chart_title = target_type + ' age histogram'
    helper.plot_histogram(age_groups, axis_options, chart_title)

In [7]:
# Visualize distribution of participant statuses
#
# The counts come from the `participant_status` column (arrested / injured /
# killed / unharmed), so the chart is titled "Participant Status".
# `participant_type` is a different column (victim vs. suspect) analysed in
# other cells, and the previous title conflated the two.

types = ['ARRESTED', 'INJURED', 'KILLED', 'UNHARMED']
values = helper.get_person_type_counts(df, "participant_status", types)
helper.plot_pie(types, values, 'Participant Status')

In [8]:
# Visualize how num of victims and num of guns varies with num of suspects

# Strip digits, ':', '|' and ',' from each participant_type entry and upper-case
# what remains, so roles can be counted by substring.
# regex=True is explicit because the pattern is a character class: newer pandas
# treats the pattern as a literal string by default, which would make this
# clean-up silently do nothing.
p_type = df["participant_type"].str.replace(r"[:0-9|,]", "", regex=True).str.upper()

# Keep only incidents that have participant-type information, and keep the gun
# counts aligned with them.
has_type = p_type.notnull()
guns = df['n_guns_involved'][has_type]
p_type = p_type[has_type]

# Per-incident counts of each role.
victims = p_type.str.count("VICTIM")
suspects = p_type.str.count("SUBJECT-SUSPECT")

# NOTE(review): get_mean_vs_data presumably returns (suspect counts, mean of the
# first argument at each count) limited by the 9 - confirm in helper.
x_guns, mean_guns = helper.get_mean_vs_data(guns, suspects, 9)
x_victims, mean_victims = helper.get_mean_vs_data(victims, suspects, 9)

# Each series is plotted under the title that matches its data; previously the
# guns series was drawn under the "victims" title and vice versa.
temp = dict(zip([str(i) for i in x_victims], mean_victims))
helper.plot_histogram(temp, dict(range=[1, 10]), 'Mean num of victims vs num of suspects')

temp = dict(zip([str(i) for i in x_guns], mean_guns))
helper.plot_histogram(temp, dict(range=[1, 10]), 'Mean num of guns vs num of suspects')