In [1]:
# Import some common packages
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

DATA_PATH = '.'

def load_data(data_path=DATA_PATH, filename="gun-violence-data_01-2013_03-2018.csv"):
    """Read a CSV file into a DataFrame, skipping rows that cannot be parsed.

    Parameters
    ----------
    data_path : str
        Directory that contains the file (defaults to ``DATA_PATH``).
    filename : str
        Name of the CSV file inside ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed table; malformed rows (e.g. too many fields) are dropped
        with a warning instead of aborting the load.
    """
    csv_path = os.path.join(data_path, filename)
    try:
        # pandas >= 1.3: `error_bad_lines` was deprecated there and removed in 2.0.
        # "warn" mirrors the old error_bad_lines=False / warn_bad_lines=True default.
        return pd.read_csv(csv_path, on_bad_lines="warn")
    except TypeError:
        # Older pandas does not know `on_bad_lines`; fall back to the legacy flag.
        return pd.read_csv(csv_path, error_bad_lines=False)

# Load the four socio-economic tables and keep only county rows.
# Per the original note, state and US rows are dropped because the data is
# wanted by county; this is done by requiring the 2013 rural-urban /
# urban-influence codes to be present.
poverty_data = load_data(DATA_PATH, "poverty.csv").dropna(
    subset=["Rural-urban_Continuum_Code_2013", "Urban_Influence_Code_2013"]
)
education_data = load_data(DATA_PATH, "education.csv").dropna(
    subset=["2013 Rural-urban Continuum Code", "2013 Urban Influence Code"]
)
unemployment_data = load_data(DATA_PATH, "unemployment.csv").dropna(
    subset=["Rural_urban_continuum_code_2013", "Urban_influence_code_2013"]
)
# The population table additionally needs every 2013-2017 estimate to be present,
# because those five columns are averaged later on.
population_data = load_data(DATA_PATH, "population.csv").dropna(
    subset=[
        "Rural-urban_Continuum Code_2013",
        "Urban_Influence_Code_2013",
        "POP_ESTIMATE_2017",
        "POP_ESTIMATE_2016",
        "POP_ESTIMATE_2015",
        "POP_ESTIMATE_2014",
        "POP_ESTIMATE_2013",
    ]
)

poverty_data.head()

Unnamed: 0,FIPStxt,State,Area_Name,Rural-urban_Continuum_Code_2003,Urban_Influence_Code_2003,Rural-urban_Continuum_Code_2013,Urban_Influence_Code_2013,POVALL_2016,CI90LBAll_2016,CI90UBALL_2016,...,CI90UB517P_2016,MEDHHINC_2016,CI90LBINC_2016,CI90UBINC_2016,POV05_2016,CI90LB05_2016,CI90UB05_2016,PCTPOV05_2016,CI90LB05P_2016,CI90UB05P_2016
2,1001,AL,Autauga County,2.0,2.0,2.0,2.0,7444,6255,8633,...,22.0,54487,50886,58088,,,,,,
3,1003,AL,Baldwin County,4.0,5.0,3.0,2.0,24005,20132,27878,...,20.0,56460,53250,59670,,,,,,
4,1005,AL,Barbour County,6.0,6.0,6.0,6.0,6787,5551,8023,...,45.0,32884,29684,36084,,,,,,
5,1007,AL,Bibb County,1.0,1.0,1.0,1.0,4099,3194,5004,...,33.0,43079,38896,47262,,,,,,
6,1009,AL,Blount County,1.0,1.0,1.0,1.0,8033,6506,9560,...,22.0,47213,43017,51409,,,,,,


In [2]:
# Preview the first rows of the county-filtered education table.
education_data.head()

Unnamed: 0,FIPS Code,State,Area name,2003 Rural-urban Continuum Code,2003 Urban Influence Code,2013 Rural-urban Continuum Code,2013 Urban Influence Code,"Less than a high school diploma, 1970","High school diploma only, 1970","Some college (1-3 years), 1970",...,"Percent of adults completing some college or associate's degree, 2000","Percent of adults with a bachelor's degree or higher, 2000","Less than a high school diploma, 2012-2016","High school diploma only, 2012-2016","Some college or associate's degree, 2012-2016","Bachelor's degree or higher, 2012-2016","Percent of adults with less than a high school diploma, 2012-2016","Percent of adults with a high school diploma only, 2012-2016","Percent of adults completing some college or associate's degree, 2012-2016","Percent of adults with a bachelor's degree or higher, 2012-2016"
2,1001,AL,Autauga County,2.0,2.0,2.0,2.0,6611,3757,933,...,26.9,18.0,4528,12519,10451,8968,12.4,34.3,28.7,24.6
3,1003,AL,Baldwin County,4.0,5.0,3.0,2.0,18726,8426,2334,...,29.3,23.1,13956,40154,44486,41350,10.0,28.7,31.8,29.5
4,1005,AL,Barbour County,6.0,6.0,6.0,6.0,8120,2242,581,...,21.3,10.9,4824,6422,4775,2366,26.2,34.9,26.0,12.9
5,1007,AL,Bibb County,1.0,1.0,1.0,1.0,5272,1402,238,...,20.4,7.1,3040,6586,4234,1890,19.3,41.8,26.9,12.0
6,1009,AL,Blount County,1.0,1.0,1.0,1.0,10677,3440,626,...,24.8,9.6,7882,13003,13436,5151,20.0,32.9,34.0,13.1


In [3]:
# Preview the first rows of the county-filtered unemployment table
# (note that "Area_name" still carries a ", <state>" suffix at this point).
unemployment_data.head()

Unnamed: 0,FIPStxt,State,Area_name,Rural_urban_continuum_code_2013,Urban_influence_code_2013,Metro_2013,Civilian_labor_force_2007,Employed_2007,Unemployed_2007,Unemployment_rate_2007,...,Civilian_labor_force_2016,Employed_2016,Unemployed_2016,Unemployment_rate_2016,Civilian_labor_force_2017,Employed_2017,Unemployed_2017,Unemployment_rate_2017,Median_Household_Income_2016,Med_HH_Income_Percent_of_State_Total_2016
2,1001,AL,"Autauga County, AL",2.0,2.0,1.0,24383,23577,806,3.3,...,25918,24593,1325,5.1,25909,24908,1001,3.9,"$54,487",117.7
3,1003,AL,"Baldwin County, AL",3.0,2.0,1.0,82659,80099,2560,3.1,...,90500,85656,4844,5.4,91567,87915,3652,4.0,"$56,460",121.9
4,1005,AL,"Barbour County, AL",6.0,6.0,0.0,10334,9684,650,6.3,...,8402,7700,702,8.4,8236,7750,486,5.9,"$32,884",71.0
5,1007,AL,"Bibb County, AL",1.0,1.0,1.0,8791,8432,359,4.1,...,8607,8050,557,6.5,8506,8133,373,4.4,"$43,079",93.0
6,1009,AL,"Blount County, AL",1.0,1.0,1.0,26629,25780,849,3.2,...,24576,23248,1328,5.4,24494,23509,985,4.0,"$47,213",102.0


In [4]:
# drop the state in "Area_name"
def _before_first_comma(area_name):
    """Return the text that precedes the first comma ("Autauga County, AL" -> "Autauga County")."""
    return area_name.partition(',')[0]

unemployment_data["Area_name"] = unemployment_data["Area_name"].map(_before_first_comma)
unemployment_data.head()

Unnamed: 0,FIPStxt,State,Area_name,Rural_urban_continuum_code_2013,Urban_influence_code_2013,Metro_2013,Civilian_labor_force_2007,Employed_2007,Unemployed_2007,Unemployment_rate_2007,...,Civilian_labor_force_2016,Employed_2016,Unemployed_2016,Unemployment_rate_2016,Civilian_labor_force_2017,Employed_2017,Unemployed_2017,Unemployment_rate_2017,Median_Household_Income_2016,Med_HH_Income_Percent_of_State_Total_2016
2,1001,AL,Autauga County,2.0,2.0,1.0,24383,23577,806,3.3,...,25918,24593,1325,5.1,25909,24908,1001,3.9,"$54,487",117.7
3,1003,AL,Baldwin County,3.0,2.0,1.0,82659,80099,2560,3.1,...,90500,85656,4844,5.4,91567,87915,3652,4.0,"$56,460",121.9
4,1005,AL,Barbour County,6.0,6.0,0.0,10334,9684,650,6.3,...,8402,7700,702,8.4,8236,7750,486,5.9,"$32,884",71.0
5,1007,AL,Bibb County,1.0,1.0,1.0,8791,8432,359,4.1,...,8607,8050,557,6.5,8506,8133,373,4.4,"$43,079",93.0
6,1009,AL,Blount County,1.0,1.0,1.0,26629,25780,849,3.2,...,24576,23248,1328,5.4,24494,23509,985,4.0,"$47,213",102.0


In [5]:
# Preview the first rows of the county-filtered population table.
population_data.head()

Unnamed: 0,FIPS,State,Area_Name,Rural-urban_Continuum Code_2003,Rural-urban_Continuum Code_2013,Urban_Influence_Code_2003,Urban_Influence_Code_2013,Economic_typology_2015,CENSUS_2010_POP,ESTIMATES_BASE_2010,...,R_DOMESTIC_MIG_2015,R_DOMESTIC_MIG_2016,R_DOMESTIC_MIG_2017,R_NET_MIG_2011,R_NET_MIG_2012,R_NET_MIG_2013,R_NET_MIG_2014,R_NET_MIG_2015,R_NET_MIG_2016,R_NET_MIG_2017
2,1001,AL,Autauga County,2.0,2.0,2.0,2.0,0.0,54571,54571,...,-2.0,4.8,1.0,5.9,-6.1,-4.1,2.1,-1.7,5.1,1.3
3,1003,AL,Baldwin County,4.0,3.0,5.0,2.0,5.0,182265,182265,...,17.0,20.5,22.4,16.3,17.2,22.6,20.4,17.9,21.3,23.2
4,1005,AL,Barbour County,6.0,6.0,6.0,6.0,3.0,27457,27457,...,-16.2,-18.8,-19.0,0.3,-6.8,-8.0,-5.5,-16.4,-18.9,-19.2
5,1007,AL,Bibb County,1.0,1.0,1.0,1.0,0.0,22915,22919,...,0.9,-1.4,-0.9,-5.0,-4.1,-5.9,1.2,1.8,-0.5,0.0
6,1009,AL,Blount County,1.0,1.0,1.0,1.0,0.0,57322,57324,...,-1.6,-1.7,6.2,0.2,-1.4,-0.5,-1.8,-0.5,-0.7,7.3


In [6]:
# drop the ',' in values
# The yearly population estimates are text with thousands separators; remove the
# separators so the values can be converted with int() later on.
for estimate_year in (2017, 2016, 2015, 2014, 2013):
    estimate_column = "POP_ESTIMATE_" + str(estimate_year)
    population_data[estimate_column] = population_data[estimate_column].map(
        lambda raw_value: raw_value.replace(",", '')
    )

In [7]:
# Compare the (rows, columns) shape of the four tables, in load order.
table_shapes = [
    table.shape
    for table in (poverty_data, education_data, unemployment_data, population_data)
]
print(*table_shapes)

(3142, 34) (3221, 47) (3219, 52) (3220, 133)


So the poverty data has the fewest cities or counties (the fewest rows of the four tables).

In [8]:
# Add features from the datasets we get.
#
# The four tables are joined on the county FIPS code instead of the county name.
# County names are not unique across states (e.g. "Washington County"), so a
# name-only lookup silently picks up the first same-named county of another state.
# A FIPS join also avoids dropping rows from the frame while iterating over it and
# replaces the per-county Python loops with vectorised lookups.
# NOTE(review): the set of retained rows can differ from the name-based version,
# because counties are now kept when their FIPS code (not their spelling) matches.

def _county_keyed(table, fips_column):
    """Index `table` by integer county FIPS code, keeping the first row per code."""
    keyed = table.assign(county_fips=table[fips_column].astype(int))
    return keyed.drop_duplicates(subset="county_fips").set_index("county_fips")


def _sum_columns(table, columns, dtype):
    """Add `columns` of `table` row-wise after casting each to `dtype` (NaN propagates)."""
    return sum(table[column].astype(dtype) for column in columns)


education_by_fips = _county_keyed(education_data, "FIPS Code")
unemployment_by_fips = _county_keyed(unemployment_data, "FIPStxt")
population_by_fips = _county_keyed(population_data, "FIPS")

# Keep only the counties that are present in all four tables.
poverty_fips = poverty_data["FIPStxt"].astype(int)
in_all_tables = (
    poverty_fips.isin(education_by_fips.index)
    & poverty_fips.isin(unemployment_by_fips.index)
    & poverty_fips.isin(population_by_fips.index)
)
county_fips = poverty_fips[in_all_tables]

# .loc + rename yields a fresh frame that keeps the original row labels of poverty_data.
processed_all = poverty_data.loc[
    in_all_tables, ["State", "Area_Name", "PCTPOVALL_2016", "PCTPOV017_2016", "MEDHHINC_2016"]
].rename(columns={'PCTPOVALL_2016': 'pov_all',
                  "PCTPOV017_2016": "pov_under17",
                  "MEDHHINC_2016": "median_income"})

# convert poverty values to float (original is string)
processed_all["pov_all"] = processed_all["pov_all"].astype(float)
processed_all["pov_under17"] = processed_all["pov_under17"].astype(float)
# The conversion below was disabled in the original, so median_income is kept exactly as loaded.
# processed_all["median_income"] = processed_all["median_income"].map(lambda x: int(x.replace(",",'')))

# features from datasets other than poverty_data
less_than_high_school = "Percent of adults with less than a high school diploma, 2012-2016"
below_bachelor_columns = [
    less_than_high_school,
    "Percent of adults with a high school diploma only, 2012-2016",
    "Percent of adults completing some college or associate's degree, 2012-2016",
]
unemployment_rate_columns = ["Unemployment_rate_%d" % year for year in (2017, 2016, 2015, 2014, 2013)]
# The POP_ESTIMATE_* columns must already have their thousands separators removed.
population_columns = ["POP_ESTIMATE_%d" % year for year in (2017, 2016, 2015, 2014, 2013)]

# Series.map(Series) looks every FIPS code up in the feature series' index; the result
# carries processed_all's own row labels, so the assignments align row by row.
processed_all["no_high_school"] = county_fips.map(education_by_fips[less_than_high_school].astype(float))
# share of adults without a bachelor's degree = sum of the three lower attainment levels
processed_all["no_bachelor"] = county_fips.map(_sum_columns(education_by_fips, below_bachelor_columns, float))
# five-year (2013-2017) averages
processed_all["unemployment"] = county_fips.map(_sum_columns(unemployment_by_fips, unemployment_rate_columns, float) / 5)
processed_all["pop"] = county_fips.map(_sum_columns(population_by_fips, population_columns, int) / 5)

# drop "County" in "Area_Name"
processed_all["Area_Name"] = processed_all["Area_Name"].map(lambda name: name.replace(" County", ''))
processed_all.head(10)

Unnamed: 0,State,Area_Name,pov_all,pov_under17,median_income,no_high_school,no_bachelor,unemployment,pop
2,AL,Autauga,14.0,19.0,54487,12.4,75.4,5.20,55035.8
3,AL,Baldwin,12.0,18.0,56460,10.0,70.5,5.62,203360.0
4,AL,Barbour,30.0,40.0,32884,26.2,87.1,8.80,26200.8
5,AL,Bibb,20.0,28.0,43079,19.3,88.0,6.60,22579.6
6,AL,Blount,14.0,19.0,47213,20.0,86.9,5.36,57666.8
7,AL,Bullock,33.0,46.0,34278,33.4,89.7,7.60,10478.2
8,AL,Butler,25.0,37.0,35409,18.9,83.9,7.84,20126.0
9,AL,Calhoun,17.0,26.0,41778,17.7,82.4,7.06,115527.2
10,AL,Chambers,20.0,33.0,39530,19.7,87.5,6.06,33893.6
11,AL,Cherokee,17.0,28.0,41456,18.7,86.1,5.52,25855.0


In [9]:
# Gun-violence totals that are accumulated per county later on.
# They start as NaN so counties with no matched incident can be told apart and dropped.
new_column = [
    'n_killed',
    'n_injured',
    'victims_involved',
    'suspects_identified',
    'teen_participants',
]
for target_name in new_column:
    processed_all[target_name] = np.nan
processed_all.head()

Unnamed: 0,State,Area_Name,pov_all,pov_under17,median_income,no_high_school,no_bachelor,unemployment,pop,n_killed,n_injured,victims_involved,suspects_identified,teen_participants
2,AL,Autauga,14.0,19.0,54487,12.4,75.4,5.2,55035.8,,,,,
3,AL,Baldwin,12.0,18.0,56460,10.0,70.5,5.62,203360.0,,,,,
4,AL,Barbour,30.0,40.0,32884,26.2,87.1,8.8,26200.8,,,,,
5,AL,Bibb,20.0,28.0,43079,19.3,88.0,6.6,22579.6,,,,,
6,AL,Blount,14.0,19.0,47213,20.0,86.9,5.36,57666.8,,,,,


In [10]:
from copy import deepcopy
# Then we can merge this with gun-violence data. First we read gun-violence and do some
# preprocessing according to our needs (copied from main.ipynb).
all_data = load_data()

# Columns of the raw incident table that are not needed here.
dropped_columns = [
    'incident_id', 'incident_url', 'incident_url_fields_missing', 'gun_stolen',
    'gun_type', 'location_description', 'n_guns_involved', 'notes', 'participant_age',
    'participant_name', 'participant_relationship', 'date',
]
processed_data = deepcopy(all_data).drop(dropped_columns, axis=1)

# Counter columns to add, grouped by the participant_* column they are derived from
# (age group, gender, status, type).
new_columns = [
    ['child_participants', 'teen_participants', 'adult_participants'],
    ['male_participants', 'female_participants'],
    ['arrested_participants', 'injured_participants', 'killed_participants', 'unharmed_participants'],
    ['victims_involved', 'suspects_identified'],
]

# Labels searched for in the participant_* text; info_list[i][j] feeds new_columns[i][j].
info_list = [
    ['Child', 'Teen', 'Adult'],
    ['Male', 'Female'],
    ['Arrested', 'Injured', 'Killed', 'Unharmed'],
    ['Victim', 'Suspect'],
]

# add new columns to our dataframe, all starting at zero
for counter_name in (name for group in new_columns for name in group):
    processed_data[counter_name] = 0

# Separate output frame: the extraction reads from processed_data and writes here.
processed_data_new = processed_data.copy()

def extractInfoFromColumns(row_index, column_name_list, info_list, new_columns):
    """Count labelled participants in one incident row and store the counts.

    Reads the cells ``processed_data.at[row_index, column]`` for every column in
    ``column_name_list`` (module-level frame) and writes the resulting counts to
    ``processed_data_new`` (module-level frame) for the same row.

    Each cell is treated as text made of entries separated by ``|`` or ``||``;
    an entry may carry a ``key:`` / ``key::`` prefix, which is discarded.

    Parameters
    ----------
    row_index :
        Row label used with ``.at`` on both frames.
    column_name_list : list of str
        Source columns to scan.
    info_list : list of list of str
        ``info_list[i]`` holds the labels searched for (as substrings) in column ``i``.
    new_columns : list of list of str
        ``new_columns[i][j]`` is the counter column incremented when
        ``info_list[i][j]`` is found.

    Returns
    -------
    None
        Only counters that were hit at least once are written.
    """
    tally = {}
    for group_idx, column_name in enumerate(column_name_list):
        raw_cell = str(processed_data.at[row_index, column_name])
        labels = info_list[group_idx]
        for entry in raw_cell.replace('||', '|').split('|'):
            # Keep only what follows the first ':' (after collapsing '::'), if there is one.
            head, colon, remainder = entry.replace('::', ':').partition(':')
            text = remainder if colon else head

            # A combined status such as "<something>, Arrested" counts as Arrested only.
            if ", Arrested" in text:
                text = 'Arrested'

            for label_idx, label in enumerate(labels):
                if label in text:
                    target = new_columns[group_idx][label_idx]
                    tally[target] = tally.get(target, 0) + 1

    for target, count in tally.items():
        processed_data_new.at[row_index, target] = count
        
# Fill the participant counters for every incident.
cols = ['participant_age_group', 'participant_gender', 'participant_status', 'participant_type']
# extractInfoFromColumns addresses rows with .at, i.e. by label; the positions 0..n-1
# used here are presumably also the row labels (default index from read_csv) - TODO confirm.
incident_count = len(processed_data)
for row_position in range(incident_count):
    extractInfoFromColumns(row_position, cols, info_list, new_columns)

# Drop the raw participant text, the location/district fields and the counters
# that are not used as features below.
dropped_columns = [
    'congressional_district', 'incident_characteristics', 'latitude', 'longitude',
    'participant_age_group', 'participant_gender', 'participant_status', 'participant_type',
    'killed_participants', 'injured_participants', 'state_house_district', 'state_senate_district',
    'child_participants', 'adult_participants', 'male_participants', 'female_participants',
    'arrested_participants', 'unharmed_participants',
]
processed_data_new = processed_data_new.drop(dropped_columns, axis=1)
processed_data_new.head()

Unnamed: 0,state,city_or_county,address,n_killed,n_injured,source_url,sources,teen_participants,victims_involved,suspects_identified
0,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,http://www.post-gazette.com/local/south/2013/0...,http://pittsburgh.cbslocal.com/2013/01/01/4-pe...,0,4,1
1,California,Hawthorne,13500 block of Cerise Avenue,1,3,http://www.dailybulletin.com/article/zz/201301...,http://losangeles.cbslocal.com/2013/01/01/man-...,0,4,1
2,Ohio,Lorain,1776 East 28th Street,1,3,http://chronicle.northcoastnow.com/2013/02/14/...,http://www.morningjournal.com/general-news/201...,0,3,2
3,Colorado,Aurora,16000 block of East Ithaca Place,4,0,http://www.dailydemocrat.com/20130106/aurora-s...,http://denver.cbslocal.com/2013/01/06/officer-...,0,3,1
4,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,http://www.journalnow.com/news/local/article_d...,http://myfox8.com/2013/01/08/update-mother-sho...,1,3,1


In [11]:
# Force the free-text columns to str so substring checks also work on missing
# values (a missing cell becomes the string produced by str(), e.g. 'nan').
for text_column in ("source_url", "sources", "address", "city_or_county"):
    processed_data_new[text_column] = processed_data_new[text_column].map(str)

In [15]:
# states_dict = {
#         'AK': 'Alaska',
#         'AL': 'Alabama',
#         'AR': 'Arkansas',
#         'AS': 'American Samoa',
#         'AZ': 'Arizona',
#         'CA': 'California',
#         'CO': 'Colorado',
#         'CT': 'Connecticut',
#         'DC': 'District of Columbia',
#         'DE': 'Delaware',
#         'FL': 'Florida',
#         'GA': 'Georgia',
#         'GU': 'Guam',
#         'HI': 'Hawaii',
#         'IA': 'Iowa',
#         'ID': 'Idaho',
#         'IL': 'Illinois',
#         'IN': 'Indiana',
#         'KS': 'Kansas',
#         'KY': 'Kentucky',
#         'LA': 'Louisiana',
#         'MA': 'Massachusetts',
#         'MD': 'Maryland',
#         'ME': 'Maine',
#         'MI': 'Michigan',
#         'MN': 'Minnesota',
#         'MO': 'Missouri',
#         'MP': 'Northern Mariana Islands',
#         'MS': 'Mississippi',
#         'MT': 'Montana',
#         'NA': 'National',
#         'NC': 'North Carolina',
#         'ND': 'North Dakota',
#         'NE': 'Nebraska',
#         'NH': 'New Hampshire',
#         'NJ': 'New Jersey',
#         'NM': 'New Mexico',
#         'NV': 'Nevada',
#         'NY': 'New York',
#         'OH': 'Ohio',
#         'OK': 'Oklahoma',
#         'OR': 'Oregon',
#         'PA': 'Pennsylvania',
#         'PR': 'Puerto Rico',
#         'RI': 'Rhode Island',
#         'SC': 'South Carolina',
#         'SD': 'South Dakota',
#         'TN': 'Tennessee',
#         'TX': 'Texas',
#         'UT': 'Utah',
#         'VA': 'Virginia',
#         'VI': 'Virgin Islands',
#         'VT': 'Vermont',
#         'WA': 'Washington',
#         'WI': 'Wisconsin',
#         'WV': 'West Virginia',
#         'WY': 'Wyoming'
# }

# # Divide gun-violence data according to the state it took place in
# subset_list = {}
# for i in states_dict:
#     subset_list[i] = processed_data_new[processed_data_new['state'] == states_dict[i]]    

# for i in range(len(processed_all)):
#     state_abbr = processed_all.iloc[i]['State']
#     area = processed_all.iloc[i]['Area_Name']
#     area_low = area.lower()
# #     subset = processed_data_new[processed_data_new['state'] == states_dict[state_abbr]]
#     for j in range(len(subset_list[state_abbr])):        
#         if area in subset_list[state_abbr].iloc[j]['city_or_county'] or area in subset_list[state_abbr].iloc[j]['address'] or area_low in subset_list[state_abbr].iloc[j]['source_url'] or area_low in subset_list[state_abbr].iloc[j]['sources']:
#             if np.isnan(processed_all.iloc[i]['n_killed']):
#                 for k in new_column:
#                     processed_all.iloc[i, processed_all.columns.get_loc(k)] = 0
#             for k in new_column:
#                     processed_all.iloc[i, processed_all.columns.get_loc(k)] += subset_list[state_abbr].iloc[j][k]

# processed_all = processed_all.dropna(subset=new_column) 
# processed_all             

Unnamed: 0,State,Area_Name,pov_all,pov_under17,median_income,no_high_school,no_bachelor,unemployment,pop,n_killed,n_injured,victims_involved,suspects_identified,teen_participants
2,AL,Autauga,14.0,19.0,54487,12.4,75.4,5.20,55035.8,8.0,2.0,14.0,23.0,0.0
3,AL,Baldwin,12.0,18.0,56460,10.0,70.5,5.62,203360.0,14.0,2.0,11.0,21.0,0.0
4,AL,Barbour,30.0,40.0,32884,26.2,87.1,8.80,26200.8,4.0,0.0,4.0,2.0,0.0
5,AL,Bibb,20.0,28.0,43079,19.3,88.0,6.60,22579.6,4.0,3.0,8.0,8.0,0.0
6,AL,Blount,14.0,19.0,47213,20.0,86.9,5.36,57666.8,23.0,17.0,35.0,36.0,2.0
7,AL,Bullock,33.0,46.0,34278,33.4,89.7,7.60,10478.2,2.0,0.0,4.0,6.0,0.0
8,AL,Butler,25.0,37.0,35409,18.9,83.9,7.84,20126.0,8.0,7.0,15.0,16.0,1.0
9,AL,Calhoun,17.0,26.0,41778,17.7,82.4,7.06,115527.2,10.0,5.0,17.0,17.0,2.0
10,AL,Chambers,20.0,33.0,39530,19.7,87.5,6.06,33893.6,4.0,4.0,8.0,3.0,0.0
11,AL,Cherokee,17.0,28.0,41456,18.7,86.1,5.52,25855.0,16.0,15.0,39.0,26.0,1.0


In [16]:
# Cache the merged dataframe in a CSV file, because computing it is time-consuming.
# processed_all.to_csv('more_features.csv')

In [19]:
# Reload the merged feature table from the cached CSV instead of recomputing it.
# The reloaded frame has an extra "Unnamed: 0" column (see the preview below),
# presumably the row index that was saved along with the data.
processed_all = load_data(DATA_PATH, "more_features.csv")
processed_all.head()

Unnamed: 0.1,Unnamed: 0,State,Area_Name,pov_all,pov_under17,median_income,no_high_school,no_bachelor,unemployment,pop,n_killed,n_injured,victims_involved,suspects_identified,teen_participants
0,2,AL,Autauga,14.0,19.0,54487,12.4,75.4,5.2,55035.8,8.0,2.0,14.0,23.0,0.0
1,3,AL,Baldwin,12.0,18.0,56460,10.0,70.5,5.62,203360.0,14.0,2.0,11.0,21.0,0.0
2,4,AL,Barbour,30.0,40.0,32884,26.2,87.1,8.8,26200.8,4.0,0.0,4.0,2.0,0.0
3,5,AL,Bibb,20.0,28.0,43079,19.3,88.0,6.6,22579.6,4.0,3.0,8.0,8.0,0.0
4,6,AL,Blount,14.0,19.0,47213,20.0,86.9,5.36,57666.8,23.0,17.0,35.0,36.0,2.0


In [20]:
# Correlate every numeric feature with the number of people killed.
# errors="ignore" keeps this cell re-runnable: on a second run the three columns
# are already gone and a plain drop would raise KeyError.
processed_all = processed_all.drop(['Unnamed: 0', 'State', 'Area_Name'], axis=1, errors="ignore")
# Restrict to numeric columns explicitly: pandas >= 2.0 no longer drops non-numeric
# columns silently in corr() and raises instead.
# NOTE(review): median_income is missing from the recorded result, so it is presumably
# stored as text - convert it upstream if it should take part in the correlation.
corr_matrix = processed_all.select_dtypes(include="number").corr()
corr_matrix["n_killed"].sort_values(ascending=False)

n_killed               1.000000
victims_involved       0.955450
n_injured              0.911928
suspects_identified    0.908647
teen_participants      0.887117
pop                    0.313546
unemployment           0.013082
pov_all                0.012092
pov_under17            0.011385
no_high_school         0.000935
no_bachelor           -0.074659
Name: n_killed, dtype: float64