# Data Science Project

<h3>Research Question</h3>

<h3>Data Collection & Cleaning</h3>

In [1]:
# Setup
import matplotlib.pyplot as plt
import datetime
import random
import numpy as np
import pandas as pd
import re

In [7]:
# Smaller dataset but over longer time period
s1 = pd.read_csv("us_shootings_1982-2017.csv")

# Clean up the column names
s1.columns = s1.columns.str.strip().str.lower().str.replace(' ', '_')

# Function to convert string values to numbers
def num_only(val):
    return re.findall(r'[0-9]+', val)[0]

# Formats the gender into 3 distinct options
def clean_gender(val):
    if val == 'Male & Female':
        return 'Both'
    else:
        return val[0].upper()
    
def clean_venue(val):
    return val.replace('\n','')

# clean up some of the columns
s1["fatalities"] = pd.to_numeric(s1["fatalities"].apply(num_only))
s1["injured"] = pd.to_numeric(s1["injured"].apply(num_only))
s1["total_victims"] = pd.to_numeric(s1["total_victims"].apply(num_only))
s1["race"] = s1["race"].str.strip().str.lower()
s1["gender"] = s1["gender"].apply(clean_gender)
s1["venue"] = s1["venue"].apply(clean_venue)

# keep only relevant columns
s1 = s1.drop(columns=['case', 'race', 'gender','summary','sources','where_obtained',
                      'type_of_weapons','mental_health_-_details','mental_health_sources',
                     'weapons_obtained_legally','weapon_details','latitude','longitude'])

# print unique venues
print("Unique Venues: " + str(s1['venue'].unique()))

# display description of data
print('')
print(s1.describe())

# print a preview of the data
s1.head(15)

Unique Venues: ['Religious' 'Other' 'Workplace' 'Airport' 'School' 'Military']

              year  fatalities     injured  total_victims
count    94.000000   94.000000   94.000000      94.000000
mean   2005.521277    8.404255   12.691489      21.095745
std      10.105602    8.529921   50.856329      57.020100
min    1982.000000    3.000000    0.000000       3.000000
25%    1998.000000    5.000000    1.000000       7.000000
50%    2008.500000    6.000000    3.000000      10.500000
75%    2014.000000    8.000000   10.000000      18.000000
max    2017.000000   58.000000  489.000000     547.000000


Unnamed: 0,location,date,year,fatalities,injured,total_victims,venue,prior_signs_of_mental_health_issues,type
0,"Sutherland Springs, TX",11/5/17,2017,26,20,46,Religious,Yes,Mass
1,"Thornton, CO",11/1/17,2017,3,0,3,Other,Unclear,Mass
2,"Edgewood, MD",10/18/17,2017,3,3,6,Workplace,Unclear,Mass
3,"Las Vegas, NV",10/1/17,2017,58,489,547,Other,TBD,Mass
4,"San Francisco, CA",6/14/17,2017,3,2,5,Workplace,Yes,Mass
5,"Tunkhannock, PA",6/7/17,2017,3,0,3,Workplace,Unclear,Mass
6,"Orlando, Florida",6/5/2017,2017,5,0,5,Workplace,Unclear,Mass
7,"Kirkersville, Ohio",5/12/2017,2017,3,0,3,Workplace,Yes,Mass
8,"Fresno, California",4/18/2017,2017,3,0,3,Other,Unclear,Mass
9,"Fort Lauderdale, Florida",1/6/2017,2017,5,6,11,Airport,Yes,Mass


In [6]:
# very large (250k+ incidents) dataset from https://www.kaggle.com/jameslko/gun-violence-data
# in s2_clean, some unnecessary columns have been removed to reduce size
s2c = pd.read_csv("s2_clean.csv")

# Display description of the data
print(s2c.describe())

# Get total killed and total injured
print("\nTotal killed: " + str(s2c['n_killed'].sum()))
print("Total injured: " + str(s2c['n_injured'].sum()))

s2c = s2c.drop(columns=['address', 'location_description', 'longitude', 'latitude'])
s2c.head(15)

          Unnamed: 0   incident_id       n_killed      n_injured  \
count  239677.000000  2.396770e+05  239677.000000  239677.000000   
mean   119838.000000  5.593343e+05       0.252290       0.494007   
std     69188.934572  2.931287e+05       0.521779       0.729952   
min         0.000000  9.211400e+04       0.000000       0.000000   
25%     59919.000000  3.085450e+05       0.000000       0.000000   
50%    119838.000000  5.435870e+05       0.000000       0.000000   
75%    179757.000000  8.172280e+05       0.000000       1.000000   
max    239676.000000  1.083472e+06      50.000000      53.000000   

       congressional_district       latitude      longitude  n_guns_involved  \
count           227733.000000  231754.000000  231754.000000    140226.000000   
mean                 8.001265      37.546598     -89.338348         1.372442   
std                  8.480835       5.130763      14.359546         4.678202   
min                  0.000000      19.111400    -171.429000        

Unnamed: 0.1,Unnamed: 0,incident_id,date,state,city_or_county,n_killed,n_injured,congressional_district,incident_characteristics,n_guns_involved,participant_age,participant_age_group,participant_gender,state_house_district,state_senate_district
0,0,461105,2013-01-01,Pennsylvania,Mckeesport,0,4,14.0,Shot - Wounded/Injured||Mass Shooting (4+ vict...,,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||3::Male||4::Female,,
1,1,460726,2013-01-01,California,Hawthorne,1,3,43.0,"Shot - Wounded/Injured||Shot - Dead (murder, a...",,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male,62.0,35.0
2,2,478855,2013-01-01,Ohio,Lorain,1,3,9.0,"Shot - Wounded/Injured||Shot - Dead (murder, a...",2.0,0::25||1::31||2::33||3::34||4::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||2::Male||3::Male||4::Male,56.0,13.0
3,3,478925,2013-01-05,Colorado,Aurora,4,0,6.0,"Shot - Dead (murder, accidental, suicide)||Off...",,0::29||1::33||2::56||3::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Female||1::Male||2::Male||3::Male,40.0,28.0
4,4,478959,2013-01-07,North Carolina,Greensboro,2,2,6.0,"Shot - Wounded/Injured||Shot - Dead (murder, a...",2.0,0::18||1::46||2::14||3::47,0::Adult 18+||1::Adult 18+||2::Teen 12-17||3::...,0::Female||1::Male||2::Male||3::Female,62.0,27.0
5,5,478948,2013-01-07,Oklahoma,Tulsa,4,0,1.0,"Shot - Dead (murder, accidental, suicide)||Hom...",,0::23||1::23||2::33||3::55,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Female||1::Female||2::Female||3::Female||4:...,72.0,11.0
6,6,479363,2013-01-19,New Mexico,Albuquerque,5,0,1.0,"Shot - Dead (murder, accidental, suicide)||Mas...",2.0,0::51||1::40||2::9||3::5||4::2||5::15,0::Adult 18+||1::Adult 18+||2::Child 0-11||3::...,0::Male||1::Female||2::Male||3::Female||4::Fem...,10.0,14.0
7,7,479374,2013-01-21,Louisiana,New Orleans,0,5,2.0,Shot - Wounded/Injured||Drive-by (car to stree...,,,,0::Male||1::Male||2::Male||3::Male||4::Male,93.0,5.0
8,8,479389,2013-01-21,California,Brentwood,0,4,9.0,Shot - Wounded/Injured||Drive-by (car to stree...,,,0::Teen 12-17||1::Teen 12-17||2::Teen 12-17||4...,0::Male||1::Male||2::Male||3::Male||4::Male,11.0,7.0
9,9,492151,2013-01-23,Maryland,Baltimore,1,6,7.0,"Shot - Wounded/Injured||Shot - Dead (murder, a...",,0::15,0::Teen 12-17||1::Adult 18+||2::Adult 18+||3::...,0::Male,,44.0


<h3>Data Description</h3>

<h3>Data Limitations</h3>

<h3>Exploratory Data Analysis</h3>

<h3>Questions for Reviewers</h3>

For what purpose was the dataset created?
Who created the dataset?
Who funded the creation of the dataset?
Any other comments