In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Load the incident-level records; the file name indicates coverage from 01-2013 to 03-2018.
df = pd.read_csv(filepath_or_buffer='data/gun-violence-data_01-2013_03-2018.csv')

In [3]:
#Preview the first five rows to see which columns are available and how values are encoded.
df.head()

Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
0,461105,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,http://www.gunviolencearchive.org/incident/461105,http://www.post-gazette.com/local/south/2013/0...,False,...,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||3::Male||4::Female,0::Julian Sims,,0::Arrested||1::Injured||2::Injured||3::Injure...,0::Victim||1::Victim||2::Victim||3::Victim||4:...,http://pittsburgh.cbslocal.com/2013/01/01/4-pe...,,
1,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,http://www.gunviolencearchive.org/incident/460726,http://www.dailybulletin.com/article/zz/201301...,False,...,0::20,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male,0::Bernard Gillis,,0::Killed||1::Injured||2::Injured||3::Injured,0::Victim||1::Victim||2::Victim||3::Victim||4:...,http://losangeles.cbslocal.com/2013/01/01/man-...,62.0,35.0
2,478855,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,http://www.gunviolencearchive.org/incident/478855,http://chronicle.northcoastnow.com/2013/02/14/...,False,...,0::25||1::31||2::33||3::34||4::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Male||1::Male||2::Male||3::Male||4::Male,0::Damien Bell||1::Desmen Noble||2::Herman Sea...,,"0::Injured, Unharmed, Arrested||1::Unharmed, A...",0::Subject-Suspect||1::Subject-Suspect||2::Vic...,http://www.morningjournal.com/general-news/201...,56.0,13.0
3,478925,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,http://www.gunviolencearchive.org/incident/478925,http://www.dailydemocrat.com/20130106/aurora-s...,False,...,0::29||1::33||2::56||3::33,0::Adult 18+||1::Adult 18+||2::Adult 18+||3::A...,0::Female||1::Male||2::Male||3::Male,0::Stacie Philbrook||1::Christopher Ratliffe||...,,0::Killed||1::Killed||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,http://denver.cbslocal.com/2013/01/06/officer-...,40.0,28.0
4,478959,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,http://www.gunviolencearchive.org/incident/478959,http://www.journalnow.com/news/local/article_d...,False,...,0::18||1::46||2::14||3::47,0::Adult 18+||1::Adult 18+||2::Teen 12-17||3::...,0::Female||1::Male||2::Male||3::Female,0::Danielle Imani Jameison||1::Maurice Eugene ...,3::Family,0::Injured||1::Injured||2::Killed||3::Killed,0::Victim||1::Victim||2::Victim||3::Subject-Su...,http://myfox8.com/2013/01/08/update-mother-sho...,62.0,27.0


In [None]:
#Same preview, limited to the first three rows.
df.head(3)

In [None]:
#Indexing: Single Rows

#The simplest way to access a single row is to pass its integer position to the .iloc indexer. Note that the first 
#row is at position 0, just like list indexes.


In [None]:
#Second row (position 1), returned as a Series indexed by column name.
df.iloc[1]

In [None]:
#Indexing: Multiple Rows

#If we need multiple rows, we can pass in a list of index values. Note that the rows come back in the order given in the list, not in their original order!

#loc gets rows (or columns) with particular labels from the index.
#iloc gets rows (or columns) at particular positions in the index (so it only takes integers).

In [None]:
#With the default integer index, row labels equal row positions, so df.iloc[[2, 3, 44]] would select the same rows here.
wanted_labels = [2, 3, 44]
df.loc[wanted_labels]

In [None]:
#Label-based selection with a boolean mask: keep only the rows whose state is California.
in_california = df['state'] == 'California'
df.loc[in_california]

In [None]:
#Slicing the dataframe just as if it were a list also works.

In [None]:
#Rows at positions 0, 1 and 2; as with lists, the stop position is excluded.
df[:3]

In [None]:
#Rows at positions 3, 4 and 5.
df[3:6]

In [None]:
#Indexing: Columns

#We can access a single column by placing its name in brackets, or a subset of the columns by placing a list of names in brackets, like so:

In [None]:
df['state'].head(3)   #df.state.head(3) is the alternative dot notation; the attribute must match the column name exactly (lowercase here)

In [None]:
#Index object holding every column name in the dataframe.
df.columns

In [None]:
#Indexing: Columns and Rows

#If we need to subset by both columns and rows, we can stack the commands we've already learned.

In [None]:
#Stack a row slice and a column list: take the first ten rows, then keep only the two columns of interest.
df[:10][['state', 'n_killed']]

In [None]:
#Indexing: Scalar Values

#As you may have noticed, everything we've tried so far returns a small dataframe or series. If you need a single 
#value, simply pass in a single column and index value.

In [None]:
#A single column plus a single row position yields one scalar value.
df['state'].iloc[2]

In [None]:
#Element-wise comparison against 'California'; show the first four True/False results.
df['state'].eq('California').head(4)

In [None]:
#The comparison returns a boolean series with one True/False value per row. Passing that series into the dataframe's 
#brackets gives us the subset of the dataframe where the boolean evaluates to True.

In [None]:
#Boolean indexing: the mask keeps only the California rows.
california_mask = df['state'].eq('California')
df[california_mask]

In [None]:
#Some of the logical operators are different:
    #~ replaces not
    #| replaces or
    #& replaces and
#If you combine multiple conditions, each one needs to be wrapped in parentheses, because these operators bind more tightly than comparisons such as > and ==. For example:

In [None]:
#Build each condition on its own, then combine them element-wise with & (and).
more_than_five_killed = df['n_killed'] > 5
in_california = df['state'] == 'California'
df[more_than_five_killed & in_california]

In [None]:
#Keep incidents whose state is any one of the listed values.
west_coast_states = ['California', 'Oregon', 'Washington']
df[df['state'].isin(west_coast_states)]

In [None]:
#Less Common Methods

#Pandas offers many more indexing methods. You should probably stick to a few of them for the sake of keeping your code readable, but it's worth knowing they exist in case you need to read other people's code or have an unusual use case:

    #There are other ways to slice data with brackets. For the sake of readability, please don't use them.
    #.at and .iat: like .loc and .iloc but much faster in exchange for only working on a single column and only 
    #returning a single result.
    #.eval: fast evaluation of a limited set of simple operators. .query works by calling this.
    #.ix: deprecated method that tried to determine if an index should be evaluated with .loc or .iloc. This led to a 
    #lot of subtle bugs! If you see this, you're looking at old code that won't work any more.
    #.get: like .loc, but will return a default value if the key doesn't exist in the index. Only works on a single 
    #column/series.
    #.lookup: Not recommended. It was deprecated in pandas 1.2 and removed in pandas 2.0, so it only appears in old code.
    #.mask: like boolean indexing, but returns a dataframe/series of the same size as the original and anywhere that 
    #the boolean evaluates to True is set to nan.
    #.query: similar to boolean indexing. Faster for large dataframes. Only supports a restricted set of operations; 
    #don't use if you need isnull() or other dataframe methods.
    #.take: equivalent to .iloc, but can operate on either rows or columns.
    #.where: like boolean indexing, but returns a dataframe/series of the same size as the original and anywhere that 
    #the boolean evaluates to False is set to nan.
    #Multi-indexing: potentially useful for small to mid sized hierarchical datasets. Slow on larger datasets.



In [None]:
#Per-column overview: non-null counts and dtypes, plus the frame's memory usage.
df.info()

In [None]:
#Number of incidents in each state; value_counts sorts the result from most to fewest by default.

df['state'].value_counts()

In [None]:
#dtype that pandas inferred for the state column when reading the CSV.
df['state'].dtype

In [None]:
#Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
#Total killed in each state, highest first.
#Functionally the same as         df['n_killed'].groupby(df['state']).sum()
df.groupby('state')['n_killed'].sum().sort_values(ascending=False)

In [None]:
#count() returns the number of non-missing entries in the column, not the number of people killed.
df['n_killed'].count()

In [None]:
#Number of non-missing entries in n_injured (again a row count, not a total of people).
df['n_injured'].count()

In [None]:
#Total number of people killed across all records.
df['n_killed'].sum()

In [None]:
#Total number of people injured across all records.
df['n_injured'].sum()

In [None]:
#List the column names again for reference before plotting and counting individual columns.
df.columns

In [None]:
#Number of recorded incidents per state, with the most affected states first.
#The figure is kept at a size that fits a notebook page; a 75-inch-wide canvas gets scaled down
#until the state labels are unreadable.
state_order = df['state'].value_counts().index  #value_counts sorts descending by incident count
fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(x='state', data=df, order=state_order, ax=ax)
ax.set(title='Gun violence incidents by state', xlabel='State', ylabel='Number of incidents')
ax.tick_params(axis='x', labelrotation=90)  #vertical labels so the state names do not overlap
plt.show()  #render the figure without echoing the Axes repr as cell output

In [None]:
df['participant_age'].value_counts()  #each entry is one encoded string per incident (e.g. '0::25||1::31'),
#so this counts identical strings rather than individual ages and may not give meaningful, detailed results

In [None]:
df['n_killed'].value_counts()  #how many records (rows) there are for each number of people killed

In [None]:
#Entries are encoded per incident (e.g. '0::Victim||1::Victim'), so this counts whole strings, not individual participants.
df['participant_type'].value_counts() 

In [None]:
df['gun_type'].value_counts()  #gun type counts; NOTE(review): if gun_type uses the same '0::...||1::...' encoding as the participant columns, this counts whole strings rather than individual guns -- confirm

In [None]:
#Frequency of each distinct gun_stolen entry; NOTE(review): presumably encoded per gun like the participant columns -- confirm before interpreting.
df['gun_stolen'].value_counts()

In [None]:
#How many people were killed per state.
#Aggregate once with groupby instead of passing estimator=np.sum to barplot: on the raw rows
#seaborn bootstraps a confidence interval for every state, which is slow and draws error bars
#that are not meaningful for a plain total.
killed_by_state = (
    df.groupby('state')['n_killed']
    .sum()
    .sort_values(ascending=False)  #highest totals first so the ranking is readable
    .reset_index()
)
fig, ax = plt.subplots(figsize=(16, 6))  #page-sized canvas; 75 inches wide was illegible once scaled
sns.barplot(x='state', y='n_killed', data=killed_by_state, ax=ax)
ax.set(title='People killed by state', xlabel='State', ylabel='People killed')
ax.tick_params(axis='x', labelrotation=90)  #vertical labels so the state names do not overlap
plt.show()  #render the figure without echoing the Axes repr as cell output