## Messing with data

In [1]:
import pandas as pd
import numpy as np

In [None]:
from matplotlib import pyplot as plt
from scipy import stats

In [2]:
# Load the homicide records and the state-name/abbreviation lookup table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns on large CSVs.
# NOTE(review): the leftover warning text in this cell's output looks like a
# pandas DtypeWarning from this read - confirm, and consider passing explicit
# dtypes for the affected columns.
data = pd.read_csv('clean_murder_data.csv', low_memory=False)
stateAbbr = pd.read_csv("StatesAbbr.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Show the column labels of both tables, one listing per line.
print(data.columns, stateAbbr.columns, sep='\n')

## Creating State database

In [7]:
# Total Murders by state since 1976
# Count the non-null CaseID entries recorded under each state.
totalMurders = data['CaseID'].groupby(data['State']).count()
print(totalMurders.head())
# Same counts as a two-column frame (State, TotalMurders).
totalMurders_df = totalMurders.reset_index(name='TotalMurders')
print(totalMurders_df.head())

State
Alabama        15687
Alaska          1952
Arizona        14482
Arkansas        8081
California    114621
Name: CaseID, dtype: int64
        State  TotalMurders
0     Alabama         15687
1      Alaska          1952
2     Arizona         14482
3    Arkansas          8081
4  California        114621


In [8]:
# Number of UNSOLVED homicides by state since 1976
# Keep only rows whose Solved column is 'No', then count CaseID entries per state.
unsolved = (
    data.loc[data['Solved'].eq('No')]
    .groupby('State')['CaseID']
    .count()
)
print(unsolved.head())
# Same counts as a two-column frame (State, TotalUnsolved).
unsolved_df = unsolved.rename('TotalUnsolved').reset_index()
print(unsolved_df.head())

State
Alabama        3350
Alaska          362
Arizona        4065
Arkansas       1217
California    41459
Name: CaseID, dtype: int64
        State  TotalUnsolved
0     Alabama           3350
1      Alaska            362
2     Arizona           4065
3    Arkansas           1217
4  California          41459


In [9]:
# Percent of UNSOLVED homicides by state since 1976
# Unsolved count divided by total count per state, as a percentage rounded to
# one decimal place. The two Series are aligned on their State index.
# NOTE(review): a state that is missing from `unsolved` (no rows with
# Solved == 'No') comes out as NaN here rather than 0 - confirm that is intended.
percentUnsolved = unsolved.div(totalMurders).mul(100).round(1)
print(percentUnsolved.head())
# Same values as a two-column frame (State, PercentUnsolved).
percentUnsolved_df = (
    percentUnsolved.to_frame()
    .reset_index()
    .rename(columns={'CaseID': 'PercentUnsolved'})
)
print(percentUnsolved_df.head())

State
Alabama       21.4
Alaska        18.5
Arizona       28.1
Arkansas      15.1
California    36.2
Name: CaseID, dtype: float64
        State  PercentUnsolved
0     Alabama             21.4
1      Alaska             18.5
2     Arizona             28.1
3    Arkansas             15.1
4  California             36.2


In [10]:
# Number of SOLVED homicides by state since 1976
# Keep only rows whose Solved column is 'Yes', then count CaseID entries per state.
solved_mask = data['Solved'] == 'Yes'
solved = data[solved_mask].groupby('State')['CaseID'].count()
print(solved.head())
# Same counts as a two-column frame (State, TotalSolved).
solved_df = solved.reset_index(name='TotalSolved')
print(solved_df.head())

State
Alabama       12337
Alaska         1590
Arizona       10417
Arkansas       6864
California    73162
Name: CaseID, dtype: int64
        State  TotalSolved
0     Alabama        12337
1      Alaska         1590
2     Arizona        10417
3    Arkansas         6864
4  California        73162


In [11]:
# Percent of SOLVED homicides by state since 1976
# Solved count divided by total count per state, as a percentage rounded to
# one decimal place. The two Series are aligned on their State index.
# NOTE(review): a state that is missing from `solved` (no rows with
# Solved == 'Yes') comes out as NaN here rather than 0 - confirm that is intended.
solved_share = solved.div(totalMurders)
percentSolved = (solved_share * 100).round(1)
print(percentSolved.head())
# Same values as a two-column frame (State, PercentSolved).
percentSolved_df = percentSolved.rename('PercentSolved').reset_index()
print(percentSolved_df.head())

State
Alabama       78.6
Alaska        81.5
Arizona       71.9
Arkansas      84.9
California    63.8
Name: CaseID, dtype: float64
        State  PercentSolved
0     Alabama           78.6
1      Alaska           81.5
2     Arizona           71.9
3    Arkansas           84.9
4  California           63.8


In [12]:
# Disabled experiment (commented out, so nothing in this cell runs).
# If re-enabled it would count cases per (State, Weapon) pair, sort those
# counts in descending order, and keep the first row per state - i.e. the
# most frequent weapon for each state.
# stateWeapon= data.groupby(['State','Weapon'])["CaseID"].count().reset_index()
# print(stateWeapon.head())
# stateWeapon = stateWeapon.sort_values(["CaseID"], ascending=False)
# stateWeapon_df = stateWeapon.drop_duplicates(["State"])
# print(stateWeapon_df.head())

In [14]:
# Merge all State data to create final csv
# Start from the name/abbreviation lookup and join each per-state table on
# "State" in turn. DataFrame.merge defaults to an inner join, so only states
# present in every table end up in the result.
state_db = (
    stateAbbr
    .merge(totalMurders_df, on="State")
    .merge(unsolved_df, on="State")
    .merge(percentUnsolved_df, on="State")
    .merge(solved_df, on="State")
    .merge(percentSolved_df, on="State")
)
print(state_db.head())
# Write the combined table without the positional index column.
state_db.to_csv("state_db.csv", index=False)

        State Abbreviation  TotalMurders  TotalUnsolved  PercentUnsolved  \
0     Alabama           AL         15687           3350             21.4   
1      Alaska           AK          1952            362             18.5   
2     Arizona           AZ         14482           4065             28.1   
3    Arkansas           AR          8081           1217             15.1   
4  California           CA        114621          41459             36.2   

   TotalSolved  PercentSolved  
0        12337           78.6  
1         1590           81.5  
2        10417           71.9  
3         6864           84.9  
4        73162           63.8  


## Testing for Murder data by month

In [None]:
def month_string_to_number(string: str) -> int:
    """Convert a month name or abbreviation to its calendar number.

    Surrounding whitespace is stripped and only the first three characters
    are compared, case-insensitively, so 'January', 'jan' and ' JAN ' all
    map to 1.

    Parameters
    ----------
    string : str
        Month name or abbreviation, e.g. 'March' or 'mar'.

    Returns
    -------
    int
        1 for January through 12 for December.

    Raises
    ------
    ValueError
        If the first three characters do not match any month abbreviation.
    """
    m = {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr': 4,
        'may': 5,
        'jun': 6,
        'jul': 7,
        'aug': 8,
        'sep': 9,
        'oct': 10,
        'nov': 11,
        'dec': 12,
    }
    s = string.strip()[:3].lower()

    try:
        return m[s]
    except KeyError:
        # Catch only the failed lookup: a bare ``except`` would also intercept
        # unrelated exceptions such as KeyboardInterrupt and mislabel them.
        raise ValueError('Not a month')

In [None]:
# Count cases per (State, Month), add a numeric month column, and sort by
# state and month number so the months appear in calendar order rather than
# alphabetically by name. Built in one chain so re-running the cell always
# starts again from `data`.
stateMurdersByMonth = (
    data.groupby(['State', 'Month'])['CaseID']
    .count()
    .reset_index()
    .assign(MonthNums=lambda counts: counts['Month'].map(month_string_to_number))
    .rename(columns={'CaseID': 'TotalMurders'})
    .sort_values(['State', 'MonthNums'], ascending=True)
)

# NOTE: despite its name, `plotala` holds the North Carolina rows; the name is
# kept unchanged in case other cells refer to it.
plotala = stateMurdersByMonth.loc[stateMurdersByMonth['State'] == 'North Carolina']
print(plotala)

# No error values are available, so the monthly totals are drawn as a plain
# dashed line with point markers.
fig, ax = plt.subplots()
ax.plot(plotala['MonthNums'], plotala['TotalMurders'],
        linestyle='--', marker='.', markersize=8)
ax.set_xlabel('Month number')
ax.set_ylabel('Total murders')
ax.set_title('North Carolina: total murders by month')
plt.show()


In [None]:
# Reshape to one row per month number and one column per state, so each state
# can be drawn as its own line.
pivot = stateMurdersByMonth.pivot(index='MonthNums', columns='State', values='TotalMurders')
print(pivot.head())
# Turn MonthNums back into an ordinary column so it can be used as the x values.
pivot = pivot.reset_index()

fig, ax = plt.subplots()
# One dashed line with point markers per state listed in stateAbbr. No error
# values are available, so a plain line plot is used.
# Raises KeyError if a listed state has no column in `pivot`.
for state in stateAbbr['State']:
    ax.plot(pivot['MonthNums'], pivot[state],
            linestyle='--', marker='.', markersize=8)
ax.set_xlabel('Month number')
ax.set_ylabel('Total murders')
ax.set_title('Total murders by month, one line per state')
plt.show()