## Cleaning Data

In [1]:
import pandas as pd

In [5]:
# Load the raw homicide records that the rest of the notebook cleans.
data = pd.read_csv(filepath_or_buffer='murder_data.csv')

In [6]:
# Show the column labels of the freshly loaded frame before any cleaning is applied.
data.columns

Index(['ID', 'CNTYFIPS', 'Ori', 'State', 'Agency', 'Agentype', 'Source',
       'Solved', 'Year', 'StateName', 'Month', 'Incident', 'ActionType',
       'Homicide', 'Situation', 'VicAge', 'VicSex', 'VicRace', 'VicEthnic',
       'OffAge', 'OffSex', 'OffRace', 'OffEthnic', 'Weapon', 'Relationship',
       'Circumstance', 'Subcircum', 'VicCount', 'OffCount', 'FileDate',
       'fstate', 'MSA'],
      dtype='object')

In [7]:
# 'ID' is renamed to 'CaseID' so the column name does not cause trouble when the
# data is later loaded into a sqlite database.
column_renames = {'ID': 'CaseID'}
data = data.rename(columns=column_renames)

In [8]:
# Convert the age columns to real numbers.
# replace() on its own only swaps the text labels; every other age would stay a
# string (e.g. '25') and the column would stay object dtype, so the labels are
# mapped first and the whole column is then cast with pd.to_numeric.
# pd.to_numeric raises ValueError if a label not listed below is still present,
# which surfaces unexpected categories instead of silently keeping them as text.
# NOTE: 0 doubles as the "unknown" code in both columns.
vic_age_codes = {
    'Newborn or infant': 1,
    'Age unknown': 0,
    '99 years old and more': 99,
}
off_age_codes = {
    'Offender unknown': 0,
    '99 years old and more': 99,
}
data['VicAge'] = pd.to_numeric(data['VicAge'].replace(vic_age_codes))
data['OffAge'] = pd.to_numeric(data['OffAge'].replace(off_age_codes))

In [9]:
# The source data spells the state 'Rhodes Island'; normalise it to 'Rhode Island',
# then display one corrected row as a spot check.
data['State'] = data['State'].replace({'Rhodes Island': 'Rhode Island'})
data[data['State'].eq('Rhode Island')].head(1)

Unnamed: 0,CaseID,CNTYFIPS,Ori,State,Agency,Agentype,Source,Solved,Year,StateName,...,OffEthnic,Weapon,Relationship,Circumstance,Subcircum,VicCount,OffCount,FileDate,fstate,MSA
604157,200708001CA03710,"San Diego, CA",RI00409,Rhode Island,Providence,Municipal police,FBI,Yes,1976,R I,...,Unknown or not reported,"Firearm, type not stated",Stranger,Other arguments,,0,0,30180,Rhode Island,"Providence-New Bedford-Fall River, RI-MA"


In [None]:
# Save the cleaned frame to a new CSV file.
# index=True is the pandas default, spelled out here: the row index is written as an
# unnamed first column of the file.
# NOTE(review): in document order this cell comes before the StateAbbr column is
# added further down, so the written file would not contain it - confirm intended.
data.to_csv('clean_murder_data.csv', index=True)

## Add State Abbreviations column

In [12]:
# Load the state-name lookup table; state_to_stateAbbr() indexes it by its 'State' column.
stateAbbr = pd.read_csv(filepath_or_buffer="statesAbbr.csv")

In [None]:
# Creates a function whose output is a state's abbreviation, given the state's name.

# Lookup built from the stateAbbr table, remembered together with the table it was
# built from so it is only rebuilt when stateAbbr is rebound to a new frame.
_state_abbr_cache = {}


def state_to_stateAbbr(state):
    """Return the abbreviation for a state name using the ``stateAbbr`` table.

    The table's 'State' column supplies the keys and the first remaining column
    supplies the values (presumably the abbreviation; confirm against
    statesAbbr.csv). Leading and trailing whitespace in ``state`` is ignored.

    Parameters
    ----------
    state : str
        State name as it appears in the 'State' column of ``stateAbbr``.

    Returns
    -------
    The value stored for that state in the lookup table.

    Raises
    ------
    ValueError
        If ``state`` is not a string (for example NaN) or has no entry in the table.
    """
    # Building the dict on every call made .map() over the whole data set rebuild
    # it once per row; build it once per loaded table and reuse it instead.
    if _state_abbr_cache.get('source') is not stateAbbr:
        _state_abbr_cache['lookup'] = stateAbbr.set_index('State').iloc[:, 0].to_dict()
        _state_abbr_cache['source'] = stateAbbr
    lookup = _state_abbr_cache['lookup']

    # Missing values reach .map() as float NaN, which has no .strip().
    if not isinstance(state, str):
        raise ValueError('State name must be a string, got {!r}'.format(state))

    name = state.strip()
    if name not in lookup:
        raise ValueError('No abbreviation found for state {!r}'.format(name))
    return lookup[name]

In [10]:
# Add a 'StateAbbr' column by passing each value of 'State' through state_to_stateAbbr.
data['StateAbbr'] = data.loc[:, 'State'].map(state_to_stateAbbr)