### Crime data is collected into two separate csv files. The first contains
### 40 years of data by state, and 10 years (in 10 xls files) by city


In [None]:
import numpy as np
import pandas as pd

# Crime data is collected into two separate csv files. The first contains
# 40 years of data by state, and 10 years (in 10 xls files) by city

# data in this csv contains estimates in instances of no reporting
df = pd.read_csv(
    "http://s3-us-gov-west-1.amazonaws.com/cg-d4b776d0-d898-4153-90c8-8336f86bdfec/estimated_crimes_1979_2018.csv")

# replace null values with 'US'
df['state_abbr'] = df['state_abbr'].replace(np.nan, 'US')

# add violent crime rate (vcr) and property crime rate (pcr) to dataframe
df['vcr'] = df['violent_crime'] / df['population']
df['pcr'] = df['property_crime'] / df['population']

# initialize a new dataframe for exporting

sand = pd.DataFrame(index=None)
sand['state'] = df['state_abbr']
sand['year'] = df['year']
sand['vcr'] = df['vcr']
sand['pcr'] = df['pcr']

# export to csv
sand.to_csv('./crime_data/state_crime.csv', index=False)

# read in xls files, skipping the headers and footers
xl2018 = pd.read_excel(
    './crime_data/Table_8_Offenses_Known_to_Law_Enforcement_by_State_by_City_2018.xls',
    skiprows=3,
    skipfooter=10)
xl2017 = pd.read_excel(
    './crime_data/Table_8_Offenses_Known_to_Law_Enforcement_by_State_by_City_2017.xls',
    skiprows=3,
    skipfooter=10)
xl2016 = pd.read_excel(
    './crime_data/Table_6_Offenses_Known_to_Law_Enforcement_by_State_by_City_2016.xls',
    skiprows=3,
    skipfooter=11)
xl2015 = pd.read_excel(
    './crime_data/Table_8_Offenses_Known_to_Law_Enforcement_by_State_by_City_2015.xls',
    skiprows=3,
    skipfooter=10)
xl2014 = pd.read_excel('./crime_data/table-8.xls', skiprows=3, skipfooter=17)
xl2013 = pd.read_excel(
    './crime_data/Table_8_Offenses_Known_to_Law_Enforcement_by_State_by_City_2013.xls',
    skiprows=3,
    skipfooter=10)
xl2012 = pd.read_excel(
    './crime_data/Table_8_Offenses_Known_to_Law_Enforcement_by_State_by_City_2012.xls',
    skiprows=3,
    skipfooter=7)
xl2011 = pd.read_excel(
    './crime_data/table_8_offenses_known_to_law_enforcement_by_state_by_city_2011.xls',
    skiprows=3,
    skipfooter=7)
xl2010 = pd.read_excel('./crime_data/10tbl08.xls', skiprows=3, skipfooter=7)
xl2009 = pd.read_excel('./crime_data/09tbl08.xls', skiprows=3, skipfooter=7)

# build a function to automatically clean the results and add to a new DF for
# import to database


def cleaner(x, year):
    """
    Takes a dataframe, changes state abbreviations, changes state NaNs,
    calculates violent crime and property crime rate and returns it as
    a new DataFrame (city_st, vcr, pcr) for the year passed in
     """
    # create new dataframe
    df = pd.DataFrame(columns=['city', 'vcr_' + year, 'pcr_' + year])

    # clean numbers from state column and put into new df
    df['city'] = x['State'].str.replace(r'\d+', '')
    # clean numbers from city column
    x['City'] = x['City'].str.replace(r'\d+', '')
    # clean column names
    if 'Violent\ncrime' in x.columns:
        x = x.rename(columns={'Violent\ncrime': 'Violent crime',
                              'Property\ncrime': 'Property crime'})

    # remove null values from column
    if x['City'].isnull().sum() >= 1:
        x['City'] = x['City'].replace(np.nan, 'None')

    # replace states with abbreviations
    df['city'] = df['city'].replace({"ALABAMA": "AL", "ALASKA": "AK", "ARIZONA": "AZ",
                                     "ARKANSAS": "AK", "CALIFORNIA": "CA",
                                     "COLORADO": "CO", "CONNECTICUT": "CT",
                                     "DELAWARE": "DE", "DISTRICT OF COLUMBIA": "DC",
                                     "FLORIDA": "FL", "GEORGIA": "GA", "HAWAII": "HI",
                                     "IDAHO": "ID", "ILLINOIS": "IL", "INDIANA": "IN",
                                     "IOWA": "IA", "KANSAS": "KS", "KENTUCKY": "KY",
                                     "LOUISIANA": "LA", "MAINE": "ME", "MARYLAND": "MD",
                                     "MASSACHUSETTS": "MA", "MICHIGAN": "MI",
                                     "MINNESOTA": "MN", "MISSISSIPPI": "MS",
                                     "MISSOURI": "MI", "MONTANA": "MT", "NEBRASKA": "NE",
                                     "NEVADA": "NV", "NEW HAMPSHIRE": "NH",
                                     "NEW JERSEY": "NJ", "NEW MEXICO": "NM",
                                     "NEW YORK": "NY", "NORTH CAROLINA": "NC",
                                     "NORTH DAKOTA": "ND", "OHIO": "OH",
                                     "OKLAHOMA": "OK", "OREGON": "OR",
                                     "PENNSYLVANIA": "PA", "RHODE ISLAND": "RI",
                                     "SOUTH CAROLINA": "SC", "SOUTH DAKOTA": "SD",
                                     "TENNESSEE": "TN", "TEXAS": "TX", "UTAH": "UT",
                                     "VERMONT": "VT", "VIRGINIA": "VA",
                                     "WASHINGTON": "WA", "WEST VIRGINIA": "WV",
                                     "WISCONSIN": "WI", "WYOMING": "WY"})
    # iterate through dataframe, replacing nan values with proper state abbr.
    state = ""
    for i in range(len(df)):
        if pd.notnull(df.at[i, 'city']):
            if df.at[i, 'city'] != state:
                state = df.at[i, 'city']
        elif pd.isnull(df.at[i, 'city']):
            df.at[i, 'city'] = state

    # populate city column 'city, ST'
    for i in range(len(df['city'])):
        df['city'][i] = x['City'][i] + ", " + df['city'][i]

        # populate violent crime rate column
        df['vcr_' + year][i] = x['Violent crime'][i] / x['Population'][i]

        # populate property crime rate column
        df['pcr_' + year][i] = x['Property crime'][i] / x['Population'][i]

    # set the index for later concatenation
    df.set_index('city')
    return df


cl18 = cleaner(xl2018, '2018')
cl17 = cleaner(xl2017, '2017')
cl16 = cleaner(xl2016, '2016')
cl15 = cleaner(xl2015, '2015')
cl14 = cleaner(xl2014, '2014')
cl13 = cleaner(xl2013, '2013')
cl12 = cleaner(xl2012, '2012')
cl11 = cleaner(xl2011, '2011')
cl10 = cleaner(xl2010, '2010')
cl09 = cleaner(xl2009, '2009')

# merge the dataframes
masta = pd.merge(cl18, cl17, how='outer', on='city')
masta2 = pd.merge(cl16, cl15, how='outer', on='city')
masta3 = pd.merge(cl14, cl13, how='outer', on='city')
masta4 = pd.merge(cl12, cl11, how='outer', on='city')
masta5 = pd.merge(cl10, cl09, how='outer', on='city')
master = pd.merge(masta, masta2, how='outer', on='city')
master = pd.merge(master, masta3, how='outer', on='city')
master = pd.merge(master, masta4, how='outer', on='city')
master = pd.merge(master, masta5, how='outer', on='city')

# export data
master.to_csv('./crime_data/crime.csv', index=False)

In [40]:
# data in this csv contains estimates in instances of no reporting
import numpy as np
import pandas as pd

In [41]:
import pandas as pd
crime = pd.read_csv('crime.csv')
print(crime.shape)
crime.head()


(24693, 21)


Unnamed: 0,city,vcr_2018,pcr_2018,vcr_2017,pcr_2017,vcr_2016,pcr_2016,vcr_2015,pcr_2015,vcr_2014,...,vcr_2013,pcr_2013,vcr_2012,pcr_2012,vcr_2011,pcr_2011,vcr_2010,pcr_2010,vcr_2009,pcr_2009
0,"Abbeville, AL",0.007056,0.019208,0.004633,0.025483,0.004218,0.019555,0.003448,0.029119,0.003027,...,0.004159,0.023819,0.008103,0.020994,0.004813,0.024065,0.007119,0.022712,0.00307,0.018076
1,"Adamsville, AL",0.004395,0.066852,0.005768,0.060222,0.004341,0.057117,0.005664,0.074762,0.006772,...,0.00424,0.071636,0.006163,0.064275,0.008143,0.066021,0.002085,0.061301,0.005315,0.072704
2,"Alabaster, AL",0.002746,0.017283,0.002225,0.015848,0.002936,0.01477,0.004148,0.016781,0.001837,...,0.001412,0.020533,0.001585,0.022087,0.001935,0.02151,0.001072,0.016731,0.001367,0.021165
3,"Albertville, AL",0.00112,0.037428,0.001019,0.030017,0.001347,0.033496,0.001393,0.033575,0.001062,...,,,,,0.002022,0.043129,0.003934,0.054012,0.004084,0.04622
4,"Alexander City, AL",0.021584,0.04193,0.017295,0.037371,0.01885,0.044709,0.014878,0.043894,0.005441,...,0.0081,0.04499,0.00636,0.046063,0.009233,0.051382,0.003874,0.042419,0.008767,0.055987


In [42]:
crime.isnull().sum()

city           0
vcr_2018    2635
pcr_2018    2485
vcr_2017    2885
pcr_2017    2137
vcr_2016    2125
pcr_2016    2140
vcr_2015    2303
pcr_2015    2310
vcr_2014    2398
pcr_2014    2413
vcr_2013    2424
pcr_2013    2424
vcr_2012    2440
pcr_2012    2235
vcr_2011    2592
pcr_2011    2388
vcr_2010    2618
pcr_2010    2419
vcr_2009    2892
pcr_2009    2584
dtype: int64

In [43]:
crime['city'].nunique()

11673

In [44]:
crime = crime.dropna()
crime.isna().sum()

city        0
vcr_2018    0
pcr_2018    0
vcr_2017    0
pcr_2017    0
vcr_2016    0
pcr_2016    0
vcr_2015    0
pcr_2015    0
vcr_2014    0
pcr_2014    0
vcr_2013    0
pcr_2013    0
vcr_2012    0
pcr_2012    0
vcr_2011    0
pcr_2011    0
vcr_2010    0
pcr_2010    0
vcr_2009    0
pcr_2009    0
dtype: int64

In [45]:
crime.shape

(17493, 21)

In [46]:
crime= crime.drop_duplicates(subset=['city'])
crime.reset_index(drop=True)
print(crime.shape)
crime.head()

(5031, 21)


Unnamed: 0,city,vcr_2018,pcr_2018,vcr_2017,pcr_2017,vcr_2016,pcr_2016,vcr_2015,pcr_2015,vcr_2014,...,vcr_2013,pcr_2013,vcr_2012,pcr_2012,vcr_2011,pcr_2011,vcr_2010,pcr_2010,vcr_2009,pcr_2009
0,"Abbeville, AL",0.007056,0.019208,0.004633,0.025483,0.004218,0.019555,0.003448,0.029119,0.003027,...,0.004159,0.023819,0.008103,0.020994,0.004813,0.024065,0.007119,0.022712,0.00307,0.018076
1,"Adamsville, AL",0.004395,0.066852,0.005768,0.060222,0.004341,0.057117,0.005664,0.074762,0.006772,...,0.00424,0.071636,0.006163,0.064275,0.008143,0.066021,0.002085,0.061301,0.005315,0.072704
2,"Alabaster, AL",0.002746,0.017283,0.002225,0.015848,0.002936,0.01477,0.004148,0.016781,0.001837,...,0.001412,0.020533,0.001585,0.022087,0.001935,0.02151,0.001072,0.016731,0.001367,0.021165
4,"Alexander City, AL",0.021584,0.04193,0.017295,0.037371,0.01885,0.044709,0.014878,0.043894,0.005441,...,0.0081,0.04499,0.00636,0.046063,0.009233,0.051382,0.003874,0.042419,0.008767,0.055987
9,"Ashford, AL",0.001865,0.014918,0.003254,0.006974,0.001388,0.014808,0.002304,0.029032,0.002311,...,0.000919,0.024805,0.000919,0.03125,0.004171,0.037998,0.001398,0.041007,0.002396,0.02252


In [48]:
# Also want to remove the state from the city names
cities = []
states = []
for x in crime["city"]:
    State = x[-2:]
    city = x[:-2]
    city = city.replace(",","")
    city = city.strip()
    
    cities.append(city)
    states.append(State)
crime = crime.copy()
crime["city"] = cities
crime['State'] = states

print(crime.shape)
crime.head()

(5031, 22)


Unnamed: 0,city,vcr_2018,pcr_2018,vcr_2017,pcr_2017,vcr_2016,pcr_2016,vcr_2015,pcr_2015,vcr_2014,...,pcr_2013,vcr_2012,pcr_2012,vcr_2011,pcr_2011,vcr_2010,pcr_2010,vcr_2009,pcr_2009,State
0,Abbeville,0.007056,0.019208,0.004633,0.025483,0.004218,0.019555,0.003448,0.029119,0.003027,...,0.023819,0.008103,0.020994,0.004813,0.024065,0.007119,0.022712,0.00307,0.018076,AL
1,Adamsville,0.004395,0.066852,0.005768,0.060222,0.004341,0.057117,0.005664,0.074762,0.006772,...,0.071636,0.006163,0.064275,0.008143,0.066021,0.002085,0.061301,0.005315,0.072704,AL
2,Alabaster,0.002746,0.017283,0.002225,0.015848,0.002936,0.01477,0.004148,0.016781,0.001837,...,0.020533,0.001585,0.022087,0.001935,0.02151,0.001072,0.016731,0.001367,0.021165,AL
4,Alexander City,0.021584,0.04193,0.017295,0.037371,0.01885,0.044709,0.014878,0.043894,0.005441,...,0.04499,0.00636,0.046063,0.009233,0.051382,0.003874,0.042419,0.008767,0.055987,AL
9,Ashford,0.001865,0.014918,0.003254,0.006974,0.001388,0.014808,0.002304,0.029032,0.002311,...,0.024805,0.000919,0.03125,0.004171,0.037998,0.001398,0.041007,0.002396,0.02252,AL


In [50]:
crime.to_csv('crime_clean.csv')