# Acquire 2019 FBI data from the web for each state

In [10]:
import requests as rq
import requests_cache
import lxml.html as lx
from bs4 import BeautifulSoup
import pandas as pd
import re
from io import StringIO

In [11]:
# State slugs exactly as they appear in the FBI Table 8 page URLs. Note the
# mixed conventions: some multi-word names use underscores, while the "new*"
# states are run together.
states = [
    "alabama", "alaska", "arizona", "arkansas",
    "california", "colorado", "connecticut",
    "delaware", "district_of_columbia",
    "florida",
    "georgia",
    "hawaii",
    "idaho", "illinois", "indiana", "iowa",
    "kansas", "kentucky",
    "louisiana",
    "maine", "maryland", "massachusetts", "michigan", "minnesota",
    "mississippi", "missouri", "montana",
    "nebraska", "nevada", "newhampshire", "newjersey", "newmexico",
    "newyork", "north_carolina", "north_dakota",
    "ohio", "oklahoma", "oregon",
    "pennsylvania",
    "rhode_island",
    "south_carolina", "south_dakota",
    "tennessee", "texas",
    "utah",
    "vermont", "virginia",
    "washington", "west_virginia", "wisconsin", "wyoming",
]

# Shared HTTP session backed by a local cache named "crime_data"; get_data()
# sends all of its requests through it.
session = requests_cache.CachedSession(cache_name="crime_data")

In [29]:
def get_data(state_list) -> pd.DataFrame:
    """Download and combine the FBI 2019 Table 8 crime figures per state.

    For every slug in ``state_list`` the page ``<base url>/<slug>.xls`` is
    fetched through the module-level ``session``, its ``<table class="data">``
    element is parsed into a DataFrame, the columns are renamed to the fixed
    twelve-name list below, and a title-cased ``state`` column is added.

    Args:
        state_list: Iterable of state slugs as used in the FBI URLs, e.g.
            ``"alabama"``, ``"north_carolina"`` or ``"newyork"``.

    Returns:
        One DataFrame holding the rows of all requested states, with the
        twelve crime columns plus ``state`` and ``year`` (always 2019) and a
        fresh 0..n-1 index. An empty ``state_list`` yields an empty frame
        whose only column is ``year``.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        ValueError: If a page contains no ``<table class="data">`` element,
            or the parsed table does not have exactly twelve columns.
    """
    url = "https://ucr.fbi.gov/crime-in-the-u.s/2019/crime-in-the-u.s.-2019/tables/table-8/table-8-state-cuts/"
    names_lst = ['city', 'population', 'violent crime', 'murder and nonnegligent manslaughter', 'rape', 'robbery', 'aggravated assault',
                'property crime', 'burglary', 'larceny-theft', 'motor vehicle theft', 'arson']
    # Slugs written as one word; a space must be inserted after "new".
    joined_names = {"newhampshire", "newjersey", "newmexico", "newyork"}

    frames = []
    for state in state_list:
        response = session.get(url + state + ".xls")
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find("table", {"class": "data"})
        if table is None:
            raise ValueError(f"No data table found on the page for state {state!r}")

        data_frame = pd.read_html(StringIO(str(table)))[0]
        data_frame.columns = names_lst

        if state in joined_names:
            state_name = state[:3] + " " + state[3:]
        else:
            state_name = state.replace("_", " ")
        data_frame["state"] = state_name.title()
        frames.append(data_frame)

    # Concatenate once at the end: growing a DataFrame inside the loop copies
    # all accumulated rows on every iteration.
    if frames:
        all_data = pd.concat(frames, ignore_index=True)
    else:
        all_data = pd.DataFrame()
    all_data["year"] = 2019
    return all_data

In [30]:
# Fetch (or load from the request cache) the table for every listed state.
all_states = get_data(state_list=states)

In [34]:
# Rank every row by population, largest first.
all_states = all_states.sort_values("population", ascending=False)
# Keep the 30 most populous rows. Take an explicit copy: top_30 is modified
# afterwards, and assigning into a slice of all_states triggers pandas'
# SettingWithCopyWarning.
top_30 = all_states.head(30).copy()

In [40]:
# Strip digits, commas and square brackets from both ends of each city name
# (presumably footnote markers left over from the source table). ``assign``
# returns a new frame, so nothing is written into a slice of all_states.
top_30 = top_30.assign(city=top_30["city"].str.strip("[0123456789,]"))
# NOTE(review): this relies on the population sort placing the Las Vegas row at
# position 5; re-check if the input data or the sort ever changes.
top_30.iloc[5, top_30.columns.get_loc("city")] = "Las Vegas"
# Shorten agency-style names to the plain city name.
top_30 = top_30.replace(
    "Metropolitan Nashville Police Department", "Nashville"
).replace("Louisville Metro", "Louisville")
# Renumber rows 0..29, discarding the index carried over from the sort.
top_30 = top_30.reset_index(drop=True)
top_30

Unnamed: 0,city,population,violent crime,murder and nonnegligent manslaughter,rape,robbery,aggravated assault,property crime,burglary,larceny-theft,motor vehicle theft,arson,state,year
0,New York,8379043.0,47821.0,319.0,2770.0,13396.0,31336.0,122299.0,9846.0,106931.0,5522.0,,New York,2019
1,Los Angeles,4015546.0,29400.0,258.0,2274.0,9652.0,17216.0,95704.0,13809.0,66253.0,15642.0,1672.0,California,2019
2,Chicago,2707064.0,25532.0,492.0,1761.0,7983.0,15296.0,80742.0,9578.0,62083.0,9081.0,416.0,Illinois,2019
3,Houston,2355606.0,25257.0,275.0,1249.0,9147.0,14586.0,101750.0,17038.0,71614.0,13098.0,485.0,Texas,2019
4,Phoenix,1688722.0,11803.0,131.0,1139.0,3197.0,7336.0,55974.0,9471.0,39427.0,7076.0,201.0,Arizona,2019
5,Las Vegas,1666803.0,8854.0,84.0,1439.0,2118.0,5213.0,46197.0,10646.0,28240.0,7311.0,125.0,Nevada,2019
6,San Antonio,1559166.0,11046.0,105.0,1630.0,1965.0,7346.0,67422.0,8172.0,51469.0,7781.0,181.0,Texas,2019
7,San Diego,1441737.0,5215.0,50.0,561.0,1346.0,3258.0,27141.0,3543.0,18426.0,5172.0,122.0,California,2019
8,Dallas,1363295.0,11764.0,198.0,797.0,4400.0,6369.0,45279.0,9210.0,25812.0,10257.0,144.0,Texas,2019
9,San Jose,1040008.0,4559.0,32.0,671.0,1339.0,2517.0,25164.0,4114.0,14924.0,6126.0,135.0,California,2019


In [45]:
# Save the cleaned table. The DataFrame index is written too (pandas default),
# so the CSV starts with an unnamed index column.
top_30.to_csv("../clean_data/2019_crime_data.csv")