# Austin, TX Crime Analysis

In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from census import Census
import gmaps
import requests
import json

%matplotlib inline

# usefuls.py
from usefuls import atx_zip_codes, zipcode_tabulation_area, similar_offense_type

# config.py not included on GitHub for obvious reasons
from config import gkey, census_key

# Register the Google API key (imported from config.py above) with gmaps
gmaps.configure(api_key=gkey)

## Get and clean data

### Get Property Data from Zillow
* The data file is too large to upload to GitHub, but it can be downloaded from [here](https://www.kaggle.com/zillow/zecon#Zip_time_series.csv)

In [None]:
# Read the raw Zillow time-series file
zillow_df = pd.read_csv('Zip_time_series.csv')

# Rename RegionName to 'Zip Code' so the column matches the crime and census frames
zillow_df = zillow_df.rename(columns={'RegionName': 'Zip Code'})

# Keep only Austin-area zip codes.
# .copy() makes the filtered result an independent frame, so adding the
# 'year' column below does not raise SettingWithCopyWarning.
zillow_df = zillow_df[zillow_df['Zip Code'].isin(atx_zip_codes)].copy()

# Add a year column: the first four characters of 'Date' are the year
zillow_df['year'] = zillow_df['Date'].str[:4].astype(int)

# Keep only 2011-2016, the same range used for the crime and census data
zillow_df = zillow_df[zillow_df['year'].between(2011, 2016)]

# Keep only the columns we need
zillow_df = zillow_df[['year', 'Zip Code', 'ZHVI_AllHomes']]

# Drop rows with missing values
zillow_df = zillow_df.dropna()

# Mean ZHVI per year per zip code, rounded to 2 decimals.
# The result is indexed by (year, Zip Code).
zillow_df = (
    zillow_df
    .groupby(['year', 'Zip Code'])['ZHVI_AllHomes']
    .mean()
    .round(2)
    .to_frame()
)

# Write to csv (the index is written too, so year and Zip Code become columns)
zillow_df.to_csv('zillow_data.csv')

# Preview the frame
zillow_df.head()

### Get Austin Police Department crime data
* The csv file used is obtainable from [here](https://data.austintexas.gov/Public-Safety/Crime-Reports/fdj4-gpfu)

In [None]:
# Read the raw Austin Police Department crime reports
crime_df = pd.read_csv('Crime_Reports.csv')

# Keep only greater-Austin zip codes.
# .copy() makes the filtered result an independent frame, so the column
# assignment below does not raise SettingWithCopyWarning.
crime_df = crime_df[crime_df['Zip Code'].isin(atx_zip_codes)].copy()

# Extract the year: the last four characters of 'Occurred Date'
crime_df['year'] = crime_df['Occurred Date'].str[-4:].astype(int)

# Keep only years 2011-2016 and the three columns the analysis needs
# (copied again because two columns are reassigned below)
crime_df = crime_df.loc[
    crime_df['year'].between(2011, 2016),
    ['Zip Code', 'year', 'Highest Offense Description'],
].copy()

# Convert zip codes to integer type
crime_df['Zip Code'] = crime_df['Zip Code'].astype(int)

# Combine similar offenses using the mapping from usefuls.py
crime_df['Highest Offense Description'] = (
    crime_df['Highest Offense Description'].replace(similar_offense_type)
)

# Write to csv
crime_df.to_csv('crime_data.csv', index=False)

# Preview
crime_df.head()

### Get census data

In [None]:
# Years to request: 2011 - 2016
# 2010 gives geography error; zipcode geography was not supported in that year
years = list(range(2011, 2017))

# Census variable to request; it is renamed to "Population" below
population_field = "B01003_001E"

# Census() clients keyed by year
cen = {y: Census(census_key, year=y) for y in years}

# acs5.get() results keyed by year (each value is a list of row dicts)
census_data = {
    y: cen[y].acs5.get(population_field, {'for': zipcode_tabulation_area})
    for y in years
}

# Add the year to each row-to-be
for y, rows in census_data.items():
    for row in rows:
        row['year'] = y

# Build one frame per year and concatenate them in a single call.
# DataFrame.append in a loop copies the frame on every iteration and was
# removed in pandas 2.0.
census_df = pd.concat([pd.DataFrame(census_data[y]) for y in years])

census_df = census_df.rename(columns={"B01003_001E": "Population", "zip code tabulation area": "Zip Code"})

# Convert population and Zip Code to integers
census_df['Zip Code'] = census_df['Zip Code'].astype(int)
census_df['Population'] = census_df['Population'].astype(int)

# Write to csv
census_df.to_csv('census_data.csv', index=False)

# Preview
census_df.head()

## Read above .csv files
* This is for development purposes so we don't have to reload and clean raw data files every time

In [2]:
# Reload the three cleaned data sets from their csv files
zillow_df, crime_df, census_df = (
    pd.read_csv(csv_name)
    for csv_name in ('zillow_data.csv', 'crime_data.csv', 'census_data.csv')
)

In [3]:
# Preview the reloaded Zillow data
zillow_df.head()

Unnamed: 0,year,Zip Code,ZHVI_AllHomes
0,2011,78610,167591.67
1,2011,78613,182450.0
2,2011,78617,104841.67
3,2011,78641,143600.0
4,2011,78664,130350.0


In [4]:
# Preview the reloaded crime data
crime_df.head()

Unnamed: 0,Zip Code,year,Highest Offense Description
0,78701,2014,Burglary
1,78702,2015,Other
2,78759,2015,Motor Vehicle Theft
3,78741,2013,Burglary
4,78719,2016,Other


In [5]:
# Preview the reloaded census data
census_df.head()

Unnamed: 0,Population,year,Zip Code
0,22052,2011,78610
1,63901,2011,78613
2,18572,2011,78617
3,43655,2011,78641
4,4256,2011,78652


# Heat map of crime rates per zip code

## Step 1: Calculate crime rates
* The crime rate is the number of reported crimes per 100,000 residents of a zip code

In [6]:
# Count the reports filed per (year, zip code)
report_counts = (
    crime_df
    .groupby(['year', 'Zip Code'])['Highest Offense Description']
    .count()
    .to_frame()
    .rename(columns={'Highest Offense Description': 'Reports'})
    .reset_index()
)

# Attach the population for the matching year and zip code
rates_df = report_counts.merge(census_df, on=['year', 'Zip Code'])

# Reports per 100,000 residents, rounded to one decimal place
reports_per_resident = rates_df['Reports'] / rates_df['Population']
rates_df['Crime Rate'] = (100_000 * reports_per_resident).round(1)

rates_df.head()

Unnamed: 0,year,Zip Code,Reports,Population,Crime Rate
0,2011,78610,7,22052,31.7
1,2011,78613,449,63901,702.6
2,2011,78617,1094,18572,5890.6
3,2011,78652,33,4256,775.4
4,2011,78653,97,14631,663.0


## Step 2: Get latitude and longitude for each zip code

In [7]:
# Google Geocoding endpoint; the address and key are sent as query
# parameters so that requests handles the URL encoding
geocode_url = 'https://maps.googleapis.com/maps/api/geocode/json'

# Set up dictionaries keyed by Austin zip codes
# For latitude and longitude, respectively
lat = dict.fromkeys(atx_zip_codes)
lng = dict.fromkeys(atx_zip_codes)

# Make one API call per zip code to find its latitude and longitude
for zipcode in atx_zip_codes:
    reply = requests.get(
        geocode_url,
        params={'address': zipcode, 'key': gkey},
        timeout=10,  # do not let one stalled request hang the whole cell
    )
    # Surface HTTP-level failures immediately
    reply.raise_for_status()
    payload = reply.json()

    # Fail with a message naming the zip code and the API status instead of
    # a bare IndexError when no match is returned
    if not payload.get('results'):
        raise RuntimeError(
            'Geocoding failed for zip code {0}: status={1}'.format(
                zipcode, payload.get('status')
            )
        )

    # Extract lat/lng of the first match
    location = payload['results'][0]['geometry']['location']
    lat[zipcode] = location['lat']
    lng[zipcode] = location['lng']

# Add columns to dataframe for latitude and longitude
rates_df['Lat'] = rates_df['Zip Code'].apply(lambda z: lat[z])
rates_df['Lng'] = rates_df['Zip Code'].apply(lambda z: lng[z])

In [8]:
# Show only the first rows rather than dumping the whole frame
rates_df.head(10)

Unnamed: 0,year,Zip Code,Reports,Population,Crime Rate,Lat,Lng
0,2011,78610,7,22052,31.7,30.081735,-97.842955
1,2011,78613,449,63901,702.6,30.511942,-97.817760
2,2011,78617,1094,18572,5890.6,30.147178,-97.589737
3,2011,78652,33,4256,775.4,30.135941,-97.877602
4,2011,78653,97,14631,663.0,30.333223,-97.546388
5,2011,78660,303,67630,448.0,30.433206,-97.600579
6,2011,78664,10,51942,19.3,30.504138,-97.660236
7,2011,78681,4,51409,7.8,30.518423,-97.709085
8,2011,78701,10312,5084,202832.4,30.272921,-97.744386
9,2011,78702,7868,20643,38114.6,30.260354,-97.714515


## Step 3: Set up and display heat map
* Note to self: this does not work yet — the cell runs, but the output is only the figure's text representation instead of a rendered map. Shelving for now.

In [9]:
# Heat-layer settings, chosen to help with the heatmap dissipating on zoom
heat_max_intensity = 150000
heat_point_radius = 30

# One weighted point per row of rates_df: position = (Lat, Lng), weight = Crime Rate
heat_locations = rates_df[["Lat", "Lng"]].astype(float)
heat_weights = rates_df['Crime Rate'].astype(float)

# Create the heat layer with all settings passed up front
crime_zipcode = gmaps.heatmap_layer(
    heat_locations,
    weights=heat_weights,
    dissipating=True,
    max_intensity=heat_max_intensity,
    point_radius=heat_point_radius,
)

# Create the figure and attach the layer
fig = gmaps.figure()
fig.add_layer(crime_zipcode)

fig

# TODO: code not complete. The figure needs to display before any further
# work on the heat map can proceed.

Figure(layout=FigureLayout(height='420px'))