# Austin, TX Crime Analysis

In [8]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from census import Census

from usefuls import atx_zip_codes, census_key, zipcode_tabulation_area, similar_offense_type

## Get Property Data from Zillow
* The data file is too large to upload to GitHub, but it can be downloaded from [Kaggle](https://www.kaggle.com/zillow/zecon#Zip_time_series.csv).

In [4]:
# Load the Zillow zip-code time series and reduce it to the mean ZHVI
# (all homes) per year and zip code for Austin-area zip codes, 2011-2016.
zillow_df = (
    pd.read_csv('Zip_time_series.csv')
    # Use the same 'Zip Code' column name as the crime and census frames
    .rename(columns={'RegionName': 'Zip Code'})
    # Keep only Austin-area zip codes (vectorized membership test)
    .loc[lambda df: df['Zip Code'].isin(atx_zip_codes)]
    # The year is the first four characters of the Date string
    .assign(year=lambda df: df['Date'].str[:4].astype('int64'))
    # Keep 2011-2016 (both ends inclusive) and only the columns we need
    .loc[lambda df: df['year'].between(2011, 2016),
         ['year', 'Zip Code', 'ZHVI_AllHomes']]
    # Drop rows with a missing ZHVI value
    .dropna()
    # Mean ZHVI per year per zip code, rounded to cents
    .groupby(['year', 'Zip Code'])['ZHVI_AllHomes']
    .mean()
    .round(2)
    .to_frame()
)

# Preview the frame
zillow_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ZHVI_AllHomes
year,Zip Code,Unnamed: 2_level_1
2011,78610,167591.67
2011,78613,182450.0
2011,78617,104841.67
2011,78641,143600.0
2011,78664,130350.0


## Get Austin Police Department crime data
* The CSV file used can be downloaded from the [City of Austin open data portal](https://data.austintexas.gov/Public-Safety/Crime-Reports/fdj4-gpfu)

In [None]:
# Load the Austin Police Department crime reports
crime_df = pd.read_csv('Crime_Reports.csv')

# Keep only the rows whose zip code belongs to the greater Austin area
in_austin_area = crime_df['Zip Code'].isin(atx_zip_codes)
crime_df = crime_df[in_austin_area]

In [7]:
# Reduce the crime reports to zip code, year and offense type for 2011-2016.
# NOTE: this cell consumes the 'Occurred Date' column, so it must be run on
# the freshly loaded crime_df (re-run the loading cell before re-running this one).

# The year is the last four characters of the 'Occurred Date' string
crime_df = crime_df.assign(year=crime_df['Occurred Date'].str[-4:].astype('int64'))

# Look at only years 2011-2016 (both ends inclusive) and drop unnecessary columns.
# The explicit copy lets the assignments below modify an independent frame
# instead of a slice of the original (avoids SettingWithCopyWarning).
in_study_window = crime_df['year'].between(2011, 2016)
crime_df = crime_df.loc[
    in_study_window, ['Zip Code', 'year', 'Highest Offense Description']
].copy()

# Convert zip codes to integer type
crime_df['Zip Code'] = crime_df['Zip Code'].astype('int64')

# Combine similar offenses using the similar_offense_type mapping
crime_df['Highest Offense Description'] = (
    crime_df['Highest Offense Description'].replace(similar_offense_type)
)

# Preview
crime_df.head()

Unnamed: 0,Zip Code,year,Highest Offense Description
1,78701,2014,Burglary
2,78702,2015,Other
5,78759,2015,Motor Vehicle Theft
6,78741,2013,Burglary
8,78719,2016,Other


## Get census data

In [9]:
# Download total population (ACS 5-year variable B01003_001E) for each year
# and combine the results into one dataframe.
# We use what years are available: 2011 - 2016
# 2010 gives geography error; zipcode geography was not supported in that year
years = list(range(2011, 2017))

# Census() client per year
cen = {y: Census(census_key, year=y) for y in years}

# cen.acs5.get() results per year (one list of row dicts per year)
census_data = {
    y: cen[y].acs5.get("B01003_001E", {'for': zipcode_tabulation_area})
    for y in years
}

# Add year to each row-to-be
for y, rows in census_data.items():
    for row in rows:
        row['year'] = y

# Convert census data to one dataframe.
# DataFrame.append was removed in pandas 2.0; build all per-year frames and
# concatenate them once (each year's rows keep their own 0-based index).
census_df = pd.concat([pd.DataFrame(census_data[y]) for y in years])

census_df = census_df.rename(
    columns={"B01003_001E": "Population", "zip code tabulation area": "Zip Code"}
)

# Convert population and Zip Code to integers
census_df['Zip Code'] = census_df['Zip Code'].astype('int64')
census_df['Population'] = census_df['Population'].astype('int64')

# Preview
census_df.head()

Unnamed: 0,Population,year,Zip Code
0,22052,2011,78610
1,63901,2011,78613
2,18572,2011,78617
3,43655,2011,78641
4,4256,2011,78652
