## Exploratory Data Analysis of Chicago Crime With Bokeh

In [134]:
#import libraries
import os

import geopandas as gpd
import pandas as pd
from bokeh.io import curdoc, show
from bokeh.io import output_notebook, show
from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.models import ColumnDataSource, Plot, LinearAxis, Grid
from bokeh.models import FactorRange
from bokeh.models import FixedTicker
from bokeh.models import Range1d
from bokeh.models import SingleIntervalTicker, LinearAxis
from bokeh.models.glyphs import VBar
from bokeh.plotting import figure
from bokeh.sampledata.autompg import autompg_clean as df
from bokeh.transform import factor_cmap
from shapely.geometry import Point
#route Bokeh output to the notebook so show() renders plots inline
output_notebook()

In [2]:
#set working directory to the Data sub-folder of the current working directory
#(skipped when the kernel is already inside a folder named Data, so re-running
#this cell does not try to enter Data/Data)
if os.path.basename(os.getcwd()) != 'Data':
    os.chdir(os.path.join(os.getcwd(), 'Data'))

In [3]:
#load the raw crime records from the CSV export into a DataFrame
file = 'Crimes_-_2001_to_present.csv'
crime = pd.read_csv(filepath_or_buffer=file)

In [4]:
#load both shapefiles as GeoDataFrames (first -> beat_shape, second -> tract_shape)
beat_shape, tract_shape = (
    gpd.read_file(shapefile)
    for shapefile in (
        'geo_export_2947a4f2-ac66-45f1-902b-a7ffa42f326b.shp',
        'geo_export_ce0fbd70-d876-49ca-9432-ae1454719ab0.shp',
    )
)

In [6]:
#lookup table used to consolidate the raw crime categories
crime_mapper = pd.read_csv(filepath_or_buffer='crime_mapper.csv')

In [7]:
#left-join the mapping onto the crime records: 'Primary Type' is matched against
#'OLD_TYPE', and every crime row is kept even when no mapping row matches
crime = crime.merge(
    right=crime_mapper,
    how='left',
    left_on='Primary Type',
    right_on='OLD_TYPE',
)

## General Descriptive Statistics

### Bar Plot of Crime Counts by Type

In [13]:
#number of records per consolidated crime type, largest first
#(the second reset_index keeps the pre-sort row position in an 'index' column)
crime_counts = (
    crime.groupby(['NEW_TYPE'])['ID']
    .count()
    .to_frame()
    .reset_index()
    .sort_values(by=['ID'], ascending=False)
    .reset_index()
)

#categorical factors must be strings
crime_counts['NEW_TYPE'] = crime_counts['NEW_TYPE'].astype(str)
group = crime_counts.groupby(by=['NEW_TYPE'])

#plain lists for the plot's data source
counts = list(crime_counts['ID'])
crimes = list(crime_counts['NEW_TYPE'])

In [24]:
#generate bokeh plot
#hover text shows the count behind each bar
TOOLTIPS = [("# of Crimes", "@desc")]
source = ColumnDataSource(data={'x': crimes, 'y': counts, 'desc': counts})

p = figure(
    plot_width=900,
    plot_height=600,
    title="Number of Crimes in Chicago by Type, 2001-Present",
    x_range=group,
    toolbar_location=None,
    tooltips=TOOLTIPS,
)

p.vbar(
    x='x',
    top='y',
    width=0.9,
    source=source,
    fill_color="#b3de69",
    line_color="white",
    hover_fill_color='#ffffbf',
    hover_line_color="darkgrey",
)

#x axis cosmetics: no vertical grid lines, rotated category labels
p.xgrid.grid_line_color = None
p.xaxis.axis_label = "Type of Crime Reported"
p.xaxis.major_label_orientation = 1.2

#bars grow from zero
p.y_range.start = 0

#swap in an explicit factor range so the bars follow the row order of crime_counts
p.x_range = FactorRange(factors=crime_counts['NEW_TYPE']) #orders

#output_file("Number of Crimes Bar Chart.html")

show(p)

### % of Offenses by Offense Type

In [27]:
#create percentage df for plot: share of all records that falls in each consolidated type
crime_perc = pd.DataFrame(crime.groupby(['NEW_TYPE'])['ID'].count())
crime_perc.reset_index(inplace=True)
#sort the percentage frame itself rather than crime_counts, which other cells reassign
crime_perc = crime_perc.sort_values(by=['ID'],ascending=False)
crime_perc.reset_index(inplace=True)

#take the denominator from the data instead of a hard-coded row total, which goes
#stale whenever the CSV is refreshed
#NOTE(review): the previous literal 6833941 was presumably the total number of
#records at the time this was written -- confirm
total_crimes = crime['ID'].count()
crime_perc['ID'] = (crime_perc['ID'] / total_crimes) * 100

#categorical factors must be strings
crime_perc['NEW_TYPE'] = crime_perc['NEW_TYPE'].astype(str)
group = crime_perc.groupby(by=['NEW_TYPE'])

crimes = list(crime_perc['NEW_TYPE'])
counts = list(crime_perc['ID'])

In [31]:
#generate bokeh plot
source = ColumnDataSource(data=dict(x=crimes,y=counts,desc=counts,))

#the plotted values are percentages, so label them as such and show two decimals
TOOLTIPS = [
    ("% of Crimes", "@desc{0.00}%"),
]

p = figure(plot_width=900,plot_height=600,title="% of Crimes in Chicago by Type, 2001-Present",
           x_range=group, toolbar_location=None, tooltips=TOOLTIPS)


p.vbar(x='x', top='y', width=0.9,line_color="white",fill_color="#ff9900",
       hover_line_color="darkgrey",source=source,hover_fill_color='#ffffbf')

#x axis cosmetics: no vertical grid lines, rotated category labels
p.xgrid.grid_line_color = None
p.xaxis.axis_label = "Type of Crime Reported"
p.xaxis.major_label_orientation = 1.2

#bars grow from zero
p.y_range.start = 0

#explicit factor range so the bars follow the row order of crime_perc
p.x_range = FactorRange(factors=crime_perc['NEW_TYPE']) #orders

#output_file("Percent of Crimes Bar Chart.html")

show(p)

### Number of Crimes by Year

In [107]:
#number of records per year, in chronological order
#(the second reset_index keeps the pre-sort row position in an 'index' column)
crime_counts = (
    crime.groupby(['Year'])['ID']
    .count()
    .to_frame()
    .reset_index()
    .sort_values(by=['Year'], ascending=True)
    .reset_index()
)

#categorical factors must be strings
crime_counts['Year'] = crime_counts['Year'].astype(str)
group = crime_counts.groupby(by=['Year'])

#plain lists for the plot's data source (note: 'crimes' holds the year labels here)
counts = list(crime_counts['ID'])
crimes = list(crime_counts['Year'])

In [110]:
#generate bokeh plot
source = ColumnDataSource(data=dict(x=crimes,y=counts,desc=counts,))

TOOLTIPS = [
    ("# of Crimes", "@desc"),
    ("Year", "@x"),
]

p = figure(plot_width=900,plot_height=600,title="Number of Crimes in Chicago by Year, 2001-Present",
           x_range=group, toolbar_location=None, tooltips=TOOLTIPS)


p.vbar(x='x', top='y', width=0.9,line_color="white",fill_color="#00802b",
       hover_line_color="darkgrey",source=source,hover_fill_color='#ffffbf')

#x axis cosmetics; the categories on this chart are years, not crime types
p.xgrid.grid_line_color = None
p.xaxis.axis_label = "Year"
p.xaxis.major_label_orientation = 1.2

#bars grow from zero
p.y_range.start = 0

#explicit factor range so the bars follow the chronological row order of crime_counts
p.x_range = FactorRange(factors=crime_counts['Year']) #orders

#output_file("Number of Crimes by Year Bar Chart.html")

show(p)

## Temporal Analysis of Crime

### Total Time Series of Crime

In [140]:
#parse the Date column into timestamps; values that cannot be parsed become NaT
parsed_dates = pd.to_datetime(crime['Date'], errors='coerce')
crime['Date'] = parsed_dates
#calendar date (no time component) of each record
crime['Day'] = parsed_dates.dt.date

In [141]:
#number of records per calendar day
crime_time = crime.groupby(['Day'])['ID'].count().to_frame().reset_index()

#stringify the dates so they can serve as categorical factors
crime_time['Day'] = crime_time['Day'].astype(str)
group = crime_time.groupby(by=['Day'])

#plain lists for the plot's data source
counts = list(crime_time['ID'])
day = list(crime_time['Day'])

In [143]:
#generate bokeh plot
source = ColumnDataSource(data=dict(x=day,y=counts,desc=counts,))

TOOLTIPS = [
    ("# of Crimes", "@desc"),
    ("Date", "@x"),
]

#x_axis_type=None creates the figure without a default x axis (which would carry
#one label per day); a custom axis with one tick per year is attached below
p = figure(plot_width=1000,plot_height=600,title="Number of Crimes by Day, 2001-Present",
           x_range=group, toolbar_location=None,x_axis_type=None,tooltips=TOOLTIPS)


p.line(x='x', y='y',line_color="#ff9900",source=source)

p.y_range.start = 0

#position of the first observed day of each year within the ordered day list;
#the day labels are 'YYYY-MM-DD' strings, so the first four characters are the year.
#Taking the positions from the data keeps the year labels aligned even though
#leap years and days without records make years differ in length
year_start = {}
for position, date_label in enumerate(day):
    year_start.setdefault(date_label[:4], position)

ticker = FixedTicker(ticks=list(year_start.values()))
xaxis = LinearAxis(ticker=ticker,axis_label="Day")
p.add_layout(xaxis, 'below')
#show the year instead of the numeric position at each tick
p.xaxis.major_label_overrides = {position: year for year, position in year_start.items()}

#output_file("Time Series of Crime.html")

show(p)

### Time Series by Day of Year

In [114]:
#day-of-year number of each record, taken from the parsed Date column
day_of_year = crime['Date'].dt.dayofyear
crime['DOY'] = day_of_year

In [123]:
#create counts df for plot: average number of crimes on each day of the year
doy_groups = crime.groupby(['DOY'])
#divide each day's total by the number of years that actually have records on that
#day instead of by the overall number of years; otherwise days that do not occur in
#every year (e.g. day 366, which only leap years have) are understated
crime_time = pd.DataFrame(
    (doy_groups['ID'].count() / doy_groups['Year'].nunique()).rename('ID')
)
crime_time.reset_index(inplace=True)

#categorical factors must be strings
crime_time['DOY'] = crime_time['DOY'].astype(str)
group = crime_time.groupby(by=['DOY'])
day = list(crime_time['DOY'])
counts = list(crime_time['ID'])

In [138]:
#generate bokeh plot
source = ColumnDataSource(data=dict(x=day,y=counts,desc=counts,))

TOOLTIPS = [
    ("# of Crimes", "@desc"),
    ("Day of Year", "@x"),
]

#the y range is fixed to 500-1500 once, here (it was previously set here, then
#partly reset to start at 0, then replaced with 500-1500 again).
#x_axis_type=None creates the figure without a default x axis or x grid, so any
#styling of p.xaxis / p.xgrid before the custom axis is attached has no effect
p = figure(plot_width=1000,plot_height=600,title="Average # of Crimes by Day of Year, 2001-Present",
           x_range=group, y_range=(500, 1500),toolbar_location=None,x_axis_type=None,tooltips=TOOLTIPS)


p.line(x='x', y='y',line_color="#000099",source=source)

#custom x axis with a tick every 10 days
ticker = FixedTicker(ticks=list(range(0,365,10)))
xaxis = LinearAxis(ticker=ticker,axis_label="Day of Year")
p.add_layout(xaxis, 'below')

#explicit factor range so the days follow the numeric row order of crime_time
#rather than the string-sorted order of the group keys ('1', '10', '100', ...)
p.x_range = FactorRange(factors=crime_time['DOY'])

#output_file("Time Series of Crime.html")

show(p)

### Time Series by Hour of the Day

In [144]:
#hour-of-day component of each record's parsed timestamp
hour_of_day = crime['Date'].dt.hour
crime['Hour'] = hour_of_day

In [146]:
#number of distinct calendar days in the data; the hourly averages in the next
#cell are divided by this value
crime['Day'].nunique()

6654

In [156]:
#average number of records per hour of the day:
#total records in each hour divided by the number of distinct days in the data
crime_time = (
    (crime.groupby(['Hour'])['ID'].count() / crime['Day'].nunique())
    .to_frame()
    .reset_index()
)

#categorical factors must be strings
crime_time['Hour'] = crime_time['Hour'].astype(str)
group = crime_time.groupby(by=['Hour'])

#plain lists for the plot's data source (note: 'day' holds the hour labels here)
counts = list(crime_time['ID'])
day = list(crime_time['Hour'])

In [157]:
#sanity check: the hourly averages summed over all hours give the average
#number of records per day
crime_time['ID'].sum()

1027.0425308085364

In [161]:
#generate bokeh plot
source = ColumnDataSource(data=dict(x=day,y=counts,desc=counts,))

TOOLTIPS = [
    ("# of Crimes", "@desc"),
    ("Hour of Day", "@x"),
]

#this chart is grouped by hour, so the title says so (it previously repeated the
#day-of-year chart's title)
p = figure(plot_width=1000,plot_height=600,title="Average # of Crimes by Hour of the Day, 2001-Present",
           x_range=group,toolbar_location=None,tooltips=TOOLTIPS)


p.line(x='x', y='y',line_color="#000099",source=source)

#x axis cosmetics: no vertical grid lines, rotated hour labels
p.xgrid.grid_line_color = None
p.xaxis.axis_label = "Hour of the Day"
p.xaxis.major_label_orientation = 1.2

#explicit factor range so the hours follow the numeric row order of crime_time
#rather than the string-sorted order of the group keys
p.x_range = FactorRange(factors=crime_time['Hour'])
#fixed y range starting at zero
p.y_range= Range1d(0, 80)

#output_file("Time Series of Crime.html")

show(p)