# Chicago Crimes Data Filtering

## load data between 2012 and 2017

In [104]:
import random
import json
from random import choices

import pandas as pd
import numpy as np
import shapely.wkt
from shapely.geometry.point import Point
from shapely.geometry.multipolygon import MultiPolygon
from shapely.geometry.polygon import Polygon
import matplotlib.pyplot as plt
from descartes import PolygonPatch
import matplotlib
import matplotlib.cm as cm
from matplotlib import colors

In [105]:
chicago = pd.read_csv('data/Chicago_Crimes_2012_to_2017.csv')

In [106]:
chicago.head()

Unnamed: 0.1,Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,3,10508693,HZ250496,05/03/2016 11:40:00 PM,013XX S SAWYER AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,...,24.0,29.0,08B,1154907.0,1893681.0,2016,05/10/2016 03:56:50 PM,41.864073,-87.706819,"(41.864073157, -87.706818608)"
1,89,10508695,HZ250409,05/03/2016 09:40:00 PM,061XX S DREXEL AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,...,20.0,42.0,08B,1183066.0,1864330.0,2016,05/10/2016 03:56:50 PM,41.782922,-87.604363,"(41.782921527, -87.60436317)"
2,197,10508697,HZ250503,05/03/2016 11:31:00 PM,053XX W CHICAGO AVE,470,PUBLIC PEACE VIOLATION,RECKLESS CONDUCT,STREET,False,...,37.0,25.0,24,1140789.0,1904819.0,2016,05/10/2016 03:56:50 PM,41.894908,-87.758372,"(41.894908283, -87.758371958)"
3,673,10508698,HZ250424,05/03/2016 10:10:00 PM,049XX W FULTON ST,460,BATTERY,SIMPLE,SIDEWALK,False,...,28.0,25.0,08B,1143223.0,1901475.0,2016,05/10/2016 03:56:50 PM,41.885687,-87.749516,"(41.885686845, -87.749515983)"
4,911,10508699,HZ250455,05/03/2016 10:00:00 PM,003XX N LOTUS AVE,820,THEFT,$500 AND UNDER,RESIDENCE,False,...,28.0,25.0,06,1139890.0,1901675.0,2016,05/10/2016 03:56:50 PM,41.886297,-87.761751,"(41.886297242, -87.761750709)"


In [107]:
chicago.describe()

Unnamed: 0.1,Unnamed: 0,ID,Beat,District,Ward,Community Area,X Coordinate,Y Coordinate,Year,Latitude,Longitude
count,1456714.0,1456714.0,1456714.0,1456713.0,1456700.0,1456674.0,1419631.0,1419631.0,1456714.0,1419631.0,1419631.0
mean,3308606.0,9597550.0,1150.644,11.2592,22.87027,37.45632,1164398.0,1885523.0,2013.897,41.84147,-87.67224
std,1235350.0,808350.5,691.6466,6.904691,13.80589,21.44029,18508.35,34247.75,1.449584,0.09430126,0.06661726
min,3.0,20224.0,111.0,1.0,1.0,0.0,0.0,0.0,2012.0,36.61945,-91.68657
25%,2698636.0,9002709.0,613.0,6.0,10.0,23.0,1152544.0,1858762.0,2013.0,41.76787,-87.71528
50%,3063654.0,9605776.0,1024.0,10.0,23.0,32.0,1166021.0,1891502.0,2014.0,41.85797,-87.66613
75%,3428849.0,10225770.0,1711.0,17.0,34.0,56.0,1176363.0,1908713.0,2015.0,41.90529,-87.62813
max,6253474.0,10827880.0,2535.0,31.0,50.0,77.0,1205119.0,1951573.0,2017.0,42.02271,-87.52453


In [108]:
len(chicago)

1456714

## sample 1/4 of the data

In [109]:
sample = chicago.copy()

In [110]:
# drop 3/4 rows of the data uniformly
drop_indices = np.random.choice(sample.index, int(3 * len(sample) / 4), replace=False)
sample = sample.drop(drop_indices)

In [111]:
len(sample)

364179

In [112]:
sample.head()

Unnamed: 0.1,Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
3,673,10508698,HZ250424,05/03/2016 10:10:00 PM,049XX W FULTON ST,460,BATTERY,SIMPLE,SIDEWALK,False,...,28.0,25.0,08B,1143223.0,1901475.0,2016,05/10/2016 03:56:50 PM,41.885687,-87.749516,"(41.885686845, -87.749515983)"
6,1130,10508703,HZ250489,05/03/2016 10:30:00 PM,027XX S STATE ST,460,BATTERY,SIMPLE,CHA HALLWAY/STAIRWELL/ELEVATOR,False,...,3.0,35.0,08B,1176730.0,1886544.0,2016,05/10/2016 03:56:50 PM,41.844024,-87.626923,"(41.844023772, -87.626923253)"
8,1868,10508709,HZ250523,05/03/2016 04:00:00 PM,014XX W DEVON AVE,460,BATTERY,SIMPLE,SIDEWALK,False,...,40.0,1.0,08B,1165696.0,1942616.0,2016,05/10/2016 03:56:50 PM,41.998131,-87.665814,"(41.99813061, -87.665814038)"
14,2477,10508728,HZ250505,05/03/2016 10:08:00 PM,016XX N CLAREMONT AVE,810,THEFT,OVER $500,STREET,False,...,1.0,24.0,06,1160444.0,1910787.0,2016,05/10/2016 03:56:50 PM,41.910901,-87.686019,"(41.910900826, -87.686018747)"
18,3242,10508747,HZ250577,05/03/2016 08:00:00 PM,100XX S SANGAMON ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,...,34.0,73.0,07,1171751.0,1838335.0,2016,05/10/2016 03:56:50 PM,41.711844,-87.646608,"(41.711843569, -87.646607932)"


In [113]:
sample.to_csv('chicago_sample.csv', sep=',')

## drop unnecessary columns

In [114]:
clean_sample = sample.copy()

In [115]:
clean_sample.drop(['X Coordinate', 'Y Coordinate', 'Location', 'Updated On', 'IUCR', 'Unnamed: 0', 'Block', 'Case Number', 'Ward', 'Community Area'], axis=1, inplace=True)

In [116]:
clean_sample.to_csv('chicago_clean_sample.csv', sep=',')

In [117]:
clean_sample.head()

Unnamed: 0,ID,Date,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,FBI Code,Year,Latitude,Longitude
3,10508698,05/03/2016 10:10:00 PM,BATTERY,SIMPLE,SIDEWALK,False,False,1532,15.0,08B,2016,41.885687,-87.749516
6,10508703,05/03/2016 10:30:00 PM,BATTERY,SIMPLE,CHA HALLWAY/STAIRWELL/ELEVATOR,False,False,133,1.0,08B,2016,41.844024,-87.626923
8,10508709,05/03/2016 04:00:00 PM,BATTERY,SIMPLE,SIDEWALK,False,False,2432,24.0,08B,2016,41.998131,-87.665814
14,10508728,05/03/2016 10:08:00 PM,THEFT,OVER $500,STREET,False,False,1434,14.0,06,2016,41.910901,-87.686019
18,10508747,05/03/2016 08:00:00 PM,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,2232,22.0,07,2016,41.711844,-87.646608


In [118]:
clean_sample.describe()

Unnamed: 0,ID,Beat,District,Year,Latitude,Longitude
count,364179.0,364179.0,364179.0,364179.0,355029.0,355029.0
mean,9598956.0,1150.264614,11.255259,2013.900126,41.841447,-87.672203
std,808610.3,691.640873,6.904382,1.450083,0.093274,0.065856
min,20227.0,111.0,1.0,2012.0,36.619446,-91.686566
25%,9002772.0,613.0,6.0,2013.0,41.767939,-87.7154
50%,9608623.0,1024.0,10.0,2014.0,41.857895,-87.666044
75%,10227090.0,1711.0,17.0,2015.0,41.904986,-87.628152
max,10827860.0,2535.0,31.0,2017.0,42.022586,-87.524529


# neighborhoods heatmap

In [119]:
chicago = pd.read_csv('data/chicago_clean_sample.csv')

In [120]:
chicago = chicago.dropna(subset=['Latitude', 'Longitude'])

In [121]:
chicago.head()

Unnamed: 0.1,Unnamed: 0,ID,Date,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,FBI Code,Year,Latitude,Longitude
0,1,10508695,05/03/2016 09:40:00 PM,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,313,3.0,08B,2016,41.782922,-87.604363
1,3,10508698,05/03/2016 10:10:00 PM,BATTERY,SIMPLE,SIDEWALK,False,False,1532,15.0,08B,2016,41.885687,-87.749516
2,8,10508709,05/03/2016 04:00:00 PM,BATTERY,SIMPLE,SIDEWALK,False,False,2432,24.0,08B,2016,41.998131,-87.665814
3,9,10508982,05/03/2016 10:30:00 PM,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,False,True,735,7.0,08B,2016,41.768097,-87.663879
4,27,10509077,05/03/2016 12:01:00 AM,DECEPTIVE PRACTICE,FRAUD OR CONFIDENCE GAME,RESIDENCE,False,False,915,9.0,11,2016,41.834586,-87.632285


In [122]:
crime_lats = chicago['Latitude']
crime_lons = chicago['Longitude']

In [123]:
crime_points = [Point(lon, lat) for lon, lat in zip(crime_lons, crime_lats)]

KeyboardInterrupt: 

In [None]:
crime_points_sample = choices(crime_points, k=5000)

In [None]:
fontsize=16
plt.title('locations of crimes in chicago', fontsize=fontsize)
plt.scatter([p.x for p in crime_points_sample], [p.y for p in crime_points_sample])

In [None]:
f = open('data/chicago_neighborhoods.json')
neighborhoods_json = json.load(f)

In [None]:
neighborhoods = [(nj[-4], list(shapely.wkt.loads(nj[-5]))[0]) for nj in neighborhoods_json['data']]

In [None]:
plt.title('neighborhoods in chicago', fontsize=fontsize)

for neighborhood in neighborhoods:
    polygon = neighborhood[1]
    x,y = polygon.exterior.xy
    plt.fill(x,y)

In [None]:
neighborhoods_crimes_count = {name: 0 for name, polygons in neighborhoods}

In [None]:
for point in crime_points:
    for neighborhood in neighborhoods:
        name, polygon = neighborhood
        if polygon.contains(point):
            neighborhoods_crimes_count[name] += 1

In [None]:
total_crimes_count = sum(neighborhoods_crimes_count.values())

In [None]:
minima = min(neighborhoods_crimes_count.values())
maxima = max(neighborhoods_crimes_count.values())

normalize = colors.Normalize(vmin=min(neighborhoods_crimes_count.values()),
                             vmax=max(neighborhoods_crimes_count.values()))
cmap = plt.cm.get_cmap('hot')

In [None]:
plt.figure(figsize=(9,7))
plt.title('heatmap of crimes in chicago', fontsize=fontsize)

for neighborhood in neighborhoods:
    name, polygon = neighborhood
    x,y = polygon.exterior.xy
    plt.fill(x,y, color=cmap(normalize(neighborhoods_crimes_count[name])))

In [None]:
plt.figure(figsize=(9,7))
plt.title('heatmap of crimes in chicago', fontsize=fontsize)
for neighborhood in neighborhoods:
    name, polygon = neighborhood
    x,y = polygon.exterior.xy
    plt.fill(x,y, color=cmap(normalize(neighborhoods_crimes_count[name])))
    plt.text(polygon.centroid.x, polygon.centroid.y, f'{int(100 * round(neighborhoods_crimes_count[name] / total_crimes_count, 2))}%', color='white')