In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt
import seaborn as sns

from pyproj import Proj, transform

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import DBSCAN

import bokeh
import bokeh.plotting as plotting
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.tile_providers import CARTODBPOSITRON
plotting.output_notebook()

sns.set_style('whitegrid')



# Problem definition

Cluster regions based on crime data

http://donnees.ville.montreal.qc.ca/dataset/actes-criminels

# Load the data

In [2]:
#input
#df = pd.read_csv('data/interventionscitoyendo.csv', encoding='latin_1')
df = pd.read_csv('declarations-exterminations-punaises-de-lit-1.csv', encoding='latin_1')
#df['DATE'] = pd.to_datetime(df['DATE'])

df['DATE_DECLARATION'] = pd.to_datetime(df['DATE_DECLARATION'])
df['DATE_PRIOR_INSP'] = pd.to_datetime(df['DATE_PRIOR_INSP'])
df['DATE_FIRST_EXT'] = pd.to_datetime(df['DATE_FIRST_EXT'])
df['DATE_LAST_EXT'] = pd.to_datetime(df['DATE_LAST_EXT'])

print(df.columns)
#print(df['CATEGORIE'].value_counts())
print(df['BORO_NAME'].value_counts())
df.head()

Index(['NO_DECLARATION', 'DATE_DECLARATION', 'DATE_PRIOR_INSP', 'EXT_FREQ',
       'DATE_FIRST_EXT', 'DATE_LAST_EXT', 'HOOD_NUM', 'HOOD_NAME', 'BORO_NAME',
       'MTM8_X', 'MTM8_Y', 'LONGITUDE', 'LATITUDE', 'LONG_LAT', 'MTM_X_Y',
       'DEC_MONTH', 'DEC_ISSUE', 'DATE_DIFF'],
      dtype='object')
Le Plateau-Mont-Royal                          24
RosemontâLa Petite-Patrie                    23
VillerayâSaint-MichelâParc-Extension       21
MercierâHochelaga-Maisonneuve                21
Ville-Marie                                    14
CÃ´te-des-NeigesâNotre-Dame-de-GrÃ¢ce        12
Le Sud-Ouest                                    9
MontrÃ©al-Nord                                  6
Ahuntsic-Cartierville                           5
Saint-LÃ©onard                                  3
Saint-Laurent                                   3
Verdun                                          2
LaSalle                                         2
Anjou                                           2


Unnamed: 0,NO_DECLARATION,DATE_DECLARATION,DATE_PRIOR_INSP,EXT_FREQ,DATE_FIRST_EXT,DATE_LAST_EXT,HOOD_NUM,HOOD_NAME,BORO_NAME,MTM8_X,MTM8_Y,LONGITUDE,LATITUDE,LONG_LAT,MTM_X_Y,DEC_MONTH,DEC_ISSUE,DATE_DIFF
0,4254,2012-10-28,2012-09-21,1,2012-09-21,2012-09-21,24,Beaurivage,MercierâHochelaga-Maisonneuve,303753.6,5049835.7,-73.513411,45.588426,-73.513411_45.588426,303753.6_5049835.7,10,1,37
1,830,2011-09-16,2011-07-13,1,2011-07-27,2011-08-17,50,Saint-Henri,Le Sud-Ouest,298119.8,5036963.7,-73.585437,45.472569,-73.585437_45.472569,298119.8_5036963.7,9,1,65
2,1380,2011-11-08,2011-11-02,1,2011-11-07,2011-11-21,30,Sainte-Marie,Ville-Marie,300294.9,5042372.1,-73.557668,45.521253,-73.557668_45.521253,300294.9_5042372.1,11,1,6
3,455,2011-08-10,2011-08-09,1,2011-08-09,2011-08-09,44,Upper Lachine,CÃ´te-des-NeigesâNotre-Dame-de-GrÃ¢ce,296046.9,5036494.9,-73.611941,45.468327,-73.611941_45.468327,296046.9_5036494.9,8,1,1
4,1243,2011-10-26,2011-09-16,1,2011-10-05,2011-10-05,19,Petite-CÃ´te,RosemontâLa Petite-Patrie,299524.2,5045639.9,-73.567569,45.550652,-73.56756899999999_45.550652,299524.2_5045639.9,10,1,40


In [3]:


#DATE_DECLARATION MTM8_X	MTM8_Y	LONGITUDE	LATITUDE

#del df['DATE_DECLARATION']
#del df['MTM8_X']
#del df['MTM8_Y']
#del df['LONGITUDE'] 
#del df['LATITUDE']

#del df['DEC_D'] 
#del df['PRIOR_D'] 
#del df['FIRST_D'] 
#del df['LAST_D'] 
del df['NO_DECLARATION']
del df['DATE_PRIOR_INSP']
del df['EXT_FREQ']
del df['DATE_FIRST_EXT']
del df['DATE_LAST_EXT']
del df['HOOD_NUM']
del df['HOOD_NAME'] 
#del df['BORO_NAME'] 
del df['LONG_LAT'] 
del df['MTM_X_Y']
del df['DEC_MONTH']
del df['DEC_ISSUE'] 
del df['DATE_DIFF'] 

print(df.columns)

Index(['DATE_DECLARATION', 'BORO_NAME', 'MTM8_X', 'MTM8_Y', 'LONGITUDE',
       'LATITUDE'],
      dtype='object')


# Feature Engineering 

In [4]:
# feature engineering

# select a period
#df = df[df['DATE']>='2018-01-01']
df = df[df['DATE_DECLARATION']>='2012-01-01']

# select the categories
#df = df[df['CATEGORIE']==u'Vols qualifiés']
df = df[df['BORO_NAME']==u'Verdun']

# remove lines with no location
#df = df[(df['X']>0)&(df['Y']>0)]
df = df[(df['MTM8_X']>0)&(df['MTM8_Y']>0)]


# adapt X and Y to the visualization
#df['X'] = df.apply(lambda x: transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), x['LONG'], x['LAT'])[1], axis=1)
#df['Y'] = df.apply(lambda x: transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), x['LONG'], x['LAT'])[0], axis=1)
df['MTM8_X'] = df.apply(lambda x: transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), x['LONGITUDE'], x['LATITUDE'])[1], axis=1)
df['MTM8_Y'] = df.apply(lambda x: transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), x['LONGITUDE'], x['LATITUDE'])[0], axis=1)


#X_columns = ['X', 'Y']
X_columns = ['MTM8_X', 'MTM8_Y']


df = df[X_columns]

# Model Training

In [5]:
model = DBSCAN(eps=1.0, min_samples=100)
#model.fit(df[['X', 'Y']])
model.fit(df[['MTM8_X', 'MTM8_Y']])


cluster_labels = model.labels_
n_clusters = len(set(cluster_labels))
print(collections.Counter(cluster_labels))

df['cluster'] = cluster_labels

Counter({-1: 2})


In [6]:
p = figure(y_range=(5641788.0, 5751788.0), x_range=(-8152883, -8252883))
p.add_tile(CARTODBPOSITRON)

#latitude  = list(df[df['cluster']>-1]['X'].values)
#longitude = list(df[df['cluster']>-1]['Y'].values)

latitude  = list(df[df['cluster']>-1]['MTM8_X'].values)
longitude = list(df[df['cluster']>-1]['MTM8_Y'].values)

colormap = list(bokeh.palettes.viridis(n_clusters))
colors = [colormap[x] for x in df[df['cluster']>-1]['cluster']]
source = ColumnDataSource(data=dict(longitude=longitude, latitude=latitude))
p.circle(x=longitude, y=latitude, color=colors, fill_alpha=0.2, size=5)
show(p)

# Model Evaluation

In [7]:
# Inter-Cluster
centroids = []
for cluster in sorted(set(model.labels_)):
    centroids.append(df[df['cluster']==cluster][X_columns].mean().values)
distances = []
for c1 in centroids:
    for c2 in centroids:
        distances.append(euclidean_distances(c1.reshape(-1, 1), c2.reshape(-1, 1))[0][0])
print('Inter Cluster distance', np.mean(distances))

# Intra-Cluster
distances = []
for cluster in sorted(set(model.labels_)):
    df_filter = df[df['cluster']==cluster]
    centroid = df_filter[X_columns].mean().values
    for k, v in df_filter[X_columns].iterrows():
        distances.append(euclidean_distances(centroid.reshape(-1, 1), v.values.reshape(-1, 1))[0][0])
print('Intra Cluster distance', np.mean(distances))

# Inertia
distances = []
for cluster in sorted(set(model.labels_)):
    df_filter = df[df['cluster']==cluster]
    centroid = df_filter[X_columns].mean().values
    for k, v in df_filter[X_columns].iterrows():
        distances.append(euclidean_distances(centroid.reshape(1, -1), v.values.reshape(1, -1), squared=True)[0][0])
print('Inertia', np.sum(distances))

Inter Cluster distance 0.0
Intra Cluster distance 547.6135316791624
Inertia 1002676.1875
