# Montreal crime data analysis and visualization
![Downtown Montreal](./images/montreal.jpg)

Compared to most other North American cities, Montreal is a safe place. The odds of you being a homicide victim there are [literally one in a million](https://www.cbc.ca/news/canada/montreal/what-3-years-of-detailed-crime-data-tells-us-about-how-safe-a-city-montreal-is-1.4627438) (compared to [Tampa, where the odds are 80 times higher](http://www.city-data.com/crime/crime-Tampa-Florida.html), or [Orlando, where it’s 300 times higher](http://www.city-data.com/crime/crime-Orlando-Florida.html)).

Over the past couple of years, the Montreal police have been publishing details on certain crimes committed there. We’re going to use that data to generate an interactive crime map.

## Import packages and define utility functions
![Setup](./images/montreal-setup.jpg)

In [None]:
# Import packages
# ===============
import pandas as pd
import numpy as np
import folium
from folium import plugins
from IPython.display import display_html, HTML
from geopy.geocoders import Nominatim
import json
import plotly
from plotly import tools
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from dateutil.parser import parse
import calendar


# Constants
# =========
# Define chart marker colors
# The six base colours, as (red, green, blue) components. The fill and the
# outline palettes below use the same colours and differ only in opacity.
_rgb_palette = [
    (31, 119, 180),
    (255, 127, 14),
    (50, 171, 96),
    (214, 39, 40),
    (148, 103, 189),
    (140, 86, 75),
]

# Half-transparent colours used to fill the chart markers.
markercol = ['rgba({}, {}, {}, 0.5)'.format(*rgb) for rgb in _rgb_palette]

# Fully opaque versions of the same colours, used for the marker outlines.
linecol = ['rgba({}, {}, {}, 1.0)'.format(*rgb) for rgb in _rgb_palette]


# Settings
# ========
# Raise pandas' column display width to 130 so long text values are not cut short.
pd.set_option('display.max_colwidth', 130)

# Switch Plotly to notebook mode so iplot() can render charts inline.
init_notebook_mode(connected=True)


# Map functions
# =============
def map_data(mappings, x):
    '''Translate a French term to English using a list of (french, english) pairs.

    The pairs are scanned in order and the English value of the first pair
    whose French term equals ``x`` is returned. When no pair matches, the
    result is None.
    '''
    return next((english for french, english in mappings if french == x), None)

def embed_map(map):
    '''Save a map to "map.html" and return an iframe that displays it.

    The map object's ``save`` method writes the file into the current working
    directory (overwriting any previous "map.html"); the returned ``HTML``
    object wraps an iframe whose source is that file.
    '''
    html_file = "map.html"
    map.save(outfile=html_file)

    iframe_markup = '<iframe src="{i}" style="width: 100%; height: 510px; border: none"></iframe>'.format(i=html_file)
    return HTML(iframe_markup)

def generate_map(df, yr):
    '''Build a clustered, interactive crime map for a single year.

    Args:
        df: dataframe with 'YEAR', 'COORDS', 'LAT', 'LON' and
            'ADAPTED_CATEGORY' columns.
        yr: the year to plot; compared with the 'YEAR' column using ``==``,
            so it must have the same type as the values stored there.

    Returns:
        The embeddable object produced by ``embed_map`` for the finished map.
    '''
    # Rows whose coordinates are (1.0, 1.0) carry a placeholder rather than a
    # real position, so they are left off the map.
    in_requested_year = df['YEAR'] == yr
    has_real_location = df['COORDS'] != (1.0, 1.0)
    ref = df[in_requested_year & has_real_location].copy()

    # Centre the base map on the mean position of the selected rows.
    map_centre = [ref['LAT'].mean(), ref['LON'].mean()]
    crimemap = folium.Map(location=map_centre, zoom_start=11)

    # All markers go into one cluster layer attached to the base map.
    crimes = plugins.MarkerCluster().add_to(crimemap)

    # One marker per row; its popup shows the crime category.
    marker_rows = ref[['LAT', 'LON', 'ADAPTED_CATEGORY']].itertuples(index=False, name=None)
    for lat, lon, category in marker_rows:
        marker = folium.Marker(location=[lat, lon], icon=None, popup=category)
        marker.add_to(crimes)

    return embed_map(crimemap)


# Reverse geocoding
# =================
# Reverse geocoding converts a point location (latitude, longitude)
# into a readable address or place name. This makes it possible to
# identify nearby street addresses, places, and/or areal subdivisions
# such as neighbourhoods, counties, states, or countries.
def extract_address(col, geolocator=None):
    '''Reverse-geocode coordinates into neighbourhood-level address strings.

    Args:
        col: iterable of locations, each in a form accepted by the
            geolocator's ``reverse`` method (e.g. a (latitude, longitude)
            tuple).
        geolocator: optional object exposing ``reverse(query, timeout=...)``.
            When omitted, a ``Nominatim`` geocoder is created. Passing one in
            lets callers reuse a configured geocoder or substitute their own.

    Returns:
        A list with one string per input location, in input order. Each
        string joins the remaining address components with ", ". A location
        for which the geocoder returns nothing yields an empty string, so the
        result always lines up with the input.
    '''
    # Components left out of the result: the house number and everything at
    # city level or broader.
    excluded = {'house_number', 'city', 'region', 'state', 'postcode',
                'country', 'country_code'}

    if geolocator is None:
        # Nominatim must be given an application-specific user agent; recent
        # geopy releases refuse to build the geocoder without one.
        geolocator = Nominatim(user_agent='montreal-crime-data-notebook')

    addresses = []
    for point in col:
        location = geolocator.reverse(point, timeout=10)

        # reverse() may return None when no address is found; treat that as
        # "no components" instead of failing on `.raw`.
        raw = location.raw if location is not None else {}

        # `raw` is already a dict, so its address components are read directly.
        parts = [value for key, value in raw.get('address', {}).items()
                 if key not in excluded]
        addresses.append(", ".join(parts))
    return addresses


# Plotting
# ========
def plotchart(chdata, chlayout, titlelist, yearlist, subtitlelist):
    '''Draw three bar charts side by side (one row, three columns).

    Args:
        chdata: mapping with keys 'trace_data' (a dataframe holding a 'YEAR'
            column), 'x' and 'y' (names of the columns plotted on each axis).
        chlayout: mapping with keys 'yaxistitle', 'tickangle', 'height',
            'width' and 'title'.
        titlelist: trace names, one per chart.
        yearlist: the 'YEAR' value selecting the rows of each chart.
        subtitlelist: subplot titles; each is rendered in bold.

    Returns:
        Whatever ``iplot`` returns for the finished figure.
    '''
    axis_grey = 'rgb(107, 107, 107)'
    background = 'rgba(245, 246, 249, 1)'

    # one row of three subplots sharing a y-axis
    bold_titles = ["<b>{}</b>".format(label) for label in subtitlelist]
    fig = tools.make_subplots(rows=1, cols=3, subplot_titles=(bold_titles),
        shared_yaxes=True, horizontal_spacing=(0.05), print_grid=False)

    # one bar trace per year, placed left to right in row 1
    for idx in range(3):
        frame = chdata['trace_data']
        subset = frame[frame['YEAR'] == yearlist[idx]]
        heights = subset[chdata['y']]
        bar = go.Bar(
            x=subset[chdata['x']], y=heights, name=titlelist[idx], width=0.7,
            # bar values are printed above each bar and reused as hover text
            text=heights, textposition='outside', hoverinfo='text',
            outsidetextfont=dict(size='10'), cliponaxis=False,
            marker=dict(color=markercol[idx],
                        line=dict(color=linecol[idx], width=1)))
        fig.append_trace(bar, 1, idx + 1)

    # subplot titles are stored as layout annotations
    for annotation in fig['layout']['annotations']:
        annotation['font'] = dict(size=12)

    # apply the same axis styling to each of the three subplots
    for axis_no in range(1, 4):
        suffix = '{}'.format(axis_no)

        y_axis = fig['layout']['yaxis' + suffix]
        y_axis.update(title=chlayout['yaxistitle'],
            titlefont=dict(size=12, color=axis_grey),
            showticklabels=False, showgrid=True)

        x_axis = fig['layout']['xaxis' + suffix]
        x_axis.update(titlefont=dict(size=11, color=axis_grey),
            tickfont=dict(size=11, color=axis_grey),
            tickangle=chlayout['tickangle'])

    # figure-wide settings: fixed size, no legend, matching backgrounds
    fig['layout'].update(height=chlayout['height'], width=chlayout['width'],
        showlegend=False, autosize=False, title=chlayout['title'],
        titlefont=dict(size=14), paper_bgcolor=background,
        plot_bgcolor=background)

    return iplot(fig)

## Getting data from the source
![Data](./images/montreal-data.jpg)

We’ll use a dataset of crimes committed in the city of Montreal from 2015 through part of 2018. It comes from Montreal’s police department, [and you can download it from the Montreal Open Data Portal](http://donnees.ville.montreal.qc.ca/dataset/actes-criminels) in .csv format.

Let’s read in the dataset and print out the first few rows.

In [None]:
# Path of the crime dataset, relative to the notebook's working directory.
CSV_FILE_LOCATION = './data/interventionscitoyendo.csv'
# Alternative: uncomment the next line to read the CSV from GitHub instead
# (presumably the same file, going by its name).
#CSV_FILE_LOCATION = 'https://raw.githubusercontent.com/AccordionGuy/DevFestFlorida2019/master/data/interventionscitoyendo.csv'

# The file is decoded as Latin-1; its text values are French and contain
# accented characters.
df = pd.read_csv(CSV_FILE_LOCATION, encoding='latin1')
# Last expression in the cell, so the notebook displays the first rows.
df.head()

## Column descriptions
Keep in mind that this data comes from the _Montreal_ police department, so it’s in French.

Here’s a quick rundown of the columns:
* **CATEGORIE:** Category, which will contain one of the following values:
    * ***Introduction:*** Breaking and entering a public institution or private residence, theft of a firearm in a residence.
    * ***Vol dans / sur véhicule à moteur:*** Theft of the contents of a motor vehicle (car, truck, motorcycle, etc.) or of a vehicle part (wheel, bumper, etc.).
    * ***Vol de véhicule à moteur:*** Theft of a car, truck, motorcycle, snowmobile, tractor with or without a trailer, construction or farm vehicle, or all-terrain vehicle.
    * ***Méfait:*** Graffiti and damage to religious property, vehicle or general damage and all other types of mischief.
    * ***Vols qualifiés:*** Robbery (theft accompanied by violence) targeting a business, financial institution, person, purse, armored vehicle, vehicle, or firearm, and all other types of robbery.
    * ***Infractions entrainant la mort:*** First-degree murder, second-degree murder, manslaughter, infanticide, criminal negligence, and all other types of offenses resulting in death.
* **DATE:** Date of the report of the event to the police.
* **QUART:** The time of the day the event was reported to the police.
* **PDQ:** Number of the neighborhood station covering the territory where the event took place. For example, the neighborhood station 50 corresponds to the unit in charge of the metro.
* **X**, **Y:** Geospatial position according to MTM8 projection (SRID 2950). The value 0 is used when no geographical position was provided when entering the information.
* **LAT** and **LONG:** Geographical position of the event after obfuscation at an intersection according to the WGS84 geodetic datum. The value 1 is used when no geographical position has been provided when entering the information.

## How many crimes are in this dataset?

In [None]:
# Report the size of the dataset: one row per record, one column per field.
row_count, column_count = df.shape
print('The dataset contains {0} rows and {1} columns.'.format(row_count, column_count))

## What are the dataframe’s columns’ data types? Are there any missing values in the dataframe?

In [None]:
# Print a summary of the dataframe to check each column's data type and
# spot columns with missing values.
df.info()

## I don’t speak French very well. Can we translate this to English?
![Parlez-vous Francais?](./images/montreal-francais.jpg)

Let’s create some mappings to translate the French phrases to English.

In [None]:
# Translate from French to English
# ================================

# Crime categories as (French value found in the data, English label) pairs.
crime_mappings = [
    ('Introduction', 'Burglary'),
    ('Vol dans / sur véhicule à moteur', 'Vehicle contents or parts theft'),
    ('Vol de véhicule à moteur', 'Vehicle theft'),
    ('Méfait', 'Misdemeanor'),
    ('Vols qualifiés', 'Robbery'),
    ('Infractions entrainant la mort', 'Offenses causing death'),
]

# Times of day as (French, English) pairs.
day_mappings = [('jour', 'day'), ('soir', 'evening'), ('nuit', 'night')]

# Keep the French CATEGORIE column and store the English label in a new
# column, ADAPTED_CATEGORY. Values without a mapping become None.
df['ADAPTED_CATEGORY'] = df['CATEGORIE'].map(
    lambda category: map_data(crime_mappings, category))

# QUART is translated in place. Running this cell a second time would turn
# every value into None, because the English words are not keys of
# day_mappings.
df['QUART'] = df['QUART'].map(lambda period: map_data(day_mappings, period))

## Let’s use the DATE column to create two new columns, MONTH and YEAR

In [None]:
# Parse each DATE value into a datetime object.
df['DATE'] = df['DATE'].map(parse)

# YEAR is stored as a string, so later comparisons against it must use
# strings such as '2018'.
df['YEAR'] = df['DATE'].map(lambda stamp: stamp.year).astype(str)

# MONTH keeps the month number taken from the parsed date.
df['MONTH'] = df['DATE'].map(lambda stamp: stamp.month)

## Let’s make a couple of column changes

In [None]:
# Turn each station number into a label by prefixing it with 'PDQ '.
df['PDQ'] = df['PDQ'].map(lambda station: 'PDQ ' + str(station))

# Remove the X and Y columns from df in place; the notebook works with the
# LAT / LONG columns instead.
df.drop(columns=['X', 'Y'], inplace=True)

## Let’s limit the data to crimes from 2015 through 2017

In [None]:
# Keep only the 2015 - 2017 records. The years are selected explicitly rather
# than by excluding '2018', so records from any other year that may be in the
# file cannot slip through. YEAR holds strings, hence the string values here.
# .copy() makes xdf independent of df, so new columns can be added to it safely.
xdf = df[df['YEAR'].isin(['2015', '2016', '2017'])].copy()

### Just a little more column stuff...

In [None]:
# Pair each latitude with its longitude so that a location can be used as a
# single grouping key.
xdf['COORDS'] = list(zip(xdf['LAT'], xdf['LONG']))

# Count the records for every (location, category, year) combination and
# store the result in a COUNT column.
aggcrime = (
    xdf.groupby(['COORDS', 'ADAPTED_CATEGORY', 'YEAR'])
    .size()
    .reset_index(name='COUNT')
)

# Split COORDS back into numeric columns by indexing the tuples directly,
# which avoids converting them to text and parsing that text. The longitude
# column is named LON here (not LONG) because generate_map() reads 'LON'.
aggcrime['LAT'] = aggcrime['COORDS'].map(lambda point: point[0]).astype(float)
aggcrime['LON'] = aggcrime['COORDS'].map(lambda point: point[1]).astype(float)

## What does the dataframe look like now?

In [None]:
# Preview the first rows of the aggregated data (last expression in the cell,
# so the notebook displays it).
aggcrime.head()

## All right, let’s go map some crime!
With the map data formatted and map generation routines defined, we can create an interactive map.

* Use the **+** and **-** buttons or scroll up or down to zoom.
* The numbers represent the aggregate number of crimes in the area. Click on them to zoom into that area.
* The blue map markers represent individual crimes. Click on them to display the type of crime.

In [None]:
# Show the crime map for 2015. To map 2016 or 2017 instead, change the year
# argument; it is passed as a string because the YEAR column holds strings.
generate_map(aggcrime, '2015')

![Works on my machine!](./images/montreal-works_on_my_machine.png)

## Sources
Go check out the original on GitHub at [dimtics/Montreal-City-Crime-Data-Analysis](https://github.com/dimtics/Montreal-City-Crime-Data-Analysis).