In [1]:
# load standard libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [2]:
# Load the Manchester police street-level records from the local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load this file if it comes from a trusted source.
raw_data = pd.read_pickle('Manchester_street.pickle')
# N is the number of rows at load time
N = raw_data.shape[0]

In [3]:
# Summary of the raw frame: column names, dtypes, number of entries and memory usage
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3034429 entries, 0 to 3034428
Data columns (total 12 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Crime ID               object 
 1   Month                  object 
 2   Reported by            object 
 3   Falls within           object 
 4   Longitude              float64
 5   Latitude               float64
 6   Location               object 
 7   LSOA code              object 
 8   LSOA name              object 
 9   Crime type             object 
 10  Last outcome category  object 
 11  Context                object 
dtypes: float64(2), object(10)
memory usage: 277.8+ MB


In [4]:
# Preview the first 15 rows of the raw frame.
# TODO (original author's note): delete this cell before pushing to GitHub.

raw_data.head(15)

Unnamed: 0,Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context
0,,2010-12,Greater Manchester Police,Greater Manchester Police,-2.392346,53.63464,On or near Birches Road,E01032485,Blackburn with Darwen 018E,Anti-social behaviour,,
1,,2010-12,Greater Manchester Police,Greater Manchester Police,-2.444807,53.611509,On or near Belmont Road,E01004768,Bolton 001A,Anti-social behaviour,,
2,,2010-12,Greater Manchester Police,Greater Manchester Police,-2.444807,53.611509,On or near Belmont Road,E01004768,Bolton 001A,Burglary,,
3,,2010-12,Greater Manchester Police,Greater Manchester Police,-2.440493,53.612388,On or near Threlkeld Road,E01004768,Bolton 001A,Vehicle crime,,
4,,2010-12,Greater Manchester Police,Greater Manchester Police,-2.444807,53.611509,On or near Belmont Road,E01004768,Bolton 001A,Vehicle crime,,
5,,2010-12,Greater Manchester Police,Greater Manchester Police,-2.440584,53.613781,On or near Wincanton Drive,E01004768,Bolton 001A,Vehicle crime,,
6,,2010-12,Greater Manchester Police,Greater Manchester Police,-2.444807,53.611509,On or near Belmont Road,E01004768,Bolton 001A,Violent crime,,
7,,2010-12,Greater Manchester Police,Greater Manchester Police,-2.444807,53.611509,On or near Belmont Road,E01004768,Bolton 001A,Other crime,,
8,,2010-12,Greater Manchester Police,Greater Manchester Police,-2.442957,53.632548,On or near Albert Street,E01004803,Bolton 001B,Anti-social behaviour,,
9,,2010-12,Greater Manchester Police,Greater Manchester Police,-2.440607,53.630526,On or near Bedford Street,E01004803,Bolton 001B,Anti-social behaviour,,


In [5]:
# Ratio of distinct 'Crime ID' values (NaN counted as one value) to the number of rows
raw_data['Crime ID'].nunique(dropna=False) / N

# Original conclusion: ~33% of crimes have no ID.
# NOTE(review): this ratio only implies that if every non-missing ID is unique;
# raw_data['Crime ID'].isna().mean() would measure the missing share directly - TODO confirm.

0.6621357757917552

In [6]:
# Replace the 'Month' column with separate 'year' and 'month' columns.
# The column is parsed into a DatetimeIndex once and reused for both fields,
# instead of parsing every date string twice.
month_index = pd.DatetimeIndex(raw_data['Month'])
raw_data['year'] = month_index.year
raw_data['month'] = month_index.month
# The original string column is no longer needed once both parts are extracted
raw_data.drop(columns='Month', inplace=True)

In [7]:
# Inspect the distinct values of 'Reported by' and 'Falls within'.
# If both columns hold the same single value they carry no information for the outcome prediction and can be deleted.
tuple(raw_data[column].unique() for column in ('Reported by', 'Falls within'))

(array(['Greater Manchester Police'], dtype=object),
 array(['Greater Manchester Police'], dtype=object))

In [8]:
# Remove 'Reported by' and 'Falls within' in place; they are treated as uninformative
raw_data.drop(columns=['Reported by', 'Falls within'], inplace=True)

In [9]:
# Select the rows whose Longitude is missing; an empty result means the column has no missing values
raw_data.loc[raw_data['Longitude'].isna()]

Unnamed: 0,Crime ID,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context,year,month


In [10]:
# Select the rows whose Latitude is missing; an empty result means the column has no missing values
raw_data.loc[raw_data['Latitude'].isna()]

Unnamed: 0,Crime ID,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context,year,month


In [11]:
# Several columns already describe where a crime happened. 'Location' is a non-specific
# text column that would need to be encoded before it could be used, so it is removed in place.
raw_data.drop(columns='Location', inplace=True)

In [12]:
# Spot check: list the distinct LSOA names that occur together with one LSOA code
code = 'E01032485'
raw_data.loc[raw_data['LSOA code'] == code, 'LSOA name'].unique()

array(['Blackburn with Darwen 018E'], dtype=object)

In [13]:
# Number of distinct LSOA codes and LSOA names (NaN counted as a value)
raw_data['LSOA code'].nunique(dropna=False), raw_data['LSOA name'].nunique(dropna=False)

# For now we assume code and name are interchangeable, so one of the two columns can be dropped.
# Equal counts alone do not prove a one-to-one mapping; this still needs further investigation.

(1710, 1710)

In [14]:
# Remove 'LSOA code' in place; 'LSOA name' is kept as the area identifier
raw_data.drop(columns='LSOA code', inplace=True)

In [15]:
# Rows without a 'Last outcome category' (per the original note, about one third of the dataset).
# Crime-type nuance is not a priority at the moment, so the column can be dropped for now.
# If crime type turns out to be very important, the column has to be added back in
# and the NaN values investigated further.
raw_data.loc[raw_data['Last outcome category'].isna()]

Unnamed: 0,Crime ID,Longitude,Latitude,LSOA name,Crime type,Last outcome category,Context,year,month
0,,-2.392346,53.634640,Blackburn with Darwen 018E,Anti-social behaviour,,,2010,12
1,,-2.444807,53.611509,Bolton 001A,Anti-social behaviour,,,2010,12
2,,-2.444807,53.611509,Bolton 001A,Burglary,,,2010,12
3,,-2.440493,53.612388,Bolton 001A,Vehicle crime,,,2010,12
4,,-2.444807,53.611509,Bolton 001A,Vehicle crime,,,2010,12
...,...,...,...,...,...,...,...,...,...
3034407,,-2.564641,53.474504,Wigan 040D,Anti-social behaviour,,,2019,6
3034408,,-2.564641,53.474504,Wigan 040D,Anti-social behaviour,,,2019,6
3034409,,-2.560912,53.475070,Wigan 040D,Anti-social behaviour,,,2019,6
3034410,,-2.564641,53.474504,Wigan 040D,Anti-social behaviour,,,2019,6


In [16]:
# Remove 'Last outcome category' in place
raw_data.drop(columns='Last outcome category', inplace=True)

In [17]:
# Rows whose 'Context' equals the second entry (index 1) of the column's distinct values
raw_data.loc[raw_data['Context'].eq(raw_data['Context'].unique()[1])]

Unnamed: 0,Crime ID,Longitude,Latitude,LSOA name,Crime type,Context,year,month
55239,,-2.444807,53.611509,Bolton 001A,Burglary,CrimeMapper has moved this record to a locatio...,2011,2
55250,,-2.428078,53.622221,Bolton 001D,Other crime,CrimeMapper has moved this record to a locatio...,2011,2
55533,,-2.388744,53.609646,Bolton 006D,Other crime,CrimeMapper has moved this record to a locatio...,2011,2
55546,,-2.557438,53.594984,Bolton 007A,Burglary,CrimeMapper has moved this record to a locatio...,2011,2
55553,,-2.557438,53.594984,Bolton 007A,Other crime,CrimeMapper has moved this record to a locatio...,2011,2
...,...,...,...,...,...,...,...,...
116263,,-2.587383,53.478332,Wigan 038D,Other crime,CrimeMapper has moved this record to a locatio...,2011,3
116391,,-2.565764,53.470625,Wigan 040E,Burglary,CrimeMapper has moved this record to a locatio...,2011,3
116392,,-2.565764,53.470625,Wigan 040E,Burglary,CrimeMapper has moved this record to a locatio...,2011,3
116393,,-2.565764,53.470625,Wigan 040E,Vehicle crime,CrimeMapper has moved this record to a locatio...,2011,3


In [18]:
# Rows whose 'Context' equals the third entry (index 2) of the column's distinct values
raw_data.loc[raw_data['Context'].eq(raw_data['Context'].unique()[2])]

Unnamed: 0,Crime ID,Longitude,Latitude,LSOA name,Crime type,Context,year,month
117201,b85812176163f5fa570ff3b8f5390a8e24d1849d3302ec...,-2.441435,53.592565,Bolton 011C,Other crime,This record has been moved to a location that ...,2011,4
117811,042a52d1d054f6416c36533df2ffd55b1732e8c76684b3...,-2.428325,53.576461,Bolton 016D,Violent crime,This record has been moved to a location that ...,2011,4
117851,e2de35589f6ba71723926e63f038cd106fa28957b735a5...,-2.428325,53.576461,Bolton 016D,Other crime,This record has been moved to a location that ...,2011,4
118748,0203ab3e20f2d9f4dd3c629d918363a38c048e82422c49...,-2.450872,53.567911,Bolton 023A,Other crime,This record has been moved to a location that ...,2011,4
119139,93bf84b206e3a8fafab2773e723917cbf52bcbc0681a16...,-2.407093,53.554666,Bolton 027C,Vehicle crime,This record has been moved to a location that ...,2011,4
...,...,...,...,...,...,...,...,...
428920,08a561787641217e9c14f977b5d0d7ffb630d2fbec206c...,-2.646118,53.545757,Wigan 015A,Criminal damage and arson,This record has been moved to a location that ...,2012,2
429185,a3c1f1acc5f904754063b7cae26a6f0a63f324fce8f7a5...,-2.665387,53.502291,Wigan 018G,Other theft,This record has been moved to a location that ...,2012,2
429753,b1d9fedbbe160639023f0b06ed5ed8fa0bbd3651205ff6...,-2.452878,53.494171,Wigan 029B,Burglary,This record has been moved to a location that ...,2012,2
429931,6de59d05d78163606e9c383d0893788bf76e3fc45c5bc2...,-2.492890,53.500008,Wigan 031A,Vehicle crime,This record has been moved to a location that ...,2012,2


After analysing the `Context` column, it can be concluded that some crimes have been allocated the nearest possible location, as the true location of the crime could not be properly mapped. Therefore, as the location is not precise, we consider this data to be faulty and we will delete these data entries. 

In [19]:
# Keep only the rows that have no 'Context' note; .copy() makes the result an independent frame
raw_data = raw_data.loc[raw_data['Context'].isna()].copy()

In [20]:
# 'Context' no longer holds any information, so the column is removed in place
raw_data.drop(columns='Context', inplace=True)

In [21]:
# After some consideration 'Crime ID' is removed for now;
# it can easily be put back in if we change our approach.
raw_data.drop(columns='Crime ID', inplace=True)

In [23]:
# NOTE(review): notebook convention is to keep all imports in the first cell; consider moving this one there.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state pins the shuffle so the split is reproducible.
train_set, test_set = train_test_split(raw_data, test_size=0.2, random_state=15)

#### DO NOT CHANGE RANDOM STATE
#### Use train_set from now on: do not touch test_set, do not plot test_set, completely ignore it!!!!

# Notes to investigate:
# - do we need to stratify the data? Possible strata are crime type and LSOA name
# - set up a validation set, maybe cross-validation

In [27]:
# Work on df_police from now on; keeping train_set untouched makes it easy to reload the initial training data.
# If you assign it to extra variables, use .copy() so you get an independent frame
# (the original note: this avoids errors and slowing down your program).
df_police = train_set.copy()

In [50]:
# To use the categorical data you will need an encoder. Pick the one you are most familiar with.
# Tip: sklearn provides several encoders; OrdinalEncoder or OneHotEncoder might be good choices for this problem