In [1]:
# fetch dataset imports
from google.oauth2 import service_account
import pandas_gbq

# other
import time
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

Aim:
1. Full chicago dataset (cleaned with all columns) 
2. Full chicago dataset with police data (cleaned with all columns)
3. Trimmed chicago dataset
4. Trimmed chicago dataset with police data



## Fetch the Chicago dataset
(No execution)

In [2]:
%%perl -e0
api_key_path = '../../api-key/bigquery-332415-dc899a25b253.json'
credentials = service_account.Credentials.from_service_account_file(api_key_path)
project_id = 'bigquery-332415' # The project id which belongs to my service account on Google Cloud

query = """
    SELECT *
    FROM `bigquery-public-data.chicago_crime.crime`
"""

start_time = time.time()
df = pandas_gbq.read_gbq(query, project_id=project_id, credentials=credentials)
end_time = time.time()
result = end_time - start_time

print('time:', end_time - start_time)

df.to_csv('../../datasets/chicago.csv', index=False)


## Load the dataset

In [3]:
csv_file = '../datasets/chicago.csv'
t1 = time.time()
df = pd.read_csv(csv_file)
t2 = time.time()
print(f'time: {t2-t1}')

time: 30.22612500190735


In [4]:
%%perl -e0
t1 = time.time()
df = dt.fread(csv_file).to_pandas()
t2 = time.time()
t2-t1

In [5]:
df.head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,8117160,HT351468,2011-06-17 02:00:00+00:00,049XX S HONORE ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,16.0,61.0,7,1164845.0,1872176.0,2011,2018-02-10 03:50:01+00:00,41.804856,-87.670945,"(41.804856393, -87.670945337)"
1,8438070,HV115666,2012-01-12 09:00:00+00:00,010XX W 50TH ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,16.0,61.0,7,1170027.0,1871655.0,2012,2018-02-10 03:50:01+00:00,41.803316,-87.651955,"(41.803315551, -87.651955176)"
2,11741180,JC328821,2019-06-30 02:35:00+00:00,051XX S WOLCOTT AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,16.0,61.0,7,1164556.0,1870514.0,2019,2019-07-07 04:14:54+00:00,41.800302,-87.672052,"(41.800301773, -87.672052162)"
3,8083023,HT315648,2011-05-26 09:00:00+00:00,051XX S HOYNE AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,16.0,61.0,7,1163232.0,1870403.0,2011,2016-02-04 06:33:39+00:00,41.800025,-87.676911,"(41.800025024, -87.676910797)"
4,4676039,HM274833,2006-04-06 09:00:00+00:00,009XX W 54TH ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,20.0,61.0,7,1171042.0,1868997.0,2006,2018-02-10 03:50:01+00:00,41.796,-87.64831,"(41.795999571, -87.648310381)"


In [6]:
df.isnull().sum()

unique_key                   0
case_number                  4
date                         0
block                        0
iucr                         0
primary_type                 0
description                  0
location_description      8715
arrest                       0
domestic                     0
beat                         0
district                    47
ward                    614838
community_area          613555
fbi_code                     0
x_coordinate             74973
y_coordinate             74973
year                         0
updated_on                   0
latitude                 74857
longitude                74857
location                 74857
dtype: int64

In [7]:
df = df[:10000]

In [8]:
# Possiblity to test dataset with and without dropped values
df = df.dropna()

In [9]:
df.isnull().sum()

unique_key              0
case_number             0
date                    0
block                   0
iucr                    0
primary_type            0
description             0
location_description    0
arrest                  0
domestic                0
beat                    0
district                0
ward                    0
community_area          0
fbi_code                0
x_coordinate            0
y_coordinate            0
year                    0
updated_on              0
latitude                0
longitude               0
location                0
dtype: int64

In [10]:
# Possiblity to test dataset with and without unique_key
#df = df.drop(['unique_key'], axis=1)

## Date
Extract time of month

Extract day of week

Convert date to epoch

In [11]:
# check to see if all dates are the same length
df['date'].str.len().unique()

array([25])

In [12]:
# remove time
df['date'] = df['date'].str.slice(stop=10)
df['updated_on'] = df['updated_on'].str.slice(stop=10)

# convert str to datetime object
df['date'] = pd.to_datetime(df['date'])
df['updated_on'] = pd.to_datetime(df['updated_on'])

# get day of week
df['date_day_of_week'] = df['date'].dt.dayofweek

# get time of month
# if first cond is true take the first value from choice list
# if second cond is true take the second value from choice list
# else take default
day = df['date'].dt.day
df['time_of_month'] = np.select(condlist=[day < 10, day < 20], choicelist=[0,1], default=2)
df.head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location,date_day_of_week,time_of_month
0,8117160,HT351468,2011-06-17,049XX S HONORE ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,7,1164845.0,1872176.0,2011,2018-02-10,41.804856,-87.670945,"(41.804856393, -87.670945337)",4,1
1,8438070,HV115666,2012-01-12,010XX W 50TH ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,7,1170027.0,1871655.0,2012,2018-02-10,41.803316,-87.651955,"(41.803315551, -87.651955176)",3,1
2,11741180,JC328821,2019-06-30,051XX S WOLCOTT AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,7,1164556.0,1870514.0,2019,2019-07-07,41.800302,-87.672052,"(41.800301773, -87.672052162)",6,2
3,8083023,HT315648,2011-05-26,051XX S HOYNE AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,7,1163232.0,1870403.0,2011,2016-02-04,41.800025,-87.676911,"(41.800025024, -87.676910797)",3,2
4,4676039,HM274833,2006-04-06,009XX W 54TH ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,7,1171042.0,1868997.0,2006,2018-02-10,41.796,-87.64831,"(41.795999571, -87.648310381)",3,0


In [13]:
# convert to UNIX/EPOCH
df['date'] = (df['date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
df['updated_on'] = (df['updated_on'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

df.head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location,date_day_of_week,time_of_month
0,8117160,HT351468,1308268800,049XX S HONORE ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,7,1164845.0,1872176.0,2011,1518220800,41.804856,-87.670945,"(41.804856393, -87.670945337)",4,1
1,8438070,HV115666,1326326400,010XX W 50TH ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,7,1170027.0,1871655.0,2012,1518220800,41.803316,-87.651955,"(41.803315551, -87.651955176)",3,1
2,11741180,JC328821,1561852800,051XX S WOLCOTT AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,7,1164556.0,1870514.0,2019,1562457600,41.800302,-87.672052,"(41.800301773, -87.672052162)",6,2
3,8083023,HT315648,1306368000,051XX S HOYNE AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,7,1163232.0,1870403.0,2011,1454544000,41.800025,-87.676911,"(41.800025024, -87.676910797)",3,2
4,4676039,HM274833,1144281600,009XX W 54TH ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,7,1171042.0,1868997.0,2006,1518220800,41.796,-87.64831,"(41.795999571, -87.648310381)",3,0


### Convert booleans to int

In [14]:
df['arrest'] = df['arrest'].astype(int) # convert boolean to int
df['domestic'] = df['domestic'].astype(int) # convert boolean to int

### Convert latitude and longitude to ratio data

In [15]:
# removing location because we have lat and long
df = df.drop(['location'], axis=1)
# we should also remove lat and long after

### Save dataset 1: Full chicago dataset

In [16]:
# optional step to save to csv
df.to_csv('chicago_cleaned.csv', index=False)

## Merge police data

In [17]:
path = '../datasets/law_enforcement_stats.xlsx'
df_police = pd.read_excel(path)
df_police

Unnamed: 0,Year,Population,Total Law Enforcement,Officers,Civilians
0,2001,2895995,15066.0,13581.0,1485.0
1,2002,2881295,14932.0,13609.0,1323.0
2,2003,2866361,14777.0,13553.0,1224.0
3,2004,2848996,14325.0,13326.0,999.0
4,2005,2824584,14442.0,13267.0,1175.0
5,2006,2806391,14692.0,13624.0,1068.0
6,2007,2824434,14736.0,13671.0,1065.0
7,2008,2829304,14307.0,13359.0,948.0
8,2009,2848431,13960.0,13088.0,872.0
9,2010,2833649,13318.0,12515.0,803.0


### Find mean and add it to the missing rows
To find the average for 2013 i'm taking the mean of 2012 and 2014 instead of the overall mean. This is to get a more precise number of the specific timeframe. 

In [18]:
officerMean2013 = (df_police['Officers'].iloc[11] + df_police['Officers'].iloc[13]) / 2
civilanMean2013 = (df_police['Civilians'].iloc[11] + df_police['Civilians'].iloc[13]) / 2

officerMean2013, civilanMean2013

(11989.0, 845.0)

In [19]:
officerMean2015 = (df_police['Officers'].iloc[13] + df_police['Officers'].iloc[15]) / 2
civilanMean2015 = (df_police['Civilians'].iloc[13] + df_police['Civilians'].iloc[15]) / 2

officerMean2015, civilanMean2015

(11994.0, 1024.5)

In [20]:
df_police.loc[[12], 'Officers'] = officerMean2013
df_police.loc[[12], 'Civilians'] = civilanMean2013
df_police.loc[[14], 'Officers'] = officerMean2015
df_police.loc[[14], 'Civilians'] = civilanMean2015

In [21]:
df_police = df_police.drop(['Total Law Enforcement'], axis=1)

In [22]:
# renaming columns to match the case convetion from chicago dataset
df_police = df_police.rename(columns={"Year":"year"})
df_police = df_police.rename(columns={"Population":"population"})
df_police = df_police.rename(columns={"Officers":"officers"})
df_police = df_police.rename(columns={"Civilians":"civilians"})

### Merge the 2 datasets
Adding the civilians, officers and population features to the corresponding year in the chicago dataset 

In [23]:
df = pd.merge(df, df_police, on='year')
df.head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,y_coordinate,year,updated_on,latitude,longitude,date_day_of_week,time_of_month,population,officers,civilians
0,8117160,HT351468,1308268800,049XX S HONORE ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,0,0,...,1872176.0,2011,1518220800,41.804856,-87.670945,4,1,2703713,12092.0,707.0
1,8083023,HT315648,1306368000,051XX S HOYNE AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,0,0,...,1870403.0,2011,1454544000,41.800025,-87.676911,3,2,2703713,12092.0,707.0
2,8292372,HT526665,1317600000,019XX W 50TH ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,0,0,...,1871523.0,2011,1518220800,41.803072,-87.672313,0,0,2703713,12092.0,707.0
3,8148792,HT383629,1309824000,043XX S MARSHFIELD AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,0,0,...,1875881.0,2011,1518220800,41.814997,-87.666369,1,0,2703713,12092.0,707.0
4,8368157,HT464629,1314230400,049XX S HERMITAGE AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,0,0,...,1872189.0,2011,1518220800,41.804878,-87.668517,3,2,2703713,12092.0,707.0


### Save dataset 2: Full chicago dataset with police data

In [24]:
# optional step to save to csv
df.to_csv('chicago_with_police_cleaned.csv', index=False)

## Dataset 3: Trimmed chicago dataset

In [25]:
csv_file = '../datasets/chicago.csv'
t1 = time.time()
df = pd.read_csv(csv_file)
t2 = time.time()
print(f'time: {t2-t1}')

time: 31.634358882904053


In [26]:
df = df[:10000]

### Drop columns

In [27]:
columns_to_drop = ['unique_key', 'case_number', 'block', 'iucr', 'domestic', 'beat', 'district', 'ward', 'community_area', 'fbi_code', 'x_coordinate', 'y_coordinate', 'year', 'updated_on', 'location']
df = df.drop(columns_to_drop, axis=1)

### Drop missing values

In [28]:
df = df.dropna()
df.isnull().sum()

date                    0
primary_type            0
description             0
location_description    0
arrest                  0
latitude                0
longitude               0
dtype: int64

### Date
Extract time of month

Extract day of week

Convert date to epoch

In [29]:
# remove time
df['date'] = df['date'].str.slice(stop=10)

# convert str to datetime object
df['date'] = pd.to_datetime(df['date'])

# get day of week
df['date_day_of_week'] = df['date'].dt.dayofweek

# get time of month
# if first cond is true take the first value from choice list
# if second cond is true take the second value from choice list
# else take default
day = df['date'].dt.day
df['time_of_month'] = np.select(condlist=[day < 10, day < 20], choicelist=[0,1], default=2)
df.head()

Unnamed: 0,date,primary_type,description,location_description,arrest,latitude,longitude,date_day_of_week,time_of_month
0,2011-06-17,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.804856,-87.670945,4,1
1,2012-01-12,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.803316,-87.651955,3,1
2,2019-06-30,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.800302,-87.672052,6,2
3,2011-05-26,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.800025,-87.676911,3,2
4,2006-04-06,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.796,-87.64831,3,0


In [30]:
# convert to UNIX/EPOCH
df['date'] = (df['date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
df.head()

Unnamed: 0,date,primary_type,description,location_description,arrest,latitude,longitude,date_day_of_week,time_of_month
0,1308268800,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.804856,-87.670945,4,1
1,1326326400,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.803316,-87.651955,3,1
2,1561852800,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.800302,-87.672052,6,2
3,1306368000,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.800025,-87.676911,3,2
4,1144281600,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.796,-87.64831,3,0


### Convert latitude and longitude to ratio data

In [31]:
# we should also remove lat and long after

### Save dataset 3: Trimmed chicago dataset

In [32]:
# optional step to save to csv
df.to_csv('chicago_trimmed_cleaned.csv', index=False)

## Dataset 4: Trimmed chicago dataset with police data


In [33]:
csv_file = '../datasets/chicago.csv'
t1 = time.time()
df = pd.read_csv(csv_file)
t2 = time.time()
print(f'time: {t2-t1}')

df = df[:10000]

path = '../datasets/law_enforcement_stats.xlsx'
df_police = pd.read_excel(path)
df_police

time: 29.413494110107422


Unnamed: 0,Year,Population,Total Law Enforcement,Officers,Civilians
0,2001,2895995,15066.0,13581.0,1485.0
1,2002,2881295,14932.0,13609.0,1323.0
2,2003,2866361,14777.0,13553.0,1224.0
3,2004,2848996,14325.0,13326.0,999.0
4,2005,2824584,14442.0,13267.0,1175.0
5,2006,2806391,14692.0,13624.0,1068.0
6,2007,2824434,14736.0,13671.0,1065.0
7,2008,2829304,14307.0,13359.0,948.0
8,2009,2848431,13960.0,13088.0,872.0
9,2010,2833649,13318.0,12515.0,803.0


### Find mean and add it to the missing rows
To find the average for 2013 i'm taking the mean of 2012 and 2014 instead of the overall mean. This is to get a more precise number of the specific timeframe. 

In [34]:
officerMean2013 = (df_police['Officers'].iloc[11] + df_police['Officers'].iloc[13]) / 2
civilanMean2013 = (df_police['Civilians'].iloc[11] + df_police['Civilians'].iloc[13]) / 2

officerMean2013, civilanMean2013

(11989.0, 845.0)

In [35]:
officerMean2015 = (df_police['Officers'].iloc[13] + df_police['Officers'].iloc[15]) / 2
civilanMean2015 = (df_police['Civilians'].iloc[13] + df_police['Civilians'].iloc[15]) / 2

officerMean2015, civilanMean2015

(11994.0, 1024.5)

In [36]:
df_police.loc[[12], 'Officers'] = officerMean2013
df_police.loc[[12], 'Civilians'] = civilanMean2013
df_police.loc[[14], 'Officers'] = officerMean2015
df_police.loc[[14], 'Civilians'] = civilanMean2015

In [37]:
df_police = df_police.drop(['Total Law Enforcement'], axis=1)

In [38]:
# renaming columns to match the case convetion from chicago dataset
df_police = df_police.rename(columns={"Year":"year"})
df_police = df_police.rename(columns={"Population":"population"})
df_police = df_police.rename(columns={"Officers":"officers"})
df_police = df_police.rename(columns={"Civilians":"civilians"})

### Merge the two datasets
Adding the civilians, officers and population features to the corresponding year in the chicago dataset 

In [39]:
df = pd.merge(df, df_police, on='year')
df.head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location,population,officers,civilians
0,8117160,HT351468,2011-06-17 02:00:00+00:00,049XX S HONORE ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,1164845.0,1872176.0,2011,2018-02-10 03:50:01+00:00,41.804856,-87.670945,"(41.804856393, -87.670945337)",2703713,12092.0,707.0
1,8083023,HT315648,2011-05-26 09:00:00+00:00,051XX S HOYNE AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,1163232.0,1870403.0,2011,2016-02-04 06:33:39+00:00,41.800025,-87.676911,"(41.800025024, -87.676910797)",2703713,12092.0,707.0
2,8292372,HT526665,2011-10-03 08:30:00+00:00,019XX W 50TH ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,1164477.0,1871523.0,2011,2018-02-10 03:50:01+00:00,41.803072,-87.672313,"(41.803072258, -87.672313423)",2703713,12092.0,707.0
3,8148792,HT383629,2011-07-05 07:00:00+00:00,043XX S MARSHFIELD AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,1166064.0,1875881.0,2011,2018-02-10 03:50:01+00:00,41.814997,-87.666369,"(41.814997466, -87.666369113)",2703713,12092.0,707.0
4,8368157,HT464629,2011-08-25 03:00:00+00:00,049XX S HERMITAGE AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,1165507.0,1872189.0,2011,2018-02-10 03:50:01+00:00,41.804878,-87.668517,"(41.804878041, -87.668517036)",2703713,12092.0,707.0


### Drop columns

In [40]:
columns_to_drop = ['unique_key', 'case_number', 'block', 'iucr', 'domestic', 'beat', 'district', 'ward', 'community_area', 'fbi_code', 'x_coordinate', 'y_coordinate', 'year', 'updated_on', 'location']
df = df.drop(columns_to_drop, axis=1)


### Drop missing values

In [41]:
df = df.dropna()
df.isnull().sum()

date                    0
primary_type            0
description             0
location_description    0
arrest                  0
latitude                0
longitude               0
population              0
officers                0
civilians               0
dtype: int64

### Date
Extract time of month

Extract day of week

Convert date to epoch

In [42]:
# remove time
df['date'] = df['date'].str.slice(stop=10)

# convert str to datetime object
df['date'] = pd.to_datetime(df['date'])

# get day of week
df['date_day_of_week'] = df['date'].dt.dayofweek

# get time of month
# if first cond is true take the first value from choice list
# if second cond is true take the second value from choice list
# else take default
day = df['date'].dt.day
df['time_of_month'] = np.select(condlist=[day < 10, day < 20], choicelist=[0,1], default=2)
df.head()

Unnamed: 0,date,primary_type,description,location_description,arrest,latitude,longitude,population,officers,civilians,date_day_of_week,time_of_month
0,2011-06-17,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.804856,-87.670945,2703713,12092.0,707.0,4,1
1,2011-05-26,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.800025,-87.676911,2703713,12092.0,707.0,3,2
2,2011-10-03,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.803072,-87.672313,2703713,12092.0,707.0,0,0
3,2011-07-05,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.814997,-87.666369,2703713,12092.0,707.0,1,0
4,2011-08-25,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.804878,-87.668517,2703713,12092.0,707.0,3,2


In [43]:
# convert to UNIX/EPOCH
df['date'] = (df['date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
df.head()

Unnamed: 0,date,primary_type,description,location_description,arrest,latitude,longitude,population,officers,civilians,date_day_of_week,time_of_month
0,1308268800,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.804856,-87.670945,2703713,12092.0,707.0,4,1
1,1306368000,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.800025,-87.676911,2703713,12092.0,707.0,3,2
2,1317600000,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.803072,-87.672313,2703713,12092.0,707.0,0,0
3,1309824000,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.814997,-87.666369,2703713,12092.0,707.0,1,0
4,1314230400,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,41.804878,-87.668517,2703713,12092.0,707.0,3,2


### Convert latitude and longitude to ratio data

### Save dataset 4: Trimmed chicago dataset with police data

In [44]:
df.to_csv('chicago_with_police_trimmed_cleaned.csv', index=False)

# Preparing for training

### Load datasets

In [38]:
chicago_df = pd.read_csv('chicago_cleaned.csv')
chicago_police_df = pd.read_csv('chicago_with_police_cleaned.csv')

chicago_trimmed_df = pd.read_csv('chicago_trimmed_cleaned.csv')
chicago_trimmed_police_df = pd.read_csv('chicago_with_police_trimmed_cleaned.csv')

### One hot encoding

In [40]:
def one_hot_encode(dataset, columns: list[str]):
    one_hot_encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')

    for col in columns:
        encoded_df = pd.DataFrame(one_hot_encoder.fit_transform(dataset[[col]]).toarray())
        encoded_df.columns = one_hot_encoder.get_feature_names_out([col])

        dataset = dataset.join(encoded_df)

    return dataset

In [41]:
full_columns = ['block', 'case_number', 'iucr', 'primary_type', 'description', 'location_description', 'fbi_code']
trimmed_columns = ['primary_type', 'description', 'location_description']

chicago_df = one_hot_encode(chicago_df, full_columns)
chicago_police_df = one_hot_encode(chicago_police_df, full_columns)

chicago_trimmed_df = one_hot_encode(chicago_trimmed_df, trimmed_columns)
chicago_trimmed_police_df = one_hot_encode(chicago_trimmed_police_df, trimmed_columns)


### Stratified Shuffle Split - train, test, validation

In [42]:
def stratified_shuffle_split_train_test_valid(df, category_threshold=20):
    # Creating category of combined primary type and arrested or not
    df['primary_type_arrest_cat'] = df['primary_type'] + df['arrest'].astype(str)

    # removing categories with only one entry
    df = df.groupby('primary_type_arrest_cat').filter(lambda x : len(x) > category_threshold)
    
    # resetting index as we remove rows above
    df.reset_index(inplace=True, drop=True)

    # creating shuffle from sklearn
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=42)

    # splitting full dataset into a training set and a test set
    for train_index, test_index in split.split(df, df['primary_type_arrest_cat']):
        train_set = df.loc[train_index]
        test_valid_set = df.loc[test_index]
        
    # creating a new shuffler
    split2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

    # spliiting test set into test and validation set
    for train_index, test_index in split2.split(test_valid_set, test_valid_set['primary_type_arrest_cat']):
        test_set = df.loc[train_index]
        valid_set = df.loc[test_index]
    
    # validate the results
    full_df = df["primary_type_arrest_cat"].value_counts() / len(df)
    train_df = train_set["primary_type_arrest_cat"].value_counts() / len(train_set)
    test_df = test_set["primary_type_arrest_cat"].value_counts() / len(test_set)
    valid_df = valid_set["primary_type_arrest_cat"].value_counts() / len(valid_set)

    #print(f'Full dataset\n{full_df[:3]} \n')
    #print(f'Train dataset\n{train_df[:3]} \n')
    #print(f'Test dataset\n{test_df[:3]} \n')
    #print(f'Validation dataset\n{valid_df[:3]} \n')
    #print('***********************\n')
    
    train_set = train_set.drop(['primary_type_arrest_cat'], axis=1)
    test_set = test_set.drop(['primary_type_arrest_cat'], axis=1)
    valid_set = valid_set.drop(['primary_type_arrest_cat'], axis=1)
    
    return {'train': train_set, 'test': test_set, 'valid': valid_set}


In [43]:
chicago_sets = stratified_shuffle_split_train_test_valid(chicago_df)
chicago_police_sets = stratified_shuffle_split_train_test_valid(chicago_police_df)

chicago_trimmed_sets = stratified_shuffle_split_train_test_valid(chicago_trimmed_df)
chicago_trimmed_police_sets = stratified_shuffle_split_train_test_valid(chicago_trimmed_police_df)

### Drop unused columns

In [45]:
def drop_unused_columns(datasets, columns: list[str]):
    for dataset in datasets.values():
        dataset.drop(columns, axis=1, inplace=True)

    return datasets

In [46]:
chicago_sets = drop_unused_columns(chicago_sets, full_columns)
chicago_police_sets = drop_unused_columns(chicago_police_sets, full_columns)

chicago_trimmed_sets = drop_unused_columns(chicago_trimmed_sets, trimmed_columns)
chicago_trimmed_police_sets = drop_unused_columns(chicago_trimmed_police_sets, trimmed_columns)

In [48]:
chicago_police_sets['train'].head()

Unnamed: 0,unique_key,date,arrest,domestic,beat,district,ward,community_area,x_coordinate,y_coordinate,...,fbi_code_14,fbi_code_15,fbi_code_16,fbi_code_17,fbi_code_18,fbi_code_19,fbi_code_20,fbi_code_22,fbi_code_24,fbi_code_26
901,8427580,1325721600,0,0,831,8.0,18.0,66.0,1158271.0,1857240.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
621,8773679,1346025600,1,1,824,8.0,14.0,63.0,1156559.0,1866006.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5088,10975119,1496966400,0,0,825,8.0,15.0,66.0,1161478.0,1861938.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5617,4477745,1133913600,0,1,824,8.0,16.0,66.0,1160387.0,1865288.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4228,6537425,1223164800,0,0,915,9.0,16.0,61.0,1164189.0,1871850.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Normalize values
Using MinMaxScaler from scikitlearn. If there is too many outliers it will not perform as well because values is only between 0 and 1 

In [49]:
def scale_df(datasets):  
    for name, dataset in datasets.items():
        # create a scaler object
        scaler = preprocessing.MinMaxScaler()
        
        # fit and transform the data
        scaled = pd.DataFrame(scaler.fit_transform(dataset), columns=dataset.columns)
        datasets[name] = scaled
    
    return datasets

In [50]:
chicago_sets = scale_df(chicago_sets)
chicago_police_sets = scale_df(chicago_police_sets)

chicago_trimmed_sets = scale_df(chicago_trimmed_sets)
chicago_trimmed_police_sets = scale_df(chicago_trimmed_police_sets)

In [52]:
chicago_police_sets['train'].head()

Unnamed: 0,unique_key,date,arrest,domestic,beat,district,ward,community_area,x_coordinate,y_coordinate,...,fbi_code_14,fbi_code_15,fbi_code_16,fbi_code_17,fbi_code_18,fbi_code_19,fbi_code_20,fbi_code_22,fbi_code_24,fbi_code_26
0,0.647915,0.578109,0.0,0.0,0.679012,0.75,0.75,0.833333,0.384923,0.480147,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.680384,0.612133,1.0,1.0,0.657407,0.75,0.55,0.333333,0.333071,0.691866,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.886905,0.865064,0.0,0.0,0.660494,0.75,0.6,0.833333,0.482055,0.593614,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.277373,0.256696,0.0,1.0,0.657407,0.75,0.65,0.833333,0.449011,0.674524,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.470596,0.406255,0.0,0.0,0.938272,1.0,0.65,0.0,0.564164,0.833011,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Extract feature and target

In [53]:
def extract_target(datasets):
    X = {}
    y = {}
    
    for name, dataset in datasets.items():
        X[name] = dataset.drop(['arrest'], axis=1)
        y[name] = dataset['arrest']
        
    return X, y

In [54]:
chicago_sets_X, chicago_sets_y = extract_target(chicago_sets)
chicago_police_sets_X, chicago_police_sets_y = extract_target(chicago_police_sets)

chicago_trimmed_sets_X, chicago_trimmed_sets_y = extract_target(chicago_trimmed_sets)
chicago_trimmed_police_sets_X, chicago_trimmed_police_sets_y = extract_target(chicago_trimmed_police_sets)

In [56]:
print(chicago_sets_y['train'].head())
chicago_sets_X['train'].head()

0    0.0
1    1.0
2    1.0
3    0.0
4    0.0
Name: arrest, dtype: float64


Unnamed: 0,unique_key,date,domestic,beat,district,ward,community_area,x_coordinate,y_coordinate,year,...,fbi_code_14,fbi_code_15,fbi_code_16,fbi_code_17,fbi_code_18,fbi_code_19,fbi_code_20,fbi_code_22,fbi_code_24,fbi_code_26
0,0.306092,0.264404,0.0,0.654321,0.75,0.5,0.666667,0.229922,0.632509,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.517057,0.421994,0.0,0.351852,0.5,0.65,1.0,0.642176,0.60996,0.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.732162,0.636675,1.0,0.679012,0.75,0.6,0.833333,0.418824,0.532497,0.65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.730929,0.635359,1.0,0.660494,0.75,0.6,0.833333,0.491963,0.624261,0.65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.8774,0.811892,0.0,0.348765,0.5,0.7,1.0,0.695184,0.529314,0.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### PCA

In [61]:
def apply_pca(datasets, variance=0.99):
    pca_dataset = {}
    pca = PCA(n_components=variance)
    pca.fit(datasets['train'])
    
    for name, dataset in datasets.items():
        df_pca = pca.transform(dataset)
        pca_dataset[name] = df_pca

    return pca_dataset

In [62]:
chicago_sets_pca_X = apply_pca(chicago_sets_X)
chicago_police_sets_pca_X = apply_pca(chicago_police_sets_X)

chicago_trimmed_sets_pca_X = apply_pca(chicago_trimmed_sets_X)
chicago_trimmed_police_sets_pca_X = apply_pca(chicago_trimmed_police_sets_X)

### Shape check

In [69]:
chicago_police_sets_X['train'].shape, chicago_police_sets_X['test'].shape, chicago_police_sets_X['valid'].shape

((5542, 11832), (1848, 11832), (1848, 11832))

In [68]:
#PCA
chicago_police_sets_pca_X['train'].shape, chicago_police_sets_pca_X['test'].shape, chicago_police_sets_pca_X['valid'].shape



((5542, 5113), (1848, 5113), (1848, 5113))

# Training - scores

In [65]:
datasets_X = [
    chicago_sets_X, 
    chicago_police_sets_X, 
    chicago_trimmed_sets_X, 
    chicago_trimmed_police_sets_X,
    
    chicago_sets_pca_X, 
    chicago_police_sets_pca_X, 
    chicago_trimmed_sets_pca_X, 
    chicago_trimmed_police_sets_pca_X
]

datasets_y = [
    chicago_sets_y, 
    chicago_police_sets_y, 
    chicago_trimmed_sets_y, 
    chicago_trimmed_police_sets_y,
]*2


classifier_names = [
    "Neural Net (100, 100), adam",
    "Neural Net (100, 100), sgd", # hidden layers
]

classifiers = [
    MLPClassifier(hidden_layer_sizes=(100, 100)),
    MLPClassifier(hidden_layer_sizes=(100, 100), solver=('sgd'))
]



In [66]:
def write_to_file(results):
    with open(f'result_{time.time()}.txt', 'w+') as file:
        file.writelines(results)

In [67]:
results = []
for dataset_X, dataset_y in zip(datasets_X, datasets_y):
    for clf, clf_name in zip(classifiers, classifier_names):
        results.append(f"** {clf_name} \n")
        print(f"** {clf_name}")
        
        t0 = time.time()
        clf.fit(dataset_X['train'], dataset_y['train'])
        t1 = time.time()
        
        results.append(f"\tTraining time:\t\t{t1-t0:3.3f}\n")
        print(f"\tTraining time:\t\t{t1-t0:3.3f}\n")

        score_train = clf.score(dataset_X['train'], dataset_y['train']) # COULD BE LIMITED TO X
        
        t2 = time.time()
        results.append(f"\tPrediction time(train):\t{t2-t1:3.3f}\n")
        print(f"\tPrediction time(train):\t{t2-t1:3.3f}\n")

        score_test = clf.score(dataset_X['test'], dataset_y['test'])
        t3 = time.time()
        results.append(f"\tPrediction time(test):\t{t3-t2:3.3f}\n")
        print(f"\tPrediction time(test):\t{t3-t2:3.3f}\n")

        results.append(f"\tScore Train: {score_train:.3f}\tScore Test: {score_test:.3f}\n")
        print(f"\tScore Train: {score_train:.3f}\tScore Test: {score_test:.3f}\n")


write_to_file(results)
print(results)

** Neural Net (100, 100), adam
	Training time:		18.489

	Prediction time(train):	0.169

	Prediction time(test):	0.076

	Score Train: 1.000	Score Test: 0.950

['** Neural Net (100, 100), adam \n', '\tTraining time:\t\t18.489\n', '\tPrediction time(train):\t0.169\n', '\tPrediction time(test):\t0.076\n', '\tScore Train: 1.000\tScore Test: 0.950\n']
