In [68]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

import time
from geopy import distance

In [69]:
csv_file = 'chicago_trimmed.csv'
path = f'{csv_file}'
df = pd.read_csv(path)
org_df = df

In [70]:
df.head()

Unnamed: 0,date,primary_type,description,location_description,arrest,latitude,longitude
0,2010-10-03 11:30:00+00:00,BATTERY,SIMPLE,SIDEWALK,False,41.745809,-87.547926
1,2005-10-31 03:55:00+00:00,BATTERY,SIMPLE,SIDEWALK,False,41.751061,-87.569876
2,2003-10-07 02:30:00+00:00,BATTERY,SIMPLE,APARTMENT,True,41.751215,-87.571098
3,2009-01-22 02:30:00+00:00,BATTERY,SIMPLE,"SCHOOL, PUBLIC, BUILDING",False,41.73365,-87.557845
4,2020-07-03 12:15:00+00:00,BATTERY,SIMPLE,RESIDENCE,False,41.749243,-87.549214


In [71]:
test_data = df[:10] # Just for testing
central_point = (41.8835079, -87.632065) # Town hall

# Convert 2 geographic coordinates to an angle. One point is the central_point
# while the other is the location of interest, moving clockwise from.
def angle_between(p1, p2):
    ang1 = np.arctan2(*p1[::-1])
    ang2 = np.arctan2(*p2[::-1])
    return np.rad2deg((ang1 - ang2) % (2 * np.pi))

# Convert lat and long coords.
distances = []
angles = []
for index, row in test_data.iterrows():
    # Convert lat, long to distance relative to central_point.
    dist = distance.distance((row['latitude'], row['longitude']), central_point).km
    distances.append(dist)
    
    # TEST
    # angle = angle_between((1, 0), (1, -1)) # Should return 45.
    # angle = angle_between((1, -1), (1, 0)) # Should return 315.
    angle = angle_between(central_point, (row['latitude'], row['longitude']))
    angles.append(angle)
    
test_data['distance'] = distances
test_data['angle'] = angles
print(test_data)




Test data
                        date primary_type              description  \
0  2010-10-03 11:30:00+00:00      BATTERY                   SIMPLE   
1  2005-10-31 03:55:00+00:00      BATTERY                   SIMPLE   
2  2003-10-07 02:30:00+00:00      BATTERY                   SIMPLE   
3  2009-01-22 02:30:00+00:00      BATTERY                   SIMPLE   
4  2020-07-03 12:15:00+00:00      BATTERY                   SIMPLE   
5  2003-01-27 04:20:00+00:00      BATTERY                   SIMPLE   
6  2003-06-02 03:30:00+00:00      BATTERY                   SIMPLE   
7  2015-05-18 04:00:00+00:00      BATTERY                   SIMPLE   
8  2010-08-02 04:30:00+00:00      BATTERY  DOMESTIC BATTERY SIMPLE   
9  2010-09-17 10:10:00+00:00      BATTERY  DOMESTIC BATTERY SIMPLE   

       location_description  arrest   latitude  longitude   distance     angle  
0                  SIDEWALK   False  41.745809 -87.547926  16.816309  0.051958  
1                  SIDEWALK   False  41.751061 -87.569876

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['distance'] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['angle'] = angles


In [9]:
# take a sample for testing
df = df[:100010]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100010 entries, 0 to 100009
Data columns (total 7 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   date                  100010 non-null  object 
 1   primary_type          100010 non-null  object 
 2   description           100010 non-null  object 
 3   location_description  100010 non-null  object 
 4   arrest                100010 non-null  bool   
 5   latitude              100010 non-null  float64
 6   longitude             100010 non-null  float64
dtypes: bool(1), float64(2), object(4)
memory usage: 4.7+ MB


In [14]:
"""
df['date'] = pd.to_datetime(df['date'], errors='coerce')

df['date_year'] = df['date'].dt.year
df['date_month'] = df['date'].dt.month
df['date_day'] = df['date'].dt.day
df['date_dayofweek'] = df['date'].dt.dayofweek
df['date_hour'] = df['date'].dt.hour
df['date_minute'] = df['date'].dt.minute

df = df.drop(['date'], axis=1)
"""

"\ndf['date'] = pd.to_datetime(df['date'], errors='coerce')\n\ndf['date_year'] = df['date'].dt.year\ndf['date_month'] = df['date'].dt.month\ndf['date_day'] = df['date'].dt.day\ndf['date_dayofweek'] = df['date'].dt.dayofweek\ndf['date_hour'] = df['date'].dt.hour\ndf['date_minute'] = df['date'].dt.minute\n\ndf = df.drop(['date'], axis=1)\n"

In [15]:
df.head()

Unnamed: 0,primary_type,description,location_description,arrest,latitude,longitude,date_year,date_month,date_day,date_dayofweek,date_hour,date_minute
0,BATTERY,SIMPLE,SIDEWALK,False,41.745809,-87.547926,2010,10,3,6,11,30
1,BATTERY,SIMPLE,SIDEWALK,False,41.751061,-87.569876,2005,10,31,0,3,55
2,BATTERY,SIMPLE,APARTMENT,True,41.751215,-87.571098,2003,10,7,1,2,30
3,BATTERY,SIMPLE,"SCHOOL, PUBLIC, BUILDING",False,41.73365,-87.557845,2009,1,22,3,2,30
4,BATTERY,SIMPLE,RESIDENCE,False,41.749243,-87.549214,2020,7,3,4,12,15


In [16]:
len(df['location_description'].unique())

163

### Label encode the non-numerical features

In [17]:
%%perl -e0
label_encoder = preprocessing.LabelEncoder()
df['primary_type'] = label_encoder.fit_transform(df['primary_type'])
df['description'] = label_encoder.fit_transform(df['description'])
df['location_description'] = label_encoder.fit_transform(df['location_description'])

Couldn't find program: 'perl'


### One hot encode the label encoded features

In [18]:
one_hot_encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')

prim_type_df = pd.DataFrame(one_hot_encoder.fit_transform(df[['primary_type']]).toarray())
prim_type_df.columns = one_hot_encoder.get_feature_names_out(['primary_type'])

description_df = pd.DataFrame(one_hot_encoder.fit_transform(df[['description']]).toarray())
description_df.columns = one_hot_encoder.get_feature_names_out(['description'])

location_description_df = pd.DataFrame(one_hot_encoder.fit_transform(df[['location_description']]).toarray())
location_description_df.columns = one_hot_encoder.get_feature_names_out(['location_description'])

df = df.join(prim_type_df)
df = df.join(description_df)
df = df.join(location_description_df)

AttributeError: 'OneHotEncoder' object has no attribute 'get_feature_names_out'

In [10]:
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100010 entries, 0 to 100009
Columns: 596 entries, primary_type to location_description_YARD
dtypes: bool(1), float64(586), int64(6), object(3)
memory usage: 454.1+ MB


(100010, 596)

In [11]:
df.head()

Unnamed: 0,primary_type,description,location_description,arrest,latitude,longitude,date_year,date_month,date_day,date_dayofweek,...,location_description_VACANT LOT/LAND,location_description_VEHICLE - COMMERCIAL,location_description_VEHICLE - DELIVERY TRUCK,location_description_VEHICLE - OTHER RIDE SERVICE,"location_description_VEHICLE - OTHER RIDE SHARE SERVICE (E.G., UBER, LYFT)","location_description_VEHICLE - OTHER RIDE SHARE SERVICE (LYFT, UBER, ETC.)",location_description_VEHICLE NON-COMMERCIAL,location_description_VEHICLE-COMMERCIAL,location_description_WAREHOUSE,location_description_YARD
0,BATTERY,SIMPLE,SIDEWALK,False,41.745809,-87.547926,2010,10,3,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BATTERY,SIMPLE,SIDEWALK,False,41.751061,-87.569876,2005,10,31,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BATTERY,SIMPLE,APARTMENT,True,41.751215,-87.571098,2003,10,7,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,BATTERY,SIMPLE,"SCHOOL, PUBLIC, BUILDING",False,41.73365,-87.557845,2009,1,22,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BATTERY,SIMPLE,RESIDENCE,False,41.749243,-87.549214,2020,7,3,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
df = df.drop(['primary_type', 'description', 'location_description'], axis=1)
df.head()

Unnamed: 0,arrest,latitude,longitude,date_year,date_month,date_day,date_dayofweek,date_hour,date_minute,primary_type_ARSON,...,location_description_VACANT LOT/LAND,location_description_VEHICLE - COMMERCIAL,location_description_VEHICLE - DELIVERY TRUCK,location_description_VEHICLE - OTHER RIDE SERVICE,"location_description_VEHICLE - OTHER RIDE SHARE SERVICE (E.G., UBER, LYFT)","location_description_VEHICLE - OTHER RIDE SHARE SERVICE (LYFT, UBER, ETC.)",location_description_VEHICLE NON-COMMERCIAL,location_description_VEHICLE-COMMERCIAL,location_description_WAREHOUSE,location_description_YARD
0,False,41.745809,-87.547926,2010,10,3,6,11,30,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,False,41.751061,-87.569876,2005,10,31,0,3,55,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,True,41.751215,-87.571098,2003,10,7,1,2,30,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,False,41.73365,-87.557845,2009,1,22,3,2,30,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,False,41.749243,-87.549214,2020,7,3,4,12,15,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Cast numerical datatype to ratio data

In [13]:
# should coordinates be ratio?

### Create feature and target

In [14]:
X = df.drop(['arrest'], axis=1)
y = df['arrest'].astype(int) # convert boolean to int

In [15]:
X_train = X[:60000]
y_train = y[:60000]
X_test = X[60000:100000]
y_test = y[60000:100000]
X_val = X[100000:]
y_val = y[100000:]

In [16]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100010 entries, 0 to 100009
Columns: 592 entries, latitude to location_description_YARD
dtypes: float64(586), int64(6)
memory usage: 451.7 MB


In [19]:
classifier_names = [
    #"Random Forest (n_estimators=100, n_jobs=-1)", 
    #"Random Forest (n_estimators=500, n_jobs=-1)", 
    #"Random Forest (n_estimators=1000, n_jobs=-1)", 
    "Neural Net (100, 100), adam",
    "Neural Net (100, 100), sgd", 
    "Neural Net (100, 100, 100), adam", 
    "Neural Net (592, 296, 148), adam", 
]

classifiers = [
    #RandomForestClassifier(n_estimators=100, n_jobs=-1),
    #RandomForestClassifier(n_estimators=500, n_jobs=-1),
    #RandomForestClassifier(n_estimators=1000, n_jobs=-1),
    MLPClassifier(hidden_layer_sizes=(100, 100)),
    MLPClassifier(hidden_layer_sizes=(100, 100), solver=('sgd')),
    MLPClassifier(hidden_layer_sizes=(100, 100, 100)),
    MLPClassifier(hidden_layer_sizes=(592, 296, 148)),
]

for clf, clf_name in zip(classifiers, classifier_names):
    print(f"** {clf_name}")
    t0 = time.time()
    clf.fit(X_train, y_train)
    t1 = time.time()
    print(f"\tTraining time:\t\t{t1-t0:3.3f}")
    score_train = clf.score(X_train[0:10000], y_train[0:10000])
    t2 = time.time()
    print(f"\tPrediction time(train):\t{t2-t1:3.3f}")
    score_test = clf.score(X_test, y_test)
    t3 = time.time()
    print(f"\tPrediction time(test):\t{t3-t2:3.3f}")
    print(f"\tScore Train: {score_train:.3f}\tScore Test: {score_test:.3f}")

** Neural Net (100, 100), adam
	Training time:		261.515
	Prediction time(train):	0.105
	Prediction time(test):	0.401
	Score Train: 0.849	Score Test: 0.884
** Neural Net (100, 100), sgd
	Training time:		19.101
	Prediction time(train):	0.082
	Prediction time(test):	0.290
	Score Train: 0.729	Score Test: 0.739
** Neural Net (100, 100, 100), adam
	Training time:		157.047
	Prediction time(train):	0.141
	Prediction time(test):	0.583
	Score Train: 0.846	Score Test: 0.879
** Neural Net (592, 296, 148), adam
	Training time:		1109.105
	Prediction time(train):	0.369
	Prediction time(test):	1.396
	Score Train: 0.848	Score Test: 0.883


## Results from last run 
```
** Random Forest (n_estimators=100, n_jobs=-1)
	Training time:		8.515
	Prediction time(train):	0.175
	Prediction time(test):	0.574
	Score Train: 1.000	Score Test: 0.886
** Random Forest (n_estimators=500, n_jobs=-1)
	Training time:		39.427
	Prediction time(train):	0.667
	Prediction time(test):	2.075
	Score Train: 1.000	Score Test: 0.886
** Random Forest (n_estimators=1000, n_jobs=-1)
	Training time:		73.081
	Prediction time(train):	1.046
	Prediction time(test):	3.565
	Score Train: 1.000	Score Test: 0.886
** Neural Net (100, 100), adam
	Training time:		261.515
	Prediction time(train):	0.105
	Prediction time(test):	0.401
	Score Train: 0.849	Score Test: 0.884
** Neural Net (100, 100), sgd
	Training time:		19.101
	Prediction time(train):	0.082
	Prediction time(test):	0.290
	Score Train: 0.729	Score Test: 0.739
** Neural Net (100, 100, 100), adam
	Training time:		157.047
	Prediction time(train):	0.141
	Prediction time(test):	0.583
	Score Train: 0.846	Score Test: 0.879
** Neural Net (592, 296, 148), adam
	Training time:		1109.105
	Prediction time(train):	0.369
	Prediction time(test):	1.396
	Score Train: 0.848	Score Test: 0.883
    ```

In [18]:
%%perl -e0
model.predict(X_val)
org_df.iloc[100004]