# Step 1: Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report
)
import warnings
warnings.filterwarnings("ignore")
# this is imports nothing special

# Step 2: Loading and Inspecting the Data

In [10]:
filename = '../data/Crimes_-_2015_20260102.csv' # this is mock until andria gives me cleaned one
df = pd.read_csv(filename)
df.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,10460641,HZ199559,12/31/2015 11:59:00 PM,015XX N KEDZIE AVE,890,THEFT,FROM BUILDING,RESIDENCE PORCH/HALLWAY,False,False,...,26.0,23.0,06,,,2015,2018 Feb 09 03:44:29 PM,,,
1,10365064,HZ100370,12/31/2015 11:59:00 PM,075XX S EMERALD AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,17.0,68.0,14,1172605.0,1854931.0,2015,2018 Feb 10 03:50:01 PM,41.757367,-87.642993,POINT (-87.642992854 41.757366519)
2,10364662,HZ100006,12/31/2015 11:55:00 PM,079XX S STONY ISLAND AVE,430,BATTERY,AGGRAVATED: OTHER DANG WEAPON,STREET,False,False,...,8.0,45.0,04B,1188223.0,1852840.0,2015,2018 Feb 10 03:50:01 PM,41.75127,-87.585822,POINT (-87.585822373 41.751270452)
3,10364740,HZ100010,12/31/2015 11:50:00 PM,024XX W FARGO AVE,820,THEFT,$500 AND UNDER,APARTMENT,False,False,...,50.0,2.0,06,1158878.0,1949369.0,2015,2018 Feb 10 03:50:01 PM,42.016804,-87.690709,POINT (-87.690708662 42.016804165)
4,10364683,HZ100002,12/31/2015 11:50:00 PM,037XX N CLARK ST,460,BATTERY,SIMPLE,SIDEWALK,True,False,...,44.0,6.0,08B,1167786.0,1925033.0,2015,2018 Feb 10 03:50:01 PM,41.949837,-87.658635,POINT (-87.658635101 41.949837364)


# Step 3: Lets see what we are working with

In [13]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264882 entries, 0 to 264881
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   ID                    264882 non-null  int64  
 1   Case Number           264882 non-null  object 
 2   Date                  264882 non-null  object 
 3   Block                 264882 non-null  object 
 4   IUCR                  264882 non-null  object 
 5   Primary Type          264882 non-null  object 
 6   Description           264882 non-null  object 
 7   Location Description  264269 non-null  object 
 8   Arrest                264882 non-null  bool   
 9   Domestic              264882 non-null  bool   
 10  Beat                  264882 non-null  int64  
 11  District              264882 non-null  int64  
 12  Ward                  264880 non-null  float64
 13  Community Area        264869 non-null  float64
 14  FBI Code              264882 non-null  object 
 15  

Unnamed: 0,ID,Beat,District,Ward,Community Area,X Coordinate,Y Coordinate,Year,Latitude,Longitude
count,264882.0,264882.0,264882.0,264880.0,264869.0,257908.0,257908.0,264882.0,257908.0,257908.0
mean,10140240.0,1144.086476,11.212049,22.814055,37.575352,1164457.0,1885559.0,2015.0,41.841573,-87.672034
std,481429.4,690.799327,6.90176,13.781026,21.419613,16466.91,31208.71,0.0,0.085836,0.059937
min,21714.0,111.0,1.0,1.0,1.0,1094231.0,1813897.0,2015.0,41.64459,-87.928909
25%,10033850.0,612.0,6.0,10.0,23.0,1152408.0,1858594.0,2015.0,41.76733,-87.715914
50%,10148760.0,1023.0,10.0,23.0,32.0,1166064.0,1891470.0,2015.0,41.857848,-87.666011
75%,10263250.0,1654.0,16.0,34.0,57.0,1176389.0,1908452.0,2015.0,41.904503,-87.627995
max,14057950.0,2535.0,31.0,50.0,77.0,1205111.0,1951523.0,2015.0,42.022575,-87.524615


# Step 4: Lets start, separate features and target and split into train/test

In [15]:
target = 'Arrest' # assuming we are predicting arrest
X = df.select_dtypes(include=['number']).drop(columns=[target], errors='ignore') # we only use numeric columns because ML models can't read text directly
y = df[target]

# splitting into train test. 80% train 20% test since we know this is recommended
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Step 5: Random Forest

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train) # data is so big it takes 29 seconds

# lets predict
y_pred = rf.predict(X_test)

# accuracy
accuracy = accuracy_score(y_test, y_pred)

# lets see report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       False       0.79      0.89      0.83     38846
        True       0.52      0.34      0.41     14131

    accuracy                           0.74     52977
   macro avg       0.66      0.61      0.62     52977
weighted avg       0.72      0.74      0.72     52977



In [None]:
# balances makes true more important
rf2 = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf2.fit(X_train, y_train)

# checking again
y_pred = rf2.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.79      0.89      0.84     38846
        True       0.53      0.33      0.41     14131

    accuracy                           0.74     52977
   macro avg       0.66      0.61      0.62     52977
weighted avg       0.72      0.74      0.72     52977

