# Step 1: Imports

In [12]:
# These are the imports; nothing special.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report
)
from xgboost import XGBClassifier  # pip3 install xgboost  and  brew install libomp
import warnings
warnings.filterwarnings("ignore")


# Step 2: Loading and Inspecting the Data

In [3]:
# Cleaned dataset supplied by Andria. Until it was available, the raw export
# '../data/Crimes_-_2015_20260102.csv' served as a stand-in.
filename = '../data/crime_data_clean.csv'
df = pd.read_csv(filename)
df.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Latitude,Longitude,Location,Hour,Day_of_Week,Month,Is_Weekend,Season,Area_Hour_Crime_Count,Area_Season_Crime_Count
0,10365064,HZ100370,2015-12-31 23:59:00,075XX S EMERALD AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,41.757367,-87.642993,POINT (-87.642992854 41.757366519),23,3,12,0,Winter,351,1416
1,10364662,HZ100006,2015-12-31 23:55:00,079XX S STONY ISLAND AVE,430,BATTERY,AGGRAVATED: OTHER DANG WEAPON,STREET,False,False,...,41.75127,-87.585822,POINT (-87.585822373 41.751270452),23,3,12,0,Winter,53,303
2,10364740,HZ100010,2015-12-31 23:50:00,024XX W FARGO AVE,820,THEFT,$500 AND UNDER,APARTMENT,False,False,...,42.016804,-87.690709,POINT (-87.690708662 42.016804165),23,3,12,0,Winter,131,728
3,10364683,HZ100002,2015-12-31 23:50:00,037XX N CLARK ST,460,BATTERY,SIMPLE,SIDEWALK,True,False,...,41.949837,-87.658635,POINT (-87.658635101 41.949837364),23,3,12,0,Winter,255,1084
4,10366580,HZ102701,2015-12-31 23:45:00,050XX W CONCORD PL,1310,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,False,False,...,41.91047,-87.751597,POINT (-87.751597381 41.910469677),23,3,12,0,Winter,767,3708


# Step 3: Let's see what we are working with

In [4]:
# Print each column's dtype and non-null count, then display summary
# statistics (count, mean, std, quartiles, min/max) as the cell output.
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256553 entries, 0 to 256552
Data columns (total 29 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   ID                       256553 non-null  int64  
 1   Case Number              256553 non-null  object 
 2   Date                     256553 non-null  object 
 3   Block                    256553 non-null  object 
 4   IUCR                     256553 non-null  object 
 5   Primary Type             256553 non-null  object 
 6   Description              256553 non-null  object 
 7   Location Description     256553 non-null  object 
 8   Arrest                   256553 non-null  bool   
 9   Domestic                 256553 non-null  bool   
 10  Beat                     256553 non-null  int64  
 11  District                 256553 non-null  int64  
 12  Ward                     256553 non-null  float64
 13  Community Area           256553 non-null  float64
 14  FBI 

Unnamed: 0,ID,Beat,District,Ward,Community Area,X Coordinate,Y Coordinate,Year,Latitude,Longitude,Hour,Day_of_Week,Month,Is_Weekend,Area_Hour_Crime_Count,Area_Season_Crime_Count
count,256553.0,256553.0,256553.0,256553.0,256553.0,256553.0,256553.0,256553.0,256553.0,256553.0,256553.0,256553.0,256553.0,256553.0,256553.0,256553.0
mean,10124770.0,1141.814183,11.190842,22.725468,37.359255,1164751.0,1885320.0,2015.0,41.840913,-87.670959,13.301883,3.0209,6.593998,0.287356,276.76461,1460.945875
std,464776.0,692.548665,6.920132,13.769855,21.298108,15936.87,31098.69,0.0,0.085528,0.05799,6.595156,1.99425,3.34028,0.45253,205.42855,977.664621
min,21714.0,111.0,1.0,1.0,1.0,1116632.0,1813897.0,2015.0,41.64459,-87.846497,0.0,0.0,1.0,0.0,4.0,47.0
25%,10030490.0,612.0,6.0,10.0,23.0,1152583.0,1858493.0,2015.0,41.766957,-87.71512,9.0,1.0,4.0,0.0,121.0,728.0
50%,10144030.0,1022.0,10.0,23.0,32.0,1166169.0,1891042.0,2015.0,41.856728,-87.665784,14.0,3.0,7.0,0.0,236.0,1374.0
75%,10257290.0,1711.0,17.0,34.0,56.0,1176408.0,1908204.0,2015.0,41.903871,-87.627928,19.0,5.0,9.0,1.0,378.0,1981.0
max,13599860.0,2535.0,31.0,50.0,77.0,1205111.0,1951523.0,2015.0,42.022575,-87.524615,23.0,6.0,12.0,1.0,1045.0,4718.0


# Step 4: Let's start: separate features and target, then split into train/test

In [5]:
# Features and target for the arrest-prediction models, restricted to the
# columns considered relevant in the cleaned dataset. (Before the cleaned data
# arrived, every numeric column was used as a feature instead.)
features = ['Primary Type', 'District', 'Ward', 'Community Area', 'Domestic']
target = 'Arrest'

# Cap the working set at 100,000 rows; the fixed seed keeps the sample
# reproducible between runs.
if len(df) > 100000:
    df_sample = df.sample(n=100000, random_state=42)
else:
    df_sample = df

X = df_sample[features].copy()
y = df_sample[target]

# Replace each text (object-dtype) column with integer category codes so the
# models receive numeric input. One-hot encoding via
# pd.get_dummies(X, drop_first=True) was tried first but crashed the kernel.
text_columns = X.select_dtypes(include=['object']).columns
for col in text_columns:
    X[col] = X[col].astype('category').cat.codes

# Hold out 20% of the rows for testing; the remaining 80% is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# Model 1: Logistic Regression

In [6]:
# Baseline linear model: balanced class weights, up to 1000 solver iterations,
# and a fixed seed. fit() returns the fitted estimator, so it is chained here.
log_reg = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Predict on the held-out split and print per-class precision/recall/F1.
y_pred_log_reg = log_reg.predict(X_test)
print(classification_report(y_test, y_pred_log_reg))


              precision    recall  f1-score   support

       False       0.80      0.48      0.60     14691
        True       0.32      0.67      0.43      5309

    accuracy                           0.53     20000
   macro avg       0.56      0.57      0.52     20000
weighted avg       0.67      0.53      0.56     20000



# Model 2: Random Forest

In [8]:
# Random forest: 100 trees, each limited to depth 10, balanced class weights,
# a fixed seed, and n_jobs=-1 to train in parallel on all available cores.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Predict on the held-out split and print per-class precision/recall/F1.
y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

       False       0.86      0.91      0.89     14691
        True       0.71      0.60      0.65      5309

    accuracy                           0.83     20000
   macro avg       0.78      0.75      0.77     20000
weighted avg       0.82      0.83      0.82     20000

