# Crime Patterns in Chicago
*Examining the Relationship Between Daytime and Nighttime Crime Rates*

## Setup

### Imports

In [None]:
!pip install astral

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
# from google.colab import drive
from pyproj import Transformer
from sklearn.preprocessing import LabelEncoder

from astral import LocationInfo
from astral.sun import sun

from scipy.stats import pearsonr
from IPython.display import display
import warnings
warnings.filterwarnings("ignore")

### Load Data

In [None]:
drive.mount('/content/drive')

# NOTE: To get this working, right click the 'In Data We Trust' folder in
#   Google Drive, then add a shortcut. This will then work automatically
#   without having to change the directory.
proj_dir = '/content/drive/MyDrive/CS326 - In Data We Trust'

# This should print the files in the project folder.
!ls "$proj_dir"

In [2]:
# Load the raw records, then normalise the column names: collapse runs of
# whitespace, trim, lower-case, replace spaces with underscores and drop '#'.
df = pd.read_csv("crimes_data_chicago.csv")
df.columns = (
    df.columns
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
    .str.replace('#', '')
)
df.columns

FileNotFoundError: [Errno 2] No such file or directory: 'crimes_data_chicago.csv'

In [None]:
# Preview the first rows of the loaded frame.
df.head()

## Data Cleaning

**TODO:** Explain the reasoning for dropping each of the columns below.

In [None]:
# Drop additional columns (selected by name via the `columns=` keyword).
columns_to_drop = ['case', 'x_coordinate', 'y_coordinate', 'location', 'iucr']
df = df.drop(columns=columns_to_drop)

Drop the rows with a missing location (latitude/longitude) or ward, and print the percentage of rows that remain.

In [None]:
prev_num_rows = len(df)

# Drop every row that is missing a coordinate or a ward in a single pass.
#   - There are 93 rows where NaN values are in latitude and longitude.
#   - Only 1 row (index 230265) has a NaN that is NOT latitude or longitude;
#     it is removed by including `ward` in the subset.
# TODO (Ahmed/Hamood): drop the columns that are unnecessary for the
#   modeling/correlation matrix right before the preprocessing step and save
#   the result as a separate dataframe.
df = df.dropna(subset=['longitude', 'latitude', 'ward'])

curr_num_rows = len(df)

# Percentage of rows kept after the drop.
print(curr_num_rows / prev_num_rows * 100)

Group the less frequent categories (those making up 1% of the rows or less) under `OTHER`.

In [None]:
# A category must cover more than this share of the rows to keep its own label.
threshold = 0.01

# Collapse the rare values of location_description, primary_description,
# secondary_description and fbi_cd into a single "OTHER" bucket.
categorical_cols_to_aggr = ["location_description", "primary_description", "secondary_description", "fbi_cd"]
for col in categorical_cols_to_aggr:
    # Relative frequency of each category (computed once per column).
    counts = df[col].value_counts(normalize=True)
    to_keep = counts[counts > threshold].index

    # Anything not in `to_keep` (missing values included, since they never
    # appear in `to_keep`) is replaced by "OTHER".
    df.loc[:, col] = df[col].where(df[col].isin(to_keep), "OTHER")

    display(df[[col]].value_counts())


Creating the day/night binary column (`is_daytime`)

In [None]:
df["date_of_occurrence"] = pd.to_datetime(df["date_of_occurrence"])
city = LocationInfo("Chicago", "USA", "America/Chicago", 41.8781, -87.6298)

# Sunrise and sunset depend only on the calendar date, so they are computed
# once per date and reused for every row that falls on that date.
_sun_times_by_date = {}


def _sun_times(day):
    """Return the `sun()` result for `day` at `city`, cached per date."""
    if day not in _sun_times_by_date:
        _sun_times_by_date[day] = sun(city.observer, date=day, tzinfo=city.timezone)
    return _sun_times_by_date[day]


def is_daytime(ts):
    """Return 1 if `ts` lies between sunrise and sunset at `city`, else 0.

    `ts` is treated as a naive local wall-clock timestamp and is localized to
    the city's timezone. The function returns 0 when `ts` is missing, when the
    wall-clock time is ambiguous or does not exist because of a DST
    transition, or when sunrise/sunset is missing.
    """
    # Check if the timestamp is NaT before localizing
    if pd.isna(ts):
        return 0  # Or handle missing timestamps as appropriate for your analysis

    # Ambiguous (fall-back) AND nonexistent (spring-forward) wall times become
    # NaT; without `nonexistent='NaT'` the latter raise and abort the `.apply`.
    ts_localized = ts.tz_localize(city.timezone, ambiguous='NaT', nonexistent='NaT')
    if pd.isna(ts_localized):
        return 0  # Or handle as appropriate

    # Sunrise and sunset for the date of the localized timestamp
    s = _sun_times(ts_localized.date())
    if pd.isna(s["sunrise"]) or pd.isna(s["sunset"]):
        return 0  # Or handle as appropriate

    return int(s["sunrise"] <= ts_localized <= s["sunset"])


# Apply the function to the date_of_occurrence column
df["is_daytime"] = df["date_of_occurrence"].apply(is_daytime)

In [None]:
# Preview the first three rows of the frame.
df.head(3)

In [None]:
# Convert the Y/N flags to booleans.
#   - True/False map to themselves, so re-running this cell does not turn an
#     already converted column into NaN.
#   - Plain column assignment replaces the column, letting it take the dtype
#     of the mapped values (on recent pandas `df.loc[:, col] = ...` writes
#     into the existing object column instead).
yes_no_to_bool = {'Y': True, 'N': False, True: True, False: False}
df['arrest'] = df['arrest'].map(yes_no_to_bool)
df['domestic'] = df['domestic'].map(yes_no_to_bool)

## Exploratory Data Analysis

In [None]:
# @title Function for displaying correlation
def show_day_night_correlations(df, target_col, loc_col, daytime_col='is_daytime', alpha=0.01):
    """Compare how each location relates to a binary target by day and by night.

    For every distinct value of `loc_col`, a 0/1 indicator ("row is at this
    location") is correlated (Pearson) with `target_col`, separately for the
    day rows and the night rows. The per-location results are displayed as a
    styled table, followed by a heatmap of the mean of `target_col` per
    location and day/night.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is copied, never modified.
    target_col : str
        Column that can be cast to int (e.g. a boolean flag).
    loc_col : str
        Column whose distinct values are compared.
    daytime_col : str, default 'is_daytime'
        Column equal to 1/True for day rows and 0/False for night rows.
    alpha : float, default 0.01
        P-values strictly below `alpha` are shown in green; everything else,
        including NaN, is shown in red.

    Returns
    -------
    None
        The table and the figure are displayed as a side effect.
    """
    # 1. Work on a copy with a numeric (0/1) target.
    work_df = df.copy()
    work_df[target_col] = work_df[target_col].astype(int)

    # The day/night masks do not depend on the location: build them once.
    mask_day = work_df[daytime_col] == True
    mask_night = work_df[daytime_col] == False

    def _masked_pearson(indicator, mask):
        # pearsonr needs at least two observations; fall back to "no
        # correlation" (r=0, p=1) when the split is too small.
        if mask.sum() < 2:
            return 0, 1.0
        return pearsonr(indicator[mask], work_df.loc[mask, target_col])

    # 2. TABLE: correlations split by day/night.
    results = {}
    for loc in work_df[loc_col].unique():
        loc_binary = (work_df[loc_col] == loc).astype(int)
        r_day, p_day = _masked_pearson(loc_binary, mask_day)
        r_night, p_night = _masked_pearson(loc_binary, mask_night)

        results[loc] = {
            "Corr_Day": r_day, "P_Day": p_day,
            "Corr_Night": r_night, "P_Night": p_night,
            "Diff (Day-Night)": r_day - r_night
        }

    results_df = pd.DataFrame(results).T.sort_values("Diff (Day-Night)", ascending=False)

    def style_sig(val):
        # `val < alpha` is False for NaN, so an undefined p-value is never
        # presented as significant.
        return 'color: green' if val < alpha else 'color: red'

    styler = results_df.style
    # Styler.map is the newer name of Styler.applymap; support both.
    colorize = getattr(styler, "map", None) or styler.applymap
    display(colorize(style_sig, subset=['P_Day', 'P_Night'])
            .format("{:.3f}")
            .background_gradient(subset=['Diff (Day-Night)'], cmap='coolwarm'))

    # 3. VISUALIZATION: heatmap of the target rate per location and day/night.
    pivot_df = work_df.pivot_table(
        index=loc_col,
        columns=daytime_col,
        values=target_col,
        aggfunc='mean'
    )

    # Label each column from its own flag value instead of assuming that both
    # a night and a day column exist, in that order.
    pivot_df.columns = [
        f'Day ({target_col} rate)' if bool(flag) else f'Night ({target_col} rate)'
        for flag in pivot_df.columns
    ]

    fig, ax = plt.subplots(figsize=(8, len(pivot_df) * 0.4 + 2))
    sns.heatmap(pivot_df, annot=True, cmap="Reds", fmt=".1%",
                cbar_kws={'label': 'Probability'}, ax=ax)
    ax.set_title(f"Impact of {daytime_col} on {target_col} by Location")
    ax.set_ylabel("Location")
    plt.show()

In [None]:
# Day vs. night correlation between each location description and `arrest`.
show_day_night_correlations(df, 'arrest', 'location_description')

## Modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier, HistGradientBoostingClassifier


### Prepare training and helper functions

One-hot encode the description and FBI-code columns and feature-hash `beat` and `ward`.
`block` is left out of the features because it has over 28000 unique values and can be represented using `latitude` and `longitude`.

In [None]:
# Time-based features from date_of_occurrence
# NOTE(review): `hour` is left out, presumably because the hour of day almost
#   determines the is_daytime target -- confirm before enabling.
# df['hour'] = df['date_of_occurrence'].dt.hour
df['month'] = df['date_of_occurrence'].dt.month
df['weekday'] = df['date_of_occurrence'].dt.weekday
df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)

# Preprocessing with One-Hot Encoding and Feature Hashing
ohe_cols = ['primary_description', 'secondary_description', 'location_description', 'fbi_cd']
hash_cols = ['beat', 'ward']

# IMPORTANT: every encoded frame below reuses `df.index`. `df` has had rows
# dropped without an index reset, so a fresh 0..n-1 index would make the
# column-wise concat pair encoded rows with the wrong original rows.

# One-hot encoding
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
ohe_encoded = ohe.fit_transform(df[ohe_cols])
ohe_df = pd.DataFrame(
    ohe_encoded,
    columns=ohe.get_feature_names_out(ohe_cols),
    index=df.index
)

# Feature hashing (each value is hashed as a one-token string)
beat_str = df['beat'].fillna(-1).astype(int).astype(str)
ward_str = df['ward'].fillna(-1).astype(int).astype(str)

hasher_beat = FeatureHasher(n_features=128, input_type='string')
beat_hashed = hasher_beat.transform([[s] for s in beat_str])
beat_df = pd.DataFrame(
    beat_hashed.toarray(),
    columns=[f'beat_hash_{i}' for i in range(128)],
    index=df.index
)

hasher_ward = FeatureHasher(n_features=32, input_type='string')
ward_hashed = hasher_ward.transform([[s] for s in ward_str])
ward_df = pd.DataFrame(
    ward_hashed.toarray(),
    columns=[f'ward_hash_{i}' for i in range(32)],
    index=df.index
)

df_encoded = pd.concat([
    df.drop(columns=ohe_cols + hash_cols),
    ohe_df,
    beat_df,
    ward_df
], axis=1)

# Drop rows that still contain a missing value, then renumber the rows.
df_encoded = df_encoded.dropna(axis=0).reset_index(drop=True)

# Train/test split
y = df_encoded['is_daytime']
# keep new time features, drop raw datetime to avoid leaking the full timestamp
X = df_encoded.drop(columns=['block', 'is_daytime', 'date_of_occurrence'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.02, random_state=42
)


In [None]:
def cross_validate_model(model):
    """Cross-validate `model` with 5 folds on the notebook's training split.

    Scores F1, ROC AUC and accuracy on the notebook-level `X_train` and
    `y_train`, prints the mean of each metric across the folds and returns
    the dict produced by `cross_validate`.
    """
    reported_metrics = (
        ("F1 Score", "f1"),
        ("ROC AUC", "roc_auc"),
        ("Accuracy", "accuracy"),
    )

    fold_scores = cross_validate(
        model, X_train, y_train,
        scoring=[metric for _, metric in reported_metrics],
        cv=5,
        n_jobs=-1,
    )

    for label, metric in reported_metrics:
        print(f"Mean {label}: {fold_scores[f'test_{metric}'].mean()}")

    return fold_scores

### KNN

In [None]:
# Baseline: 3-nearest-neighbours on the features as they are (no scaling).
knn = cross_validate_model(KNeighborsClassifier(n_neighbors=3))

In [None]:
# Same 3-NN classifier, but with the features standardised first; the scaler
# is part of the pipeline, so it is re-fit inside every cross-validation fold.
pipeline_knn = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=3)
)

knn = cross_validate_model(pipeline_knn)

In [None]:
# Grid-search the scaled KNN pipeline over neighbourhood size, vote weighting
# and distance metric, selecting by ROC AUC with 5-fold cross-validation.
pipeline_knn = make_pipeline(StandardScaler(), KNeighborsClassifier())

param_grid = dict(
    kneighborsclassifier__n_neighbors=[1, 5, 7],
    kneighborsclassifier__weights=['uniform', 'distance'],
    kneighborsclassifier__metric=['euclidean', 'manhattan'],
)

grid_knn = GridSearchCV(pipeline_knn, param_grid, scoring='roc_auc', cv=5, n_jobs=-1, verbose=2)
grid_knn.fit(X_train, y_train)

print("Best params:", grid_knn.best_params_)
print("Best ROC AUC:", grid_knn.best_score_)

### Logistic Regression

In [None]:
# Baseline logistic regression on unscaled features. The saga solver shuffles
# the data, so the seed is fixed to make the scores reproducible.
lr = cross_validate_model(LogisticRegression(solver='saga', max_iter=2000, random_state=42))

In [None]:
# Logistic regression on standardised features. The saga solver shuffles the
# data, so the seed is fixed to make the scores reproducible.
pipeline_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='saga', max_iter=2000, random_state=42)
)

lr = cross_validate_model(pipeline_lr)

In [None]:
# Grid-search the scaled logistic regression over regularisation strength,
# penalty type and class weighting, selecting by ROC AUC (5-fold CV).
# The saga solver shuffles the data, so the seed is fixed for reproducibility.
pipe_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='saga', max_iter=2000, random_state=42)
)

param_grid = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__class_weight': [None, 'balanced']
}

grid_lr = GridSearchCV(
    estimator=pipe_lr,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid_lr.fit(X_train, y_train)

print("Best params:", grid_lr.best_params_)
print("Best ROC AUC:", grid_lr.best_score_)

In [None]:
# Report F1 / ROC AUC / accuracy for the best pipeline found by the grid search.
lr = cross_validate_model(grid_lr.best_estimator_)

### Decision Tree

In [None]:
# Baseline decision tree with default hyper-parameters; seeded so that the
# scores are reproducible across runs.
dt = cross_validate_model(DecisionTreeClassifier(random_state=42))

In [None]:
# Two-round grid search for the decision tree, optimising F1.
# The tree is seeded because `max_features` < 1 samples features at random;
# without a seed the selected parameters can change from run to run.
# NOTE: scikit-learn documents "entropy" and "log_loss" as the same
#   (Shannon information gain) criterion, so one of them is redundant.
params_round_1 = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [None] + list(range(1, 21)),
    "max_features": np.arange(0.1, 1.0, 0.1),
}

gs_dt_round_1 = GridSearchCV(DecisionTreeClassifier(random_state=42), params_round_1, scoring="f1", cv=3, n_jobs=-1)
gs_dt_round_1.fit(X_train, y_train)

# Round 2: freeze the round-1 winners and tune the split / leaf size limits.
params_round_2 = {
    "criterion": [gs_dt_round_1.best_params_["criterion"]],
    "max_depth": [gs_dt_round_1.best_params_["max_depth"]],
    "max_features": [gs_dt_round_1.best_params_["max_features"]],
    "min_samples_split": list(range(2, 20)),
    "min_samples_leaf": list(range(1, 10)),
}

gs_dt_round_2 = GridSearchCV(DecisionTreeClassifier(random_state=42), params_round_2, scoring="f1", cv=3, n_jobs=-1)
gs_dt_round_2.fit(X_train, y_train)

print(gs_dt_round_2.best_params_)

dt = cross_validate_model(gs_dt_round_2.best_estimator_)

In [None]:
# Two-round grid search for the decision tree, optimising ROC AUC.
# The tree is seeded because `max_features` < 1 samples features at random;
# without a seed the selected parameters can change from run to run.
# NOTE: scikit-learn documents "entropy" and "log_loss" as the same
#   (Shannon information gain) criterion, so one of them is redundant.
params_round_1 = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [None] + list(range(1, 21)),
    "max_features": np.arange(0.1, 1.0, 0.1),
}

gs_dt_round_1 = GridSearchCV(DecisionTreeClassifier(random_state=42), params_round_1, scoring="roc_auc", cv=3, n_jobs=-1)
gs_dt_round_1.fit(X_train, y_train)

# Round 2: freeze the round-1 winners and tune the split / leaf size limits.
params_round_2 = {
    "criterion": [gs_dt_round_1.best_params_["criterion"]],
    "max_depth": [gs_dt_round_1.best_params_["max_depth"]],
    "max_features": [gs_dt_round_1.best_params_["max_features"]],
    "min_samples_split": list(range(2, 20)),
    "min_samples_leaf": list(range(1, 10)),
}

gs_dt_round_2 = GridSearchCV(DecisionTreeClassifier(random_state=42), params_round_2, scoring="roc_auc", cv=3, n_jobs=-1)
gs_dt_round_2.fit(X_train, y_train)

print(gs_dt_round_2.best_params_)

dt = cross_validate_model(gs_dt_round_2.best_estimator_)

In [None]:
# Two-round grid search for the decision tree, optimising accuracy.
# The tree is seeded because `max_features` < 1 samples features at random;
# without a seed the selected parameters can change from run to run.
# NOTE: scikit-learn documents "entropy" and "log_loss" as the same
#   (Shannon information gain) criterion, so one of them is redundant.
params_round_1 = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [None] + list(range(1, 21)),
    "max_features": np.arange(0.1, 1.0, 0.1),
}

gs_dt_round_1 = GridSearchCV(DecisionTreeClassifier(random_state=42), params_round_1, scoring="accuracy", cv=3, n_jobs=-1)
gs_dt_round_1.fit(X_train, y_train)

# Round 2: freeze the round-1 winners and tune the split / leaf size limits.
params_round_2 = {
    "criterion": [gs_dt_round_1.best_params_["criterion"]],
    "max_depth": [gs_dt_round_1.best_params_["max_depth"]],
    "max_features": [gs_dt_round_1.best_params_["max_features"]],
    "min_samples_split": list(range(2, 20)),
    "min_samples_leaf": list(range(1, 10)),
}

gs_dt_round_2 = GridSearchCV(DecisionTreeClassifier(random_state=42), params_round_2, scoring="accuracy", cv=3, n_jobs=-1)
gs_dt_round_2.fit(X_train, y_train)

print(gs_dt_round_2.best_params_)

dt = cross_validate_model(gs_dt_round_2.best_estimator_)

### Random Forest

### Gradient Boosting

### Ensemble

In [None]:
# Random forest with default hyper-parameters; seeded for reproducible scores.
rf_cv = cross_validate_model(RandomForestClassifier(random_state=42))

In [None]:
# Gradient boosting with default hyper-parameters; seeded for reproducible scores.
gbc_cv = cross_validate_model(GradientBoostingClassifier(random_state=42))

In [None]:
# Extra-trees with default hyper-parameters; seeded for reproducible scores.
etc_cv = cross_validate_model(ExtraTreesClassifier(random_state=42))

In [None]:
# AdaBoost with default hyper-parameters; seeded for reproducible scores.
abc_cv = cross_validate_model(AdaBoostClassifier(random_state=42))

In [None]:
# Histogram-based gradient boosting with default hyper-parameters; seeded for
# reproducible scores.
hg_cv = cross_validate_model(HistGradientBoostingClassifier(random_state=42))

In [None]:
# NOTE(review): this repeats the previous cell's model -- remove it or replace
#   it with a different configuration. Seeded for reproducible scores.
hg_cv = cross_validate_model(HistGradientBoostingClassifier(random_state=42))