In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
data_path = "data/Chicago_Crimes_2012_to_2017.csv"

In [3]:
df = pd.read_csv(
    data_path,
    low_memory=False,
)

In [4]:
df['Date'] = pd.to_datetime(
    df['Date'].str.strip(),              # buang spasi
    format='%m/%d/%Y %I:%M:%S %p',       # contoh: 05/03/2016 11:40:00 PM
    errors='coerce'
)

In [5]:
before = len(df)
df = df[~df['Date'].isna()]
after = len(df)
print(f"Drop baris dengan Date NaT: {before - after} baris dihapus")

Drop baris dengan Date NaT: 0 baris dihapus


In [6]:
df = df[(df['Date'].dt.year >= 2012) & (df['Date'].dt.year <= 2017)]
print("Jumlah data setelah filter tahun 2012-2017:", len(df))

Jumlah data setelah filter tahun 2012-2017: 1456714


# Preprocessing

### Basic Cleaning Global

In [7]:
cols_to_drop = [
    'Unnamed: 0',   # index dummy dari CSV, tidak informatif
    'ID',           # unique identifier record, tidak punya pola ke Arrest
    'Case Number',  # unique identifier incident, sama seperti ID buat model

    # Kode yang duplikatif dengan deskripsi yang lebih enak dibaca:
    'IUCR',         # sudah diwakili Primary Type + Description
    'FBI Code',     # kode klasifikasi lain untuk jenis kejahatan, mirip IUCR

    # Koordinat redundant + versi string:
    'X Coordinate', # duplikat lokasi dalam sistem koordinat lain (punya lat/long)
    'Y Coordinate',
    'Location',     # string gabungan lat,long (redundant dengan Latitude/Longitude)

    # Informasi administratif, bukan tentang kejadian:
    'Updated On',   # waktu record diupdate, bukan waktu kejadian
]

In [8]:
existing_drop = [c for c in cols_to_drop if c in df.columns]
print("Kolom yang di-drop:", existing_drop)

Kolom yang di-drop: ['Unnamed: 0', 'ID', 'Case Number', 'IUCR', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Location', 'Updated On']


In [9]:
df = df.drop(columns=existing_drop)

In [10]:
print("Kolom setelah drop:")
print(df.columns.tolist())

Kolom setelah drop:
['Date', 'Block', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'Year', 'Latitude', 'Longitude']


In [11]:
print("Jumlah sebelum drop duplicates:", len(df))

Jumlah sebelum drop duplicates: 1456714


In [12]:
df = df.drop_duplicates()

In [13]:
print("Jumlah setelah drop duplicates:", len(df))

Jumlah setelah drop duplicates: 1454314


In [14]:
if 'Date' in df.columns:
    df = df[(df['Date'].dt.year >= 2012) & (df['Date'].dt.year <= 2017)]
    print("Jumlah data setelah filter tahun 2012-2017:", len(df))

Jumlah data setelah filter tahun 2012-2017: 1454314


In [15]:
if 'Arrest' in df.columns:
    before = len(df)
    df = df[~df['Arrest'].isnull()]
    after = len(df)
    print(f"Drop baris tanpa Arrest: {before-after} baris dihapus")
else:
    raise ValueError("Kolom 'Arrest' tidak ditemukan!")

Drop baris tanpa Arrest: 0 baris dihapus


### Feature Engineering (Base)

In [16]:
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

In [17]:
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Hour'] = df['Date'].dt.hour
df['DayOfWeek'] = df['Date'].dt.dayofweek

In [18]:
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)   # 1 kalau Sabtu/Minggu
df['IsNight'] = df['Hour'].isin(list(range(0,6)) + list(range(22,24))).astype(int)

In [19]:
df = df.drop(columns=['Date'])

In [32]:
geo_cat_cols = ['Beat', 'District', 'Ward', 'Community Area']

for col in geo_cat_cols:
    if col in df.columns:
        df[col] = df[col].astype('Int64').astype('string')

In [33]:
if 'Domestic' in df.columns:
    # Kalau boolean -> int 0/1
    if df['Domestic'].dtype == 'bool':
        df['Domestic'] = df['Domestic'].astype(int)

### Definisikan Fitur (X) dan Target (y)

In [34]:
y = df['Arrest']
X = df.drop(columns=['Arrest'])

In [35]:
print("Shape X:", X.shape)
print("Shape y:", y.shape)

Shape X: (1454314, 18)
Shape y: (1454314,)


In [36]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [48]:
print("Train size:", X_train.shape[0])
print("Test size:", X_test.shape[0])

print("\nProporsi Arrest di y_train:")
print((y_train.value_counts(normalize=True) * 100).round(2))

print("\nProporsi Arrest di y_test:")
print((y_test.value_counts(normalize=True) * 100).round(2))

Train size: 1163451
Test size: 290863

Proporsi Arrest di y_train:
Arrest
False    74.09
True     25.91
Name: proportion, dtype: float64

Proporsi Arrest di y_test:
Arrest
False    74.09
True     25.91
Name: proportion, dtype: float64


### Identifikasi Kolom Numerik & Kategorikal

In [40]:
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

In [41]:
print("Fitur numerik:")
print(numeric_features)

print("\nFitur kategorikal:")
print(categorical_features)

Fitur numerik:
['Domestic', 'Latitude', 'Longitude', 'IsWeekend', 'IsNight']

Fitur kategorikal:
['Block', 'Primary Type', 'Description', 'Location Description']


In [42]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [43]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [44]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

### Fit Preprocessor di Train, Transform Train & Test

In [45]:
preprocessor.fit(X_train)

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [46]:
X_train_preprocessed = preprocessor.transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [47]:
print("Shape X_train_preprocessed:", X_train_preprocessed.shape)
print("Shape X_test_preprocessed:", X_test_preprocessed.shape)

Shape X_train_preprocessed: (1163451, 32750)
Shape X_test_preprocessed: (290863, 32750)


### Simpan artefak

In [51]:
joblib.dump((X_train, X_test, y_train, y_test), "data/train_test_split.joblib")
joblib.dump(preprocessor, "models/preprocessor.joblib")

['models/preprocessor.joblib']