In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier


In [2]:
df = pd.read_csv("/content/NYPD_Complaint_Data_Current_(Year_To_Date)_20260110.csv")

# Drop useless columns
drop_cols = [
    "CMPLNT_NUM", "Latitude", "Longitude",
    "OFNS_DESC", "PREM_TYP_DESC",
    "JURIS_DESC", "OCCURENCE", "ADDR_PCT_CD"
]
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# Target
df = df.dropna(subset=["COMPLETED"])
y = df["COMPLETED"].astype(int)
X = df.drop(columns=["COMPLETED"])


KeyError: ['COMPLETED']

In [3]:
print(df.columns.tolist())


['BORO_NM', 'CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'CMPLNT_TO_DT', 'CMPLNT_TO_TM', 'CRM_ATPT_CPTD_CD', 'HADEVELOPT', 'HOUSING_PSA', 'JURISDICTION_CODE', 'KY_CD', 'LAW_CAT_CD', 'LOC_OF_OCCUR_DESC', 'PARKS_NM', 'PATROL_BORO', 'PD_CD', 'PD_DESC', 'RPT_DT', 'STATION_NAME', 'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'TRANSIT_DISTRICT', 'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX', 'X_COORD_CD', 'Y_COORD_CD', 'Lat_Lon', 'New Georeferenced Column']


In [4]:
df["CMPLNT_FR_DT"] = pd.to_datetime(df["CMPLNT_FR_DT"], errors="coerce")
df["hour"] = pd.to_datetime(df["CMPLNT_FR_TM"], errors="coerce").dt.hour
df["weekday"] = df["CMPLNT_FR_DT"].dt.weekday
df["month"] = df["CMPLNT_FR_DT"].dt.month

crime_df = df[[
    "BORO_NM", "hour", "weekday", "month",
    "VIC_SEX", "KY_CD"
]].copy()

crime_df["unsafe"] = 1


  df["hour"] = pd.to_datetime(df["CMPLNT_FR_TM"], errors="coerce").dt.hour


In [5]:
import numpy as np

n_safe = len(crime_df)

safe_df = pd.DataFrame({
    "BORO_NM": np.random.choice(df["BORO_NM"].dropna().unique(), n_safe),
    "hour": np.random.randint(0, 24, n_safe),
    "weekday": np.random.randint(0, 7, n_safe),
    "month": np.random.randint(1, 13, n_safe),
    "VIC_SEX": np.random.choice(["M", "F"], n_safe),
})

safe_df["KY_CD"] = np.nan
safe_df["unsafe"] = 0


In [6]:
full_df = pd.concat([crime_df, safe_df], ignore_index=True)


In [7]:
X = full_df.drop(columns=["unsafe", "KY_CD"])
y = full_df["unsafe"]


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier

num_cols = ["hour", "weekday", "month"]
cat_cols = ["BORO_NM", "VIC_SEX"]

prep = ColumnTransformer([
    ("num", "passthrough", num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [14]:
model = Pipeline([
    ("prep", prep),
    ("clf", LGBMClassifier(n_estimators=300))
])

model.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 350845, number of negative: 350844
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.085640 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 701689, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500001 -> initscore=0.000003
[LightGBM] [Info] Start training from score 0.000003


In [10]:
model.predict_proba(X.iloc[:1])




array([[0.53097373, 0.46902627]])

In [15]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)




Accuracy: 0.7541542443123194


In [16]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.81      0.66      0.73     87712
           1       0.71      0.85      0.78     87711

    accuracy                           0.75    175423
   macro avg       0.76      0.75      0.75    175423
weighted avg       0.76      0.75      0.75    175423



In [17]:
ASSAULT_CODES = [105, 106, 107]
assault_df = crime_df[crime_df["KY_CD"].isin(ASSAULT_CODES)]


In [18]:
X2 = assault_df.drop(columns=["KY_CD", "unsafe"])
y2 = assault_df["KY_CD"]


In [19]:
assault_model = Pipeline([
    ("prep", prep),
    ("clf", LGBMClassifier(objective="multiclass"))
])

assault_model.fit(X2, y2)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001580 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64
[LightGBM] [Info] Number of data points in the train set: 43318, number of used features: 13
[LightGBM] [Info] Start training from score -1.332239
[LightGBM] [Info] Start training from score -0.657634
[LightGBM] [Info] Start training from score -1.523083


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

# ============================================================================
# DATA LOADING AND PREPROCESSING
# ============================================================================

print("Loading data...")
df = pd.read_csv("NYPD_Complaint_Data_Current_(Year_To_Date)_20260110.csv")

# Drop columns that won't be useful for prediction
drop_cols = [
    "CMPLNT_NUM", "Latitude", "Longitude", "OFNS_DESC", "PREM_TYP_DESC",
    "JURIS_DESC", "OCCURENCE", "ADDR_PCT_CD", "CMPLNT_TO_DT", "CMPLNT_TO_TM",
    "RPT_DT", "STATION_NAME", "PARKS_NM", "HADEVELOPT", "HOUSING_PSA",
    "TRANSIT_DISTRICT", "X_COORD_CD", "Y_COORD_CD", "Lat_Lon",
    "New Georeferenced Column", "PD_CD", "PD_DESC", "JURISDICTION_CODE",
    "PATROL_BORO", "LAW_CAT_CD", "LOC_OF_OCCUR_DESC", "CRM_ATPT_CPTD_CD"
]
df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore')

# ============================================================================
# FEATURE ENGINEERING
# ============================================================================

print("Engineering features...")

# Parse datetime features
df["CMPLNT_FR_DT"] = pd.to_datetime(df["CMPLNT_FR_DT"], errors="coerce")
df["CMPLNT_FR_TM"] = pd.to_datetime(df["CMPLNT_FR_TM"], format='%H:%M:%S', errors="coerce")

# Extract temporal features
df["hour"] = df["CMPLNT_FR_TM"].dt.hour
df["weekday"] = df["CMPLNT_FR_DT"].dt.weekday
df["month"] = df["CMPLNT_FR_DT"].dt.month
df["is_weekend"] = (df["weekday"] >= 5).astype(int)
df["is_night"] = ((df["hour"] >= 20) | (df["hour"] <= 6)).astype(int)

# Clean up categorical variables
df["BORO_NM"] = df["BORO_NM"].fillna("UNKNOWN")
df["VIC_SEX"] = df["VIC_SEX"].fillna("U")
df["SUSP_SEX"] = df["SUSP_SEX"].fillna("U")
df["VIC_AGE_GROUP"] = df["VIC_AGE_GROUP"].fillna("UNKNOWN")
df["SUSP_AGE_GROUP"] = df["SUSP_AGE_GROUP"].fillna("UNKNOWN")

# Create crime dataset (actual crimes = unsafe)
crime_df = df[[
    "BORO_NM", "hour", "weekday", "month", "is_weekend", "is_night",
    "VIC_SEX", "VIC_AGE_GROUP", "SUSP_SEX", "SUSP_AGE_GROUP", "KY_CD"
]].copy()

# Remove rows with missing critical features
crime_df = crime_df.dropna(subset=["hour", "weekday", "month", "BORO_NM"])
crime_df["unsafe"] = 1

print(f"Crime records: {len(crime_df):,}")

# ============================================================================
# CREATE BALANCED DATASET WITH SYNTHETIC SAFE LOCATIONS
# ============================================================================

print("Creating balanced dataset...")

n_safe = len(crime_df)

# Generate synthetic "safe" observations with realistic distributions
safe_df = pd.DataFrame({
    "BORO_NM": np.random.choice(crime_df["BORO_NM"].unique(), n_safe),
    "hour": np.random.randint(0, 24, n_safe),
    "weekday": np.random.randint(0, 7, n_safe),
    "month": np.random.randint(1, 13, n_safe),
    "is_weekend": np.random.randint(0, 2, n_safe),
    "is_night": np.random.randint(0, 2, n_safe),
    "VIC_SEX": np.random.choice(["M", "F", "U"], n_safe, p=[0.45, 0.45, 0.10]),
    "VIC_AGE_GROUP": np.random.choice(crime_df["VIC_AGE_GROUP"].unique(), n_safe),
    "SUSP_SEX": "U",
    "SUSP_AGE_GROUP": "UNKNOWN",
    "KY_CD": np.nan,
    "unsafe": 0
})

# Combine datasets
full_df = pd.concat([crime_df, safe_df], ignore_index=True)
print(f"Total records: {len(full_df):,} (50% unsafe, 50% safe)")

# ============================================================================
# PREPARE FEATURES AND TARGET
# ============================================================================

X = full_df.drop(columns=["unsafe", "KY_CD"])
y = full_df["unsafe"]

# Define feature types
num_cols = ["hour", "weekday", "month", "is_weekend", "is_night"]
cat_cols = ["BORO_NM", "VIC_SEX", "VIC_AGE_GROUP", "SUSP_SEX", "SUSP_AGE_GROUP"]

print(f"\nFeatures: {num_cols + cat_cols}")
print(f"Numerical: {num_cols}")
print(f"Categorical: {cat_cols}")

# ============================================================================
# CREATE PREPROCESSING PIPELINE
# ============================================================================

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
], remainder='drop')

# ============================================================================
# TRAIN-TEST SPLIT
# ============================================================================

print("\nSplitting data...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train size: {len(X_train):,}, Test size: {len(X_test):,}")

# ============================================================================
# MODEL TRAINING AND EVALUATION
# ============================================================================

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "LightGBM": LGBMClassifier(n_estimators=300, random_state=42, verbose=-1)
}

results = {}

for name, clf in models.items():
    print(f"\n{'='*60}")
    print(f"Training {name}...")
    print('='*60)

    # Create pipeline
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", clf)
    ])

    # Train
    pipeline.fit(X_train, y_train)

    # Predict
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    results[name] = {
        "accuracy": accuracy,
        "roc_auc": roc_auc,
        "pipeline": pipeline
    }

    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=["Safe", "Unsafe"]))

# ============================================================================
# COMPARE MODELS
# ============================================================================

print("\n" + "="*60)
print("MODEL COMPARISON")
print("="*60)
print(f"{'Model':<25} {'Accuracy':<12} {'ROC-AUC':<12}")
print("-"*60)
for name, metrics in results.items():
    print(f"{name:<25} {metrics['accuracy']:<12.4f} {metrics['roc_auc']:<12.4f}")

# ============================================================================
# PREDICT ON NEW DATA EXAMPLE
# ============================================================================

print("\n" + "="*60)
print("EXAMPLE PREDICTIONS")
print("="*60)

best_model = results["LightGBM"]["pipeline"]

# Example scenarios
examples = pd.DataFrame([
    {
        "BORO_NM": "MANHATTAN", "hour": 23, "weekday": 5, "month": 12,
        "is_weekend": 1, "is_night": 1, "VIC_SEX": "F",
        "VIC_AGE_GROUP": "25-44", "SUSP_SEX": "U", "SUSP_AGE_GROUP": "UNKNOWN"
    },
    {
        "BORO_NM": "BROOKLYN", "hour": 14, "weekday": 2, "month": 6,
        "is_weekend": 0, "is_night": 0, "VIC_SEX": "M",
        "VIC_AGE_GROUP": "25-44", "SUSP_SEX": "U", "SUSP_AGE_GROUP": "UNKNOWN"
    }
])

predictions = best_model.predict_proba(examples)[:, 1]

for i, (idx, row) in enumerate(examples.iterrows()):
    print(f"\nScenario {i+1}:")
    print(f"  Location: {row['BORO_NM']}")
    print(f"  Time: {row['hour']}:00, {'Weekend' if row['is_weekend'] else 'Weekday'}")
    print(f"  Crime Risk: {predictions[i]*100:.1f}%")

print("\n" + "="*60)
print("ANALYSIS COMPLETE")
print("="*60)

Loading data...
Engineering features...
Crime records: 438,551
Creating balanced dataset...
Total records: 877,102 (50% unsafe, 50% safe)

Features: ['hour', 'weekday', 'month', 'is_weekend', 'is_night', 'BORO_NM', 'VIC_SEX', 'VIC_AGE_GROUP', 'SUSP_SEX', 'SUSP_AGE_GROUP']
Numerical: ['hour', 'weekday', 'month', 'is_weekend', 'is_night']
Categorical: ['BORO_NM', 'VIC_SEX', 'VIC_AGE_GROUP', 'SUSP_SEX', 'SUSP_AGE_GROUP']

Splitting data...
Train size: 701,681, Test size: 175,421

Training Logistic Regression...

Accuracy: 0.9531
ROC-AUC: 0.9939

Classification Report:
              precision    recall  f1-score   support

        Safe       0.96      0.95      0.95     87711
      Unsafe       0.95      0.96      0.95     87710

    accuracy                           0.95    175421
   macro avg       0.95      0.95      0.95    175421
weighted avg       0.95      0.95      0.95    175421


Training Random Forest...

Accuracy: 0.9783
ROC-AUC: 0.9965

Classification Report:
              pr




Accuracy: 0.9827
ROC-AUC: 0.9986

Classification Report:
              precision    recall  f1-score   support

        Safe       0.99      0.97      0.98     87711
      Unsafe       0.98      0.99      0.98     87710

    accuracy                           0.98    175421
   macro avg       0.98      0.98      0.98    175421
weighted avg       0.98      0.98      0.98    175421


MODEL COMPARISON
Model                     Accuracy     ROC-AUC     
------------------------------------------------------------
Logistic Regression       0.9531       0.9939      
Random Forest             0.9783       0.9965      
LightGBM                  0.9827       0.9986      

EXAMPLE PREDICTIONS

Scenario 1:
  Location: MANHATTAN
  Time: 23:00, Weekend
  Crime Risk: 43.4%

Scenario 2:
  Location: BROOKLYN
  Time: 14:00, Weekday
  Crime Risk: 95.5%

ANALYSIS COMPLETE




In [2]:
!pip install joblib




In [3]:
import joblib

# model = your trained ML model
joblib.dump(best_model, "best_lgbm.joblib")

['best_lgbm.joblib']