#........PREDICTIVE MODELING

In [14]:
#L.....LIBRARY IMPORTATON


import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier


In [15]:
#......READ CLEANED CSV

df=pd.read_csv("C:\\SAIT\\DATA 475\\Projt\\SQF_2012_cleaned.csv",dtype=str,low_memory=False)

In [16]:
#.....Collecting all grop of Prefix for eassy access
def cols_starting(prefix):
    return [c for c in df.columns if c.startswith(prefix)]

def exists(col):
    return col in df.columns

In [None]:
#.....Defining Targets.....DATA PREPARATION PART

# T1: any use of force (create from pf_* columns)
pf_cols = cols_starting("pf_")
df["any_force"] = (df[pf_cols].fillna(0).astype(float).sum(axis=1) > 0).astype(int) if pf_cols else np.nan

# T2: weapon_found
has_weapon = exists("weapon_found")

#T3: handcuff used
has_hcuff = exists("pf_hcuff")


In [None]:
#....CREATING PREDICTOR(X) FEATURES AND TARGET(Y) VARIABLES....DESCRIBE FINAL DATASET USED

num_cols  = [c for c in ["age","precinct","hour"] if exists(c)]
cat_cols  = [c for c in ["sex","race","time_of_day","weekday"] if exists(c)]
flag_cols = cols_starting("cs_")                    # 0/1 reason flags
extra_num = [c for c in ["crimsusp_encoded"] if exists(c)]

X_base = num_cols + cat_cols + flag_cols + extra_num



In [None]:
def run_models_for_target(target_col, drop_from_X_prefixes=None, positive_label_name=None):
    print("\n" + "-"*80)
    print(f"TARGET: {target_col}")
    print("-"*80)

    if not exists(target_col):
        print(f"!! Skipping: '{target_col}' not found.")
        return

    # ---- Build X ( drop any leaking prefixes) ----REMOVING VARIABLES NOT NEEDED FOR THE ANALYSIS
    X_cols = X_base.copy()
    for pre_drop in (drop_from_X_prefixes or []):
        X_cols = [c for c in X_cols if not c.startswith(pre_drop)]

    # ---- Keep rows with the target; then fill NaNs in predictors ----
    data = df[X_cols + [target_col]].dropna(subset=[target_col]).copy()
    data = data.fillna({
        c: (data[c].median() if data[c].dtype != 'object'
            else data[c].mode().iloc[0])
        for c in data.columns if data[c].isna().any()
    })

    X = data[X_cols]
    y = data[target_col].astype(int)

    # ---- Preprocess: scale numerics, one-hot categoricals, passthrough flags ----
    num_cols = [c for c in X_cols if c in (['age','precinct','hour'] + [c for c in df.columns if c.endswith('_num')])]
    cat_cols = [c for c in X_cols if c in ['sex','race','time_of_day','weekday']]
    flag_cols = [c for c in X_cols if c.startswith('cs_')] # already 0/1

    pre = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), [c for c in X_cols if c in num_cols]),
            ("cat", OneHotEncoder(handle_unknown="ignore"), [c for c in X_cols if c in cat_cols]),
            ("pas", "passthrough", [c for c in X_cols if c in flag_cols]),
        ],
        remainder="drop"
    )

    # ---- Models ----THREE(3) DIFFERENT MODELS CREATED
    models = {
        "LogReg": LogisticRegression(max_iter=200, n_jobs=None),
        "RandForest": RandomForestClassifier(n_estimators=300, random_state=42),
        "KNN": KNeighborsClassifier(n_neighbors=15),
    }

    # ---- Train/test split ----
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

    # ---- Fit & evaluate ----
    results = []
    for name, clf in models.items():
        pipe = Pipeline(steps=[("pre", pre), ("clf", clf)])
        pipe.fit(X_tr, y_tr)
        proba = pipe.predict_proba(X_te)[:, 1] if hasattr(pipe.named_steps["clf"], "predict_proba") else None
        pred = pipe.predict(X_te)

##....ASSESSING MODEL PERFORMANCES

        acc = accuracy_score(y_te, pred)
        prec = precision_score(y_te, pred, zero_division=0)
        rec = recall_score(y_te, pred, zero_division=0)
        f1 = f1_score(y_te, pred, zero_division=0)
        auc = roc_auc_score(y_te, proba) if proba is not None else np.nan
        results.append((name, acc, prec, rec, f1, auc))

        print(f"\n– {name} –")
        print(f"Accuracy={acc:.3f} Precision={prec:.3f} Recall={rec:.3f} F1={f1:.3f} ROC-AUC={auc:.3f}")
        print("Confusion matrix:\n", confusion_matrix(y_te, pred))
        print(classification_report(y_te, pred, digits=3))

    res_df = pd.DataFrame(results, columns=["Model","Accuracy","Precision","Recall","F1","ROC_AUC"])
    pd.set_option("display.max_rows",20)
    print("\nSummary:\n", res_df.sort_values("F1", ascending=False))
    return res_df

In [37]:
#...Calling the function

res_force = run_models_for_target(
    target_col="any_force",
    drop_from_X_prefixes=["pf_"],
    positive_label_name="force used"
)
print(res_force)

if "weapon_found" in df.columns:
    res_weapon = run_models_for_target(
        target_col="weapon_found",
        drop_from_X_prefixes=None,
        positive_label_name="weapon found"
    )
    print(res_weapon)

if "pf_hcuff" in df.columns:
    res_hcuff = run_models_for_target(
        target_col="pf_hcuff",
        drop_from_X_prefixes=["pf_"],
        positive_label_name="handcuff used"
    )
    print(res_hcuff)


--------------------------------------------------------------------------------
TARGET: any_force
--------------------------------------------------------------------------------

– LogReg –
Accuracy=0.827 Precision=0.522 Recall=0.004 F1=0.007 ROC-AUC=0.633
Confusion matrix:
 [[110134     76]
 [ 22935     83]]
              precision    recall  f1-score   support

           0      0.828     0.999     0.905    110210
           1      0.522     0.004     0.007     23018

    accuracy                          0.827    133228
   macro avg      0.675     0.501     0.456    133228
weighted avg      0.775     0.827     0.750    133228


– RandForest –
Accuracy=0.829 Precision=0.515 Recall=0.197 F1=0.285 ROC-AUC=0.725
Confusion matrix:
 [[105949   4261]
 [ 18492   4526]]
              precision    recall  f1-score   support

           0      0.851     0.961     0.903    110210
           1      0.515     0.197     0.285     23018

    accuracy                          0.829    133228
   m