In [1]:
import pandas as pd

# Read the processed dataset into `df`; later cells attach cluster labels and
# PCA components to this same frame.
# NOTE(review): absolute, machine-specific path.
final_data_csv = r"C:\Users\asmis\OneDrive\Desktop\patroliq_V1\patroliq\data\processed\final_data.csv"
df = pd.read_csv(final_data_csv)


In [2]:
# Coordinate columns only; this is the input of the geographic KMeans model.
geo_columns = ["Latitude", "Longitude"]
geo_features = df[geo_columns]


In [3]:
# Time-related columns only; this is the input of the temporal KMeans model.
temporal_columns = ["Hour", "Month", "Is_Weekend"]
temporal_features = df[temporal_columns]


In [4]:
from sklearn.preprocessing import StandardScaler

# Columns that are standardised together; the scaled matrix is what PCA
# consumes later in the notebook.
num_cols = [
    "Latitude",
    "Longitude",
    "Hour",
    "Month",
    "Is_Weekend",
    "Crime_Severity_Score",
]

X = df[num_cols]

# Learn per-column statistics from X, then apply them to the same rows.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [5]:
import joblib
# Persist the fitted scaler; joblib.dump is left as the final expression so the
# cell still displays the list of written files.
scaler_path = r"C:\Users\asmis\OneDrive\Desktop\patroliq_V1\patroliq\models\scaler.pkl"
joblib.dump(scaler, scaler_path)


['C:\\Users\\asmis\\OneDrive\\Desktop\\patroliq_V1\\patroliq\\models\\scaler.pkl']

In [6]:
import mlflow
from sklearn.cluster import KMeans
import joblib

# Cluster every row by its coordinates and record the run in MLflow.
# The cluster count and model path are each defined once so the value handed to
# the estimator / joblib is exactly the value that gets logged.
n_geo_clusters = 8
kmeans_geo_path = r"C:\Users\asmis\OneDrive\Desktop\patroliq_V1\patroliq\models\kmeans_geo.pkl"

with mlflow.start_run(run_name="KMeans_Geo_Final"):

    kmeans_geo = KMeans(n_clusters=n_geo_clusters, random_state=42, n_init=10)

    # Fit on the full coordinate table; the fitted labels are the per-row
    # cluster ids stored on the frame.
    kmeans_geo.fit(geo_features)
    df["Geo_Cluster"] = kmeans_geo.labels_

    # Write the model to disk first so the same file can be attached to the run.
    joblib.dump(kmeans_geo, kmeans_geo_path)

    mlflow.log_param("algorithm", "KMeans_Geo")
    mlflow.log_param("clusters", n_geo_clusters)
    mlflow.log_artifact(kmeans_geo_path)

2026/02/05 19:58:06 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.schemas
2026/02/05 19:58:06 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.tables
2026/02/05 19:58:06 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.types
2026/02/05 19:58:06 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.constraints
2026/02/05 19:58:06 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.defaults
2026/02/05 19:58:06 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.comments
2026/02/05 19:58:06 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/02/05 19:58:06 INFO mlflow.store.db.utils: Updating database tables
2026/02/05 19:58:06 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/02/05 19:58:06 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/02/05 19:58:07 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2026/02/05 19:5

In [7]:
from sklearn.utils import shuffle

# Fit the temporal model on a fixed-seed shuffled subset (at most 100,000 rows),
# then assign a cluster to every row of the full table.
n_time_clusters = 4
kmeans_temporal_path = r"C:\Users\asmis\OneDrive\Desktop\patroliq_V1\patroliq\models\kmeans_temporal.pkl"

sample = shuffle(temporal_features, random_state=42).head(100_000)

with mlflow.start_run(run_name="KMeans_Temporal_Final"):

    kmeans_time = KMeans(
        n_clusters=n_time_clusters,
        random_state=42,
        algorithm="elkan",
        n_init=10,
    )

    # Train on the subset only ...
    kmeans_time.fit(sample)

    # ... but label all rows with the fitted centroids.
    df["Time_Cluster"] = kmeans_time.predict(temporal_features)

    # Write the model to disk first so the same file can be attached to the run.
    joblib.dump(kmeans_time, kmeans_temporal_path)

    mlflow.log_param("algorithm", "KMeans_Temporal")
    mlflow.log_param("clusters", n_time_clusters)
    mlflow.log_artifact(kmeans_temporal_path)

In [8]:
from sklearn.decomposition import PCA

# Project the standardised feature matrix onto two principal components, store
# them on the frame, and record the run in MLflow.
pca_model_path = r"C:\Users\asmis\OneDrive\Desktop\patroliq_V1\patroliq\models\pca_model.pkl"

with mlflow.start_run(run_name="PCA_Final"):

    pca = PCA(n_components=2, random_state=42)
    pca_features = pca.fit_transform(X_scaled)

    # Column i of the projection becomes PC<i+1> on the frame.
    for component_idx, column_name in enumerate(("PC1", "PC2")):
        df[column_name] = pca_features[:, component_idx]

    # Write the model to disk first so the same file can be attached to the run.
    joblib.dump(pca, pca_model_path)

    # Logged value is the sum of the per-component ratios, i.e. the share of
    # variance captured by the two retained components together.
    total_explained_variance = pca.explained_variance_ratio_.sum()
    mlflow.log_metric("explained_variance_ratio", total_explained_variance)

    mlflow.log_artifact(pca_model_path)


In [9]:
# Write the frame, including the cluster and PCA columns added above, back to disk.
# NOTE(review): this is the same path the notebook reads in its first cell, so
# the input file is overwritten in place.
output_csv = r"C:\Users\asmis\OneDrive\Desktop\patroliq_V1\patroliq\data\processed\final_data.csv"
df.to_csv(output_csv, index=False)
