In [9]:
import pandas as pd, joblib
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [15]:
# Relative path to the processed parquet file that the notebook loads.
# It is resolved against the kernel's current working directory.
DATA_PATH = Path("../data/processed/cityx_clean.parquet")

In [17]:
# Read the parquet file at DATA_PATH into a DataFrame.
# Note: DATA is the loaded frame itself, not a path.
DATA = pd.read_parquet(DATA_PATH)

In [19]:
# Show the column labels of the loaded frame to confirm which fields are available.
print(DATA.columns)

Index(['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'Address', 'Latitude (Y)', 'Longitude (X)', 'Hour',
       'Month', 'Year', 'DayOfWeek_num', 'DayOfMonth', 'WeekOfYear',
       'TimeOfDay', 'Season', 'AddressType'],
      dtype='object')


In [23]:

# --- Train and persist a text classifier: Descript -> Category ---

# Directory for serialized models; created on demand so a fresh checkout works.
MODEL_DIR = Path("../models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)
MODEL_PATH = MODEL_DIR / "classifier.pkl"

# Build the modelling frame from DATA (the DataFrame loaded from DATA_PATH)
# instead of from a pre-existing `df`. No cell in this notebook defines `df`,
# so the former `df = df.dropna(...)` only worked with leftover kernel state
# and raised NameError after Restart & Run All.
# Rows missing either the text or the label are dropped.
df = DATA.dropna(subset=['Descript', 'Category'])
X, y = df['Descript'], df['Category']

# TF-IDF over word unigrams and bigrams feeding a logistic-regression classifier.
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=30000,   # cap on vocabulary size
        ngram_range=(1, 2),   # unigrams and bigrams
        min_df=3,             # ignore terms seen in fewer than 3 documents
        max_df=0.9,           # ignore terms present in more than 90% of documents
        stop_words='english'
    )),
    ("clf", LogisticRegression(max_iter=200))
])

# Stratified 80/20 split; the fixed seed keeps the partition reproducible.
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
pipe.fit(Xtr, ytr)

# Only the first 1500 characters of the report are printed to keep the output short.
# NOTE(review): the recorded output shows ~1.00 precision/recall for the classes
# that fit in the truncated report. Descript may map almost one-to-one onto
# Category -- confirm this is the intended task before relying on the model.
print(classification_report(yte, pipe.predict(Xte))[:1500])

# Persist the fitted pipeline (vectorizer + classifier) and display where it was written.
joblib.dump(pipe, MODEL_PATH)
MODEL_PATH


                        precision    recall  f1-score   support

                 ARSON       1.00      1.00      1.00       277
            BAD CHECKS       1.00      1.00      1.00        73
               BRIBERY       1.00      0.96      0.98        52
              BURGLARY       1.00      1.00      1.00      6581
    DISORDERLY CONDUCT       1.00      1.00      1.00       778
         DRUG/NARCOTIC       1.00      1.00      1.00      7093
          EMBEZZLEMENT       1.00      1.00      1.00       206
             EXTORTION       1.00      1.00      1.00        45
FORGERY/COUNTERFEITING       1.00      1.00      1.00      1755
                 FRAUD       1.00      1.00      1.00      2994
            KIDNAPPING       1.00      1.00      1.00       417
         LARCENY/THEFT       1.00      1.00      1.00     31245
        MISSING PERSON       1.00      1.00      1.00      4605
          NON-CRIMINAL       1.00      1.00      1.00     16542
        OTHER OFFENSES       1.00      

WindowsPath('../models/classifier.pkl')