In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

In [32]:
# Permit types retained for modelling; rows with any other PERMITTYPE are dropped.
permit_types_to_keep = [
    'unincorporated building residential',
    'road',
    'unincorporated electrical',
    'unincorporated mechanical',
]

# Read the cleaned export, keep only the rows whose PERMITTYPE is in the list
# above, and write that subset to the CSV that DATA_FILE points at.
df_clean = pd.read_csv("permit_data_cleaned.csv")
is_kept_type = df_clean['PERMITTYPE'].isin(permit_types_to_keep)
df_filtered = df_clean.loc[is_kept_type].copy()
df_filtered.to_csv("permit_data_filtered.csv", index=False)

In [33]:
# Configuration for the data-loading and modelling cells.
DATA_FILE = "permit_data_filtered.csv"  # CSV read by the data-loading cell
TEXT_COLUMN = "DESCRIPTION"  # column used as the model input text (X)
TARGET_COLUMN = "PERMITTYPE"  # column used as the classification label (y)
TEST_SIZE = 0.2  # passed as test_size to train_test_split
RANDOM_STATE = 42  # seed passed to train_test_split and LogisticRegression

In [34]:
# Load the filtered permit data into `df`.
#
# Failures are raised rather than printed-and-exit()ed: exit() is meant for the
# interactive interpreter and, inside a notebook, tears down the kernel session
# instead of reporting an error. Raising stops the cell (and "Run All") at this
# point with a full traceback.
#
# Raises:
#     FileNotFoundError: DATA_FILE does not exist.
#     ValueError: the file loaded but contains no rows.
#     Any other pandas / KeyError exception propagates unchanged.
print(f"Loading data from {DATA_FILE}...")
try:
    df = pd.read_csv(DATA_FILE)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"Error: Data file '{DATA_FILE}' not found. Please run cleaning script first."
    ) from err

# An empty frame cannot be split or trained on, so stop before going further.
if df.empty:
    raise ValueError("Error: DataFrame is empty after loading. Cannot train model.")

# Missing descriptions become empty strings so the text column has no NaN values.
df[TEXT_COLUMN] = df[TEXT_COLUMN].fillna('')
print(f"Loaded {len(df)} rows.")
print("Target variable distribution:")
print(df[TARGET_COLUMN].value_counts())

Loading data from permit_data_filtered.csv...
Loaded 57979 rows.
Target variable distribution:
PERMITTYPE
road                                   23438
unincorporated building residential    15497
unincorporated electrical              11236
unincorporated mechanical               7808
Name: count, dtype: int64


In [35]:
# Labels come from the target column, model inputs from the text column.
y = df[TARGET_COLUMN]
X = df[TEXT_COLUMN]

In [36]:
# Hold out TEST_SIZE of the rows for evaluation. Stratifying on y helps keep
# the class proportions similar in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# --- Build Model Pipeline ---
print("Building model pipeline (TF-IDF Vectorizer -> Logistic Regression)...")

# Text features: unigrams and bigrams, English stop words removed,
# max_features=5000.
tfidf_step = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
)

# Classifier: logistic regression with max_iter=1000, seeded with RANDOM_STATE.
clf_step = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)

pipeline = Pipeline(steps=[('tfidf', tfidf_step), ('clf', clf_step)])

Training set size: 46383
Test set size: 11596
Building model pipeline (TF-IDF Vectorizer -> Logistic Regression)...


In [37]:
# Fit the TF-IDF + logistic-regression pipeline on the training partition.
#
# On failure the training label distribution is printed as a diagnostic and the
# original exception is re-raised. The previous exit() call would have torn
# down the notebook kernel session and discarded the traceback.
print("Training model...")
try:
    pipeline.fit(X_train, y_train)
except Exception as e:
    print(f"Error during model training: {e}")
    print("\nTraining target variable distribution:")
    print(y_train.value_counts())
    raise
print("Training complete.")

Training model...
Training complete.


In [38]:
# Predict a label for every held-out test description with the fitted pipeline;
# y_pred is compared against y_test in the metrics cell.
print("\nEvaluating model on the test set...")
y_pred = pipeline.predict(X_test)


Evaluating model on the test set...


In [39]:
# Overall accuracy on the held-out test set.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class report. The label list is the sorted union of the classes seen in
# the true and predicted labels; zero_division=0 is passed through to the report.
# Report generation is best-effort: a failure is printed, not raised.
print("\nClassification Report:")
try:
    observed_labels = set(y_test).union(y_pred)
    labels = sorted(observed_labels)
    report = classification_report(y_test, y_pred, labels=labels, zero_division=0)
    print(report)
except Exception as err:
    print(f"Could not generate classification report: {err}")

Accuracy: 0.9257

Classification Report:
                                     precision    recall  f1-score   support

                               road       0.98      1.00      0.99      4688
unincorporated building residential       0.87      0.97      0.91      3099
          unincorporated electrical       0.93      0.79      0.86      2247
          unincorporated mechanical       0.87      0.82      0.84      1562

                           accuracy                           0.93     11596
                          macro avg       0.91      0.89      0.90     11596
                       weighted avg       0.93      0.93      0.92     11596



In [41]:
# Spot-check the fitted pipeline on a hand-written permit description.
new_descriptions = [
    "voluntary seismic retrofit work is to be performed in accordance with la city standard plan",
]
predictions = pipeline.predict(new_descriptions)

# Echo the inputs, then a blank line, then the predicted label(s).
print(f"\nExample predictions for: {new_descriptions}")
print()
print(predictions)


Example predictions for: ['voluntary seismic retrofit work is to be performed in accordance with la city standard plan']

['unincorporated building residential']
