In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [2]:
class TextCleaner(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that normalizes raw text documents.

    Every document is lower-cased, has punctuation and digits replaced by
    spaces, and has its whitespace collapsed. Missing values become empty
    strings so the next pipeline step never receives NaN/None.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Clean every document in ``X``.

        Parameters
        ----------
        X : pandas.Series or iterable of documents
            Objects that already provide ``.apply`` (e.g. a Series) are
            processed as-is, so their index is preserved. Any other iterable
            (list, tuple, 1-D array, generator) is first wrapped in a Series.

        Returns
        -------
        pandas.Series
            The cleaned documents, one string per input element.

        Raises
        ------
        ValueError
            If ``X`` is a single string instead of a collection of documents.
        """
        if isinstance(X, str):
            # Iterating a bare string would silently treat each character as
            # its own document.
            raise ValueError(
                "Iterable over raw text documents expected, string object received."
            )
        if not hasattr(X, "apply"):
            # dtype=object keeps mixed/empty inputs from being coerced.
            X = pd.Series(list(X), dtype=object)
        return X.apply(self.clean_text)

    @staticmethod
    def clean_text(text):
        """Return a normalized version of a single document.

        Missing values (anything ``pd.isna`` reports as missing) map to ``''``.
        Otherwise the value is converted to ``str`` and lower-cased; every
        character that is neither a word character nor whitespace becomes a
        space (underscores survive because ``\\w`` matches them); runs of
        digits become a space; finally whitespace runs are collapsed to single
        spaces with no leading or trailing whitespace.
        """
        if pd.isna(text):
            return ''
        text = str(text).lower()
        text = re.sub(r'[^\w\s]', ' ', text)  # Remove punctuation
        text = re.sub(r'\d+', ' ', text)      # Remove digits
        # split()/join already drops leading and trailing whitespace, so no
        # extra strip() is required.
        return ' '.join(text.split())

In [3]:
# Read the train and test CSV files (paths are relative to the notebook) and
# stack them into one frame with a fresh 0..n-1 index.
print("Loading data...")
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)
combined_df = pd.concat((train_df, test_df), ignore_index=True)

Loading data...


In [4]:
print("Removing rare classes...")
min_instances = 5  # a label must occur at least this many times to be kept

# Row count per label for each of the two target columns.
category_counts = combined_df['category'].value_counts()
sub_category_counts = combined_df['sub_category'].value_counts()

# Labels that are frequent enough. value_counts() ignores NaN by default, so
# missing labels never make it into these sets.
valid_categories = category_counts.loc[category_counts >= min_instances].index
valid_sub_categories = sub_category_counts.loc[sub_category_counts >= min_instances].index

# A row survives only when BOTH of its labels are in the frequent sets.
category_is_valid = combined_df['category'].isin(valid_categories)
sub_category_is_valid = combined_df['sub_category'].isin(valid_sub_categories)
filtered_df = combined_df.loc[category_is_valid & sub_category_is_valid].copy()

print("Instances removed:", len(combined_df) - len(filtered_df))

Removing rare classes...
Instances removed: 8832


In [5]:
def create_svm_pipeline(probability=False, random_state=42):
    """Build the unfitted text pipeline: clean -> TF-IDF -> one SVC per target.

    Parameters
    ----------
    probability : bool, default=False
        Forwarded to ``SVC``. Enable it only when ``predict_proba`` is needed:
        it makes every ``fit`` run an additional internal 5-fold
        cross-validation, which is much slower. ``MultiOutputClassifier`` does
        not require it, and ``predict`` does not depend on it.
    random_state : int, default=42
        Forwarded to ``SVC``. It only influences the data shuffling used for
        probability estimates, so it has no effect when ``probability`` is
        False.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps ``'cleaner'`` (TextCleaner), ``'tfidf'`` (TfidfVectorizer) and
        ``'clf'`` (MultiOutputClassifier wrapping the SVC).
    """
    svm_classifier = SVC(probability=probability, random_state=random_state)

    pipeline = Pipeline([
        ('cleaner', TextCleaner()),  # Custom text cleaning step
        ('tfidf', TfidfVectorizer()),  # TF-IDF vectorization
        # MultiOutputClassifier fits an independent copy of the SVC per target column.
        ('clf', MultiOutputClassifier(svm_classifier)),
    ])

    return pipeline

In [6]:
# Input: the complaint text column. Targets: both label columns, predicted jointly.
X = filtered_df.loc[:, 'crimeaditionalinfo']
y = filtered_df.loc[:, ['category', 'sub_category']]

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Build a fresh, unfitted pipeline (text cleaner -> TF-IDF -> multi-output SVC)
# and fit it on the training split. The fit call is left as the cell's last
# expression so the notebook shows the fitted pipeline as the cell output.
print("Training the SVM model...")
pipeline = create_svm_pipeline()
pipeline.fit(X_train, y_train)

Training the SVM model...


In [None]:
# Predict both targets for the held-out split. The result is indexed below as
# a 2-D array with one column per target, in the column order of `y`.
print("Predicting...")
y_pred = pipeline.predict(X_test)

In [None]:
# Predictions: column 0 is 'category', column 1 is 'sub_category'
# (same order as the target columns selected for `y`).
y_pred_category = y_pred[:, 0]
y_pred_sub_category = y_pred[:, 1]

# Matching ground-truth labels from the held-out split.
y_test_category = y_test['category']
y_test_sub_category = y_test['sub_category']

In [None]:
def calculate_metrics(y_true, y_pred, label):
    """Compute accuracy plus macro-averaged precision, recall and F1.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels for a single target.
    label : str
        Human-readable name of the target. Accepted for call-site readability
        only; it is not used in the computation.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"Precision"``, ``"Recall"`` and ``"F1 Score"``
        (in that order) mapped to the corresponding scores. Classes with no
        predicted/true samples contribute 0 instead of raising a warning.
    """
    macro_options = {"average": "macro", "zero_division": 0}

    metrics = {"Accuracy": accuracy_score(y_true, y_pred)}
    metrics["Precision"] = precision_score(y_true, y_pred, **macro_options)
    metrics["Recall"] = recall_score(y_true, y_pred, **macro_options)
    metrics["F1 Score"] = f1_score(y_true, y_pred, **macro_options)
    return metrics

In [None]:
# Score each target separately against its held-out ground truth.
category_metrics = calculate_metrics(
    y_true=y_test_category,
    y_pred=y_pred_category,
    label="Category",
)
sub_category_metrics = calculate_metrics(
    y_true=y_test_sub_category,
    y_pred=y_pred_sub_category,
    label="Sub-category",
)

In [None]:
# One line per category-level score, rounded to four decimals.
print("\nCategory Metrics:")
for metric_name, metric_value in category_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# One line per sub-category-level score, rounded to four decimals.
print("\nSub-category Metrics:")
for metric_name, metric_value in sub_category_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Per-class precision/recall/F1 table for the 'category' target.
print("\nCategory Classification Report:")
category_report = classification_report(
    y_test_category,
    y_pred_category,
    zero_division=0,
)
print(category_report)

In [None]:
# Per-class precision/recall/F1 table for the 'sub_category' target.
print("\nSub-category Classification Report:")
sub_category_report = classification_report(
    y_test_sub_category,
    y_pred_sub_category,
    zero_division=0,
)
print(sub_category_report)

In [None]:
def plot_confusion_matrix(y_true, y_pred, labels, title, figsize=(8, 6)):
    """Draw an annotated confusion-matrix heatmap and show it.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels.
    labels : array-like
        Class labels; fixes both the row/column order of the matrix and the
        tick labels on the two axes.
    title : str
        Figure title.
    figsize : tuple of (float, float), default=(8, 6)
        Figure size in inches. Pass a larger size when there are many classes
        and the annotated cells would otherwise overlap.
    """
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Explicit figure/axes instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',  # cells hold integer counts
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(title)
    plt.show()

In [None]:
print("Plotting confusion matrices...")

# Axis order follows the order in which each label first appears in `y`.
plot_confusion_matrix(
    y_true=y_test_category,
    y_pred=y_pred_category,
    labels=y['category'].unique(),
    title="Confusion Matrix for 'Category'",
)
plot_confusion_matrix(
    y_true=y_test_sub_category,
    y_pred=y_pred_sub_category,
    labels=y['sub_category'].unique(),
    title="Confusion Matrix for 'Sub-category'",
)

In [None]:
import time
import numpy as np  # NOTE: not used in this cell; kept in case later cells rely on it

print("=== PERFORMANCE METRICS ===")

# Single-document latency. perf_counter() is a monotonic, high-resolution
# clock meant for measuring short durations; time.time() is wall-clock time
# that can jump and may have coarse resolution.
# The prediction result is intentionally discarded (not bound to `_`, which
# IPython uses for the last cell output).
sample_text = X_test.iloc[0:1]
start_time = time.perf_counter()
pipeline.predict(sample_text)
latency = (time.perf_counter() - start_time) * 1000  # Convert to milliseconds
print(f"Latency: {latency:.2f} ms")

# Throughput for increasing batch sizes; sizes larger than the test set are skipped.
batch_sizes = [1, 10, 100, 1000]
speeds = []

for batch_size in batch_sizes:
    if batch_size > len(X_test):
        continue
    batch = X_test.iloc[:batch_size]
    start_time = time.perf_counter()
    pipeline.predict(batch)
    # Clamp so a timer reading of zero can never cause a ZeroDivisionError.
    elapsed = max(time.perf_counter() - start_time, 1e-9)
    speed = batch_size / elapsed
    speeds.append(speed)
    print(f"Batch size {batch_size}: {speed:.1f} predictions/sec")

# speeds[0] is the batch-size-1 measurement, used as the no-batching baseline.
print("\nScalability Analysis:")
print(f"Max throughput: {max(speeds):.1f} predictions/sec")
print(f"Throughput efficiency: {(max(speeds)/speeds[0]):.1f}x improvement with batching")
