# 311 Service Request Analyzer (Toronto)

*Author: Your Name*

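This notebook loads the City of Toronto 311 service request dataset, performs exploratory data analysis (EDA), and trains two baseline classifiers (logistic regression and random forest) that predict the request category.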

In [None]:

# ---- Setup ----
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Notebook-wide display defaults: show up to 50 DataFrame columns, 8x5 default figures
pd.options.display.max_columns = 50
plt.rcParams.update({'figure.figsize': (8, 5)})

# Project folders; each one is created on first run if it does not exist yet
DATA_DIR = os.path.join('data', 'raw')
PROCESSED_DIR = os.path.join('data', 'processed')
RESULTS_DIR = 'results'
for _folder in (DATA_DIR, PROCESSED_DIR, RESULTS_DIR):
    os.makedirs(_folder, exist_ok=True)

# Source of the raw export and the path where the notebook expects to find it.
# Replace the placeholder with the CSV link from the portal, or save the file by hand.
DATA_URL = '<<PASTE_CSV_URL_FROM_PORTAL_OR_SAVE_CSV_TO_data/raw>>'
RAW_CSV = os.path.join(DATA_DIR, 'toronto_311.csv')


In [None]:

# ---- Load data ----
# The raw export is expected on disk at RAW_CSV (data/raw/toronto_311.csv).
# If you only have a direct CSV link, set DATA_URL and fetch the file first, e.g.:
#     import requests
#     r = requests.get(DATA_URL)
#     open(RAW_CSV, 'wb').write(r.content)
if not os.path.exists(RAW_CSV):
    raise FileNotFoundError('Place a CSV in data/raw as toronto_311.csv or set DATA_URL and uncomment download code.')

# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv(RAW_CSV, low_memory=False)

# Quick sanity check: dimensions, then the first few rows
print(df.shape)
df.head()


## Quick EDA

In [None]:

# Schema overview: dtype and non-null count for every column
df.info()
# Share of missing values per column, the 20 most incomplete columns first
df.isnull().mean().sort_values(ascending=False).iloc[:20]


In [None]:

# ---- Datetime parsing, derived time features, and a first plot ----
# Columns are matched heuristically by name; adjust the keywords to your CSV schema.
# Common columns: 'Created Date', 'Source', 'Division', 'Service Request Type', 'Ward', 'Neighbourhood'

# A name match alone is weak evidence ('date' also matches 'Update Count'), so a
# conversion is only kept when at least this share of the non-null values parses.
MIN_PARSE_RATE = 0.5

date_cols = [c for c in df.columns if 'date' in c.lower() or 'created' in c.lower()]
for c in date_cols:
    col = df[c]
    if pd.api.types.is_datetime64_any_dtype(col):
        continue  # already parsed, e.g. when this cell is re-run
    if pd.api.types.is_numeric_dtype(col):
        # to_datetime would read plain numbers as epoch offsets and silently
        # turn counts/ids into dates around 1970, so numeric columns are left alone.
        continue
    try:
        parsed = pd.to_datetime(col, errors='coerce')
    except (ValueError, TypeError) as exc:
        # Best effort: report the column instead of hiding the failure.
        print(f'Skipping datetime parsing for {c!r}: {exc}')
        continue
    n_present = col.notna().sum()
    if n_present and parsed.notna().sum() / n_present >= MIN_PARSE_RATE:
        df[c] = parsed
    else:
        # Overwriting would replace real values (e.g. names in 'Created By') with NaT.
        print(f'Leaving {c!r} unchanged: too few values parse as datetimes.')

# Derive time features from the first datetime column.
# 'datetimetz' is included so timezone-aware columns are found as well.
datetime_cols = df.select_dtypes(include=['datetime', 'datetimetz']).columns
if len(datetime_cols) > 0:
    dtc = datetime_cols[0]
    df['year'] = df[dtc].dt.year
    df['month'] = df[dtc].dt.month
    df['weekday'] = df[dtc].dt.weekday
    df['hour'] = df[dtc].dt.hour

# Basic plot: top request categories.
# cat_col_candidates is reused by the classification cell to pick the target column.
cat_col_candidates = [c for c in df.columns if 'type' in c.lower() or 'category' in c.lower()]
if cat_col_candidates:
    top_col = cat_col_candidates[0]
    top = df[top_col].value_counts().head(10)
    if top.empty:
        # value_counts() drops missing values, so an all-missing column has nothing to plot.
        print(f'No non-missing values in {top_col!r}; skipping the plot.')
    else:
        fig, ax = plt.subplots()
        top.plot(kind='bar', ax=ax, title=f'Top 10 {top_col}')
        fig.tight_layout()
        fig.savefig(os.path.join(RESULTS_DIR, 'top_categories.png'))
        plt.show()


## Baseline Classification

In [None]:

# ---- Baseline classification: predict the request category from simple features ----
# Imported in this cell so it runs without editing the setup cell.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Target: the first type/category-like column found in the EDA cell
target_col = cat_col_candidates[0] if cat_col_candidates else None
if not target_col:
    raise ValueError('Could not infer a categorical target column. Pick one from your CSV.')

# Keep only top-10 frequent classes (value_counts() ignores missing targets)
top_classes = df[target_col].value_counts().head(10).index
df_cls = df[df[target_col].isin(top_classes)].copy()

# Feature candidates: columns matched by keyword, plus the derived time features
feature_cols = []
for key in ['ward', 'neighbourhood', 'source', 'division']:
    feature_cols += [c for c in df_cls.columns if key in c.lower()]
feature_cols += [key for key in ['year', 'month', 'weekday', 'hour'] if key in df_cls.columns]

# De-duplicate while preserving order, and never let the target leak into the
# features (possible when its name also contains one of the keywords above).
feature_cols = [c for c in dict.fromkeys(feature_cols) if c != target_col]
if not feature_cols:
    raise ValueError('No feature columns matched. Adjust the keyword lists to your CSV schema.')
print('Using features:', feature_cols)

X = df_cls[feature_cols]
y = df_cls[target_col].astype(str)

# Separate numeric vs everything else. Treating all non-numeric columns as
# categorical also covers 'category', 'string' and boolean dtypes, which a
# plain include=['object'] filter would pass through unencoded.
num_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = [c for c in feature_cols if c not in num_cols]

# Numeric features: fill gaps (year/month/weekday/hour are NaN wherever the date
# failed to parse) and standardize so the logistic regression can converge.
numeric_prep = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

preprocess = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', numeric_prep, num_cols),
])

models = {
    'logreg': LogisticRegression(max_iter=200),
    'rf': RandomForestClassifier(n_estimators=200, random_state=42)
}

# Split once so both models are scored on the same held-out rows.
# Stratification needs at least two rows per class; fall back to a plain split otherwise.
stratify_on = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_on
)

for name, clf in models.items():
    pipe = Pipeline([('prep', preprocess), ('clf', clf)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(f"\n=== {name.upper()} ===")
    print(classification_report(y_test, y_pred, zero_division=0))
