## Text and Category Processing Tasks

### Imports

In [2]:
import pandas as pd
import numpy as np
import time
import warnings

warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, f1_score, classification_report

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

### Load Cleaned Data

In [3]:
# Read the cleaned 311 request data from disk and preview the first rows.
cleaned_csv_path = "../data/processed/philly_311_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)
df.head()

Unnamed: 0,service_request_id,subject,requested_datetime,service_name,service_code,service_notice,address,zipcode,lat,lon,status,agency_responsible,service_category,near_duplicate
0,17346520,Graffiti Removal,2025-01-01 00:00:34+00:00,GRAFFITI REMOVAL,SR-CL01,7 Business Days,1701 SPRING GARDEN ST,19130.0,39.963169,-75.166457,CLOSED,COMMUNITY LIFE IMPROVEMENT PROGRAM,ENVIRONMENTAL,False
1,17346521,Graffiti Removal,2025-01-01 00:03:51+00:00,GRAFFITI REMOVAL,SR-CL01,7 Business Days,1901 SPRING GARDEN ST,19130.0,39.963625,-75.1695,CLOSED,COMMUNITY LIFE IMPROVEMENT PROGRAM,ENVIRONMENTAL,False
2,17346523,Recycling Collection,2025-01-01 00:06:29+00:00,RUBBISH/RECYCLABLE MATERIAL COLLECTION,SR-ST03,2 Business Days,5902 JEFFERSON ST,19151.0,39.978809,-75.239265,CLOSED,STREETS DEPARTMENT,OTHER,False
3,17346524,Graffiti Removal,2025-01-01 00:06:47+00:00,GRAFFITI REMOVAL,SR-CL01,7 Business Days,1921 SPRING GARDEN ST,19130.0,39.963703,-75.170116,CLOSED,COMMUNITY LIFE IMPROVEMENT PROGRAM,ENVIRONMENTAL,False
4,17346525,Recycling Collection,2025-01-01 00:07:43+00:00,RUBBISH/RECYCLABLE MATERIAL COLLECTION,SR-ST03,2 Business Days,6739 RUTLAND ST,19149.0,40.043401,-75.072138,CLOSED,STREETS DEPARTMENT,OTHER,False


In [4]:
# Print the (rows, columns) tuple, then the per-column dtype / non-null summary.
row_col_counts = df.shape
print(row_col_counts)
df.info()

(245808, 14)
<class 'pandas.DataFrame'>
RangeIndex: 245808 entries, 0 to 245807
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   service_request_id  245808 non-null  int64  
 1   subject             245808 non-null  str    
 2   requested_datetime  245808 non-null  str    
 3   service_name        245808 non-null  str    
 4   service_code        177736 non-null  str    
 5   service_notice      230362 non-null  str    
 6   address             245808 non-null  str    
 7   zipcode             241456 non-null  float64
 8   lat                 245808 non-null  float64
 9   lon                 245808 non-null  float64
 10  status              245808 non-null  str    
 11  agency_responsible  245808 non-null  str    
 12  service_category    245808 non-null  str    
 13  near_duplicate      245808 non-null  bool   
dtypes: bool(1), float64(3), int64(1), str(9)
memory usage: 24.6 MB


### Task # 1

**Classify complaint descriptions** into standardized categories using traditional ML 
models (e.g., logistic regression, SVM, random forest) 

#### Define X and Y

X (feature): `subject`

y (target): `service_name`

In [5]:
# Feature: the request's `subject` text; target: its `service_name`.
X = df["subject"]
y = df["service_name"]

# Count missing values in the feature, then in the target.
for series in (X, y):
    print(series.isna().sum())

# Show the most frequent target classes.
print(y.value_counts().head())

0
0
service_name
MAINTENANCE COMPLAINT                     37800
RUBBISH/RECYCLABLE MATERIAL COLLECTION    36943
ABANDONED VEHICLE                         27943
ILLEGAL DUMPING                           22644
STREET DEFECT                             16128
Name: count, dtype: int64


#### Train and Test Split

In [6]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

#### Pipeline

**TF-IDF Notes:**

Convert text to features using Term Frequency-Inverse Document Frequency (TF-IDF), which transforms text into a numerical matrix.

TF-IDF builds a vocabulary of all words, removing stop words. Then it creates a table so each complaint is represented by numbers. TF asks how often a word appears in this complaint. IDF asks whether the word is common across all complaints. For example, if every complaint has the word 'street', it is down-weighted. If a word is rare and specific, it gets up-weighted. 

In [7]:
def make_text_pipeline(classifier):
    """Wrap ``classifier`` in a TF-IDF -> classifier ``Pipeline``.

    A new ``TfidfVectorizer`` is created on every call, so each pipeline
    owns its own vectorizer instance. Step names ("tfidf", "clf") and the
    vectorizer settings are the same for every model.
    """
    return Pipeline([
        ("tfidf", TfidfVectorizer(stop_words="english", max_features=5000)),
        ("clf", classifier),
    ])


# Candidate models keyed by the display name used in the results table.
# Every classifier uses class_weight="balanced".
pipelines = {
    "Logistic Regression": make_text_pipeline(
        LogisticRegression(max_iter=2000, class_weight="balanced")
    ),
    "Linear SVM": make_text_pipeline(
        # random_state matches the seed used for the split and the forest, so
        # the run is repeatable even when the solver shuffles data (dual mode).
        LinearSVC(multi_class="ovr", class_weight="balanced", random_state=50)
    ),
    "Random Forest": make_text_pipeline(
        RandomForestClassifier(
            n_estimators=300, random_state=50, n_jobs=-1, class_weight="balanced"
        )
    ),
}

#### Model Evaluation Function

In [8]:
def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    """Fit ``model``, predict on the test data, and score the predictions.

    Parameters
    ----------
    name
        Label stored under the "Model" key of the returned metrics.
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Inputs passed to ``model.fit``.
    X_test, y_test
        ``X_test`` is passed to ``model.predict``; ``y_test`` holds the true
        labels the predictions are scored against.

    Returns
    -------
    tuple
        ``(metrics, y_pred)`` where ``metrics`` is a dict with the keys
        "Model", "Accuracy", "Macro-F1", "Weighted-F1", "Train Time (s)" and
        "Predict Time (s)", and ``y_pred`` is the output of ``model.predict``.
    """
    # Wall-clock time spent fitting.
    fit_start = time.perf_counter()
    model.fit(X_train, y_train)
    fit_seconds = time.perf_counter() - fit_start

    # Wall-clock time spent predicting on the test data.
    predict_start = time.perf_counter()
    predictions = model.predict(X_test)
    predict_seconds = time.perf_counter() - predict_start

    metrics = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, predictions),
        "Macro-F1": f1_score(y_test, predictions, average="macro"),
        "Weighted-F1": f1_score(y_test, predictions, average="weighted"),
        "Train Time (s)": fit_seconds,
        "Predict Time (s)": predict_seconds,
    }
    return metrics, predictions


#### Run all models and build results table

In [9]:
# Fit and score every candidate pipeline, keeping each model's metrics row
# and its test-set predictions (keyed by model name) for later cells.
rows, preds = [], {}
for name, model in pipelines.items():
    metrics, preds[name] = evaluate_model(name, model, X_train, y_train, X_test, y_test)
    rows.append(metrics)

# Rank the models by macro-F1 (best first) and round for display.
results = pd.DataFrame(rows)
results = results.sort_values(by="Macro-F1", ascending=False)
results = results.round(4)
results

Unnamed: 0,Model,Accuracy,Macro-F1,Weighted-F1,Train Time (s),Predict Time (s)
1,Linear SVM,0.9739,0.884,0.9747,130.2326,0.2578
2,Random Forest,0.9728,0.8785,0.9741,11.5241,2.4327
0,Logistic Regression,0.9711,0.8736,0.9731,18.8317,0.3062


#### Best Model

In [10]:
# `results` is sorted by macro-F1 in descending order, so row 0 is the top model.
top_row = results.iloc[0]
best_name = top_row["Model"]
print("Best Model:", best_name)

# Per-class precision / recall / F1 for that model's test-set predictions.
best_report = classification_report(y_test, preds[best_name])
print(best_report)

Best Model: Linear SVM
                                        precision    recall  f1-score   support

                        ABANDONED BIKE       1.00      1.00      1.00        15
                     ABANDONED VEHICLE       1.00      1.00      1.00      5637
                    ALLEY LIGHT OUTAGE       0.97      0.94      0.96       233
                   COMPLAINT (STREETS)       0.25      1.00      0.41       217
        COMPLAINTS AGAINST FIRE OR EMS       1.00      1.00      1.00        14
               CONSTRUCTION COMPLAINTS       1.00      1.00      1.00      1212
          DANGEROUS BUILDING COMPLAINT       1.00      1.00      1.00       411
                    DANGEROUS SIDEWALK       0.91      0.92      0.92       684
                 DEAD ANIMAL IN STREET       0.99      1.00      1.00       193
             DIGITAL NAVIGATOR REQUEST       1.00      1.00      1.00        54
                    DUMPSTER VIOLATION       1.00      1.00      1.00       148
                

#### Results Discussion

Linear SVM achieved the highest macro-F1 score (0.8840), accuracy (0.9739), and weighted-F1 score (0.9747), which demonstrates the Linear SVM model has the most balanced performance across classes. However, its training time of 130.2326 seconds is much slower than the training time of Random Forest (11.5241 s) and Logistic Regression (18.8317 s). Taking into account the metrics and execution time, Logistic Regression or Random Forest would be a better choice since there is not a big difference in performance between the models (the biggest difference is 0.0104, in macro-F1), but there is a big difference in training time.

---

### Task # 2

**Estimate severity or sentiment** using lexicon-based methods (e.g., VADER, TextBlob) 
and simple ML classifiers trained on labeled subsets

#### Part 1: Lexicon-based Sentiment using VADER and TextBlob

In [11]:
# One analyzer instance is created and reused for every row.
analyzer = SentimentIntensityAnalyzer()


def vader_compound(text):
    """Return VADER's "compound" polarity score for ``text`` (coerced to str)."""
    return analyzer.polarity_scores(str(text))["compound"]


def textblob_polarity(text):
    """Return TextBlob's polarity score for ``text`` (coerced to str)."""
    return TextBlob(str(text)).sentiment.polarity


# Score each complaint subject with both lexicons, one new df column apiece.
# The VADER compound score runs from -1 (very negative) through 0 (neutral)
# to +1 (very positive).
subjects = df["subject"]
df["sentiment_score_vader"] = subjects.apply(vader_compound)
df["sentiment_score_textblob"] = subjects.apply(textblob_polarity)

In [12]:
# Summary statistics of the VADER compound scores, rounded to 3 decimals.
vader_stats = df["sentiment_score_vader"].describe()
vader_stats.round(3)

count    245808.000
mean         -0.238
std           0.259
min          -0.802
25%          -0.459
50%          -0.296
75%           0.000
max           0.796
Name: sentiment_score_vader, dtype: float64

In [13]:
# Summary statistics of the TextBlob polarity scores, rounded to 3 decimals.
textblob_stats = df["sentiment_score_textblob"].describe()
textblob_stats.round(3)

count    245808.000
mean         -0.108
std           0.214
min          -0.800
25%          -0.300
50%           0.000
75%           0.000
max           0.700
Name: sentiment_score_textblob, dtype: float64

Note: use VADER instead of TextBlob since the complaints should be more negative, which is reflected in VADER better than TextBlob. The `sentiment_score_textblob` column will be removed from the DataFrame.

In [14]:
# Keep only the VADER score. errors="ignore" makes this cell safe to re-run:
# without it, a second execution raises KeyError because the column was
# already dropped from `df` by the first run.
df = df.drop(columns=["sentiment_score_textblob"], errors="ignore")
df.head()

Unnamed: 0,service_request_id,subject,requested_datetime,service_name,service_code,service_notice,address,zipcode,lat,lon,status,agency_responsible,service_category,near_duplicate,sentiment_score_vader
0,17346520,Graffiti Removal,2025-01-01 00:00:34+00:00,GRAFFITI REMOVAL,SR-CL01,7 Business Days,1701 SPRING GARDEN ST,19130.0,39.963169,-75.166457,CLOSED,COMMUNITY LIFE IMPROVEMENT PROGRAM,ENVIRONMENTAL,False,0.0
1,17346521,Graffiti Removal,2025-01-01 00:03:51+00:00,GRAFFITI REMOVAL,SR-CL01,7 Business Days,1901 SPRING GARDEN ST,19130.0,39.963625,-75.1695,CLOSED,COMMUNITY LIFE IMPROVEMENT PROGRAM,ENVIRONMENTAL,False,0.0
2,17346523,Recycling Collection,2025-01-01 00:06:29+00:00,RUBBISH/RECYCLABLE MATERIAL COLLECTION,SR-ST03,2 Business Days,5902 JEFFERSON ST,19151.0,39.978809,-75.239265,CLOSED,STREETS DEPARTMENT,OTHER,False,0.0
3,17346524,Graffiti Removal,2025-01-01 00:06:47+00:00,GRAFFITI REMOVAL,SR-CL01,7 Business Days,1921 SPRING GARDEN ST,19130.0,39.963703,-75.170116,CLOSED,COMMUNITY LIFE IMPROVEMENT PROGRAM,ENVIRONMENTAL,False,0.0
4,17346525,Recycling Collection,2025-01-01 00:07:43+00:00,RUBBISH/RECYCLABLE MATERIAL COLLECTION,SR-ST03,2 Business Days,6739 RUTLAND ST,19149.0,40.043401,-75.072138,CLOSED,STREETS DEPARTMENT,OTHER,False,0.0


Now the sentiment will be classified into severity categories for the ML classifier:

In [15]:
def severity_label(score, high_threshold=-0.35, medium_threshold=0.0):
    """Map a sentiment score to a severity category.

    Parameters
    ----------
    score : float
        Sentiment score to bucket.
    high_threshold : float, default -0.35
        Scores strictly below this value are labeled "High Severity".
    medium_threshold : float, default 0.0
        Scores strictly below this value (but not below ``high_threshold``)
        are labeled "Medium Severity".

    Returns
    -------
    str
        "High Severity", "Medium Severity", or "Low Severity".

    Notes
    -----
    Both comparisons are strict, so a score exactly equal to
    ``high_threshold`` is "Medium Severity" and a score exactly equal to
    ``medium_threshold`` is "Low Severity". A NaN score compares False
    against both thresholds and therefore falls through to "Low Severity".
    """
    if score < high_threshold:
        return "High Severity"
    if score < medium_threshold:
        return "Medium Severity"
    return "Low Severity"
    
# Bucket every row's VADER compound score into a severity category.
vader_scores = df["sentiment_score_vader"]
df["severity_label"] = vader_scores.apply(severity_label)

Since the complaints are mostly neutral to slightly negative, the thresholds were defined based on the distribution of the sentiment scores. Complaints with a sentiment score below -0.35 were labeled as `High Severity`, those from -0.35 up to (but not including) 0 were labeled as `Medium Severity`, and all non-negative scores were labeled as `Low Severity`.

In [16]:
# Number of complaints assigned to each severity category.
severity_counts = df["severity_label"].value_counts()
severity_counts

severity_label
Low Severity       105427
High Severity       79409
Medium Severity     60972
Name: count, dtype: int64

#### Part 2: Train ML Classifier on Labeled Subset

The goal is to predict the severity of a complaint using ML instead of Lexicon. Since there are no severity labels, the lexicon labels will be used as weak supervision.

In [17]:
# X (the `subject` text) is reused from the Task 1 cell above; only the target
# changes. Note that `y` and the four split variables assigned below replace
# the Task 1 values of the same names.
y = df["severity_label"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

#### Build model pipeline, train, predict, & evaluate

Note: Logistic Regression was selected for the model because the relationship is mostly linear (negative words -> high severity, neutral words -> low severity) and because the model is interpretable and computationally efficient.

In [18]:
# TF-IDF features feeding a class-balanced logistic regression.
tfidf_step = ("tfidf", TfidfVectorizer(stop_words="english", max_features=5000))
clf_step = (
    "clf",
    LogisticRegression(max_iter=1000, class_weight="balanced", solver="lbfgs"),
)
severity_model = Pipeline([tfidf_step, clf_step])

# Fit on the training split, then predict the held-out split.
severity_model.fit(X_train, y_train)
y_pred = severity_model.predict(X_test)

# Per-class precision / recall / F1 against the lexicon-derived labels.
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

  High Severity       1.00      1.00      1.00     15867
   Low Severity       1.00      1.00      1.00     21109
Medium Severity       1.00      1.00      1.00     12186

       accuracy                           1.00     49162
      macro avg       1.00      1.00      1.00     49162
   weighted avg       1.00      1.00      1.00     49162



#### Results Discussion

The severity classifier achieved perfect performance across all three severity categories. The overall accuracy was 1.00, with a macro-F1 score of 1.0. This indicates balanced performance across classes. 

These results suggest that the severity categories that were classified from the VADER sentiment scores are very learnable using TF-IDF features and a linear classifier. However, the labels for this data were generated using a lexicon-based rule system rather than manually assigning labels. The model's strong performance reflects its ability to reproduce the lexicon-based rules rather than discover independent severity signals. 

In [19]:
# NOTE(review): this is the same path the notebook reads its input from, so
# the input file is overwritten with the added `sentiment_score_vader` and
# `severity_label` columns — confirm this is intended.
output_path = "../data/processed/philly_311_cleaned.csv"

# index=False keeps the DataFrame index out of the written CSV.
df.to_csv(output_path, index=False)

print(f"✓ Cleaned data saved to: {output_path}")

✓ Cleaned data saved to: ../data/processed/philly_311_cleaned.csv
