In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [6]:
# Location of the DSNY monthly tonnage CSV. Set the DSNY_TONNAGE_CSV environment
# variable to run the notebook on another machine; when it is unset the
# original author-local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'DSNY_TONNAGE_CSV',
    '/Users/gabriel/Desktop/marcy/nyc-mod6-project/data/DSNY_Monthly_Tonnage.csv',
)
df = pd.read_csv(DATA_PATH)

# A row is flagged as underperforming when its recycling ratio is below this value.
UNDERPERFORMANCE_THRESHOLD = 0.20

# Create recycling ratio & target flag
# Recycled tonnage = paper + MGP, with a missing value in either column counted as 0.
recycled_tons = (
    df['PAPERTONSCOLLECTED'].fillna(0) +
    df['MGPTONSCOLLECTED'].fillna(0)
)
# A refuse tonnage of 0 is turned into NaN so the division never divides by zero.
refuse_tons = df['REFUSETONSCOLLECTED'].replace({0: np.nan})

# NOTE(review): rows whose refuse tonnage is 0 or missing end up with a ratio of
# 0 and are therefore flagged as underperforming, even when recycling tonnage
# is positive. Confirm this is intended rather than excluding those rows.
df['recycling_ratio'] = (recycled_tons / refuse_tons).fillna(0)
df['recycling_underperformance_flag'] = (
    df['recycling_ratio'] < UNDERPERFORMANCE_THRESHOLD
).astype(int)

In [7]:
# Columns fed to the model: tonnage columns are handled as numeric, borough as
# categorical.
# NOTE(review): the three numeric features are the same columns the target flag
# is computed from, so a non-trivial classifier could recover the label directly
# from its inputs. Harmless for a dummy baseline; revisit before real models.
numeric_features = ['REFUSETONSCOLLECTED', 'PAPERTONSCOLLECTED', 'MGPTONSCOLLECTED']
categorical_features = ['BOROUGH']

# Preprocessing
# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' lets transform accept unseen categories.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocess = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Feature matrix and target vector.
feature_columns = [*numeric_features, *categorical_features]
X = df[feature_columns]
y = df['recycling_underperformance_flag']

# Split
# 75/25 split, seeded for repeatability and stratified on the target flag.
split_options = dict(test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# Baseline model: the shared preprocessing step followed by a classifier that
# always predicts the most frequent label seen during fitting.
model = Pipeline([
    ('preprocess', preprocess),
    ('clf', DummyClassifier(strategy='most_frequent'))
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# predict_proba returns one column per entry of model.classes_, so the column
# for the positive class (flag == 1) is looked up by label rather than assumed
# to be at index 1. With classes [0, 1] this selects the same column as before;
# it only differs when the training labels contain a single class, where a
# hard-coded [:, 1] would raise IndexError.
proba = model.predict_proba(X_test)
positive_cols = np.flatnonzero(model.classes_ == 1)
if positive_cols.size:
    y_proba = proba[:, positive_cols[0]]
else:
    # Label 1 never appeared in y_train, so its predicted probability is 0.
    y_proba = np.zeros(proba.shape[0])


In [9]:
# Score the baseline on the held-out split. Accuracy comes first, then the
# three metrics that share zero_division=0 (the value reported when the metric
# would otherwise divide by zero), and finally ROC AUC, which is computed from
# the predicted scores rather than the hard predictions.
results = {'Accuracy': accuracy_score(y_test, y_pred)}
zero_division_metrics = (
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
)
for metric_name, metric_fn in zero_division_metrics:
    results[metric_name] = metric_fn(y_test, y_pred, zero_division=0)
results['ROC_AUC'] = roc_auc_score(y_test, y_proba)

# Report each metric on its own line, rounded to four decimals.
print("Baseline Model Metrics")
print("\n".join(f"{label}: {score:.4f}" for label, score in results.items()))

Baseline Model Metrics
Accuracy: 0.5081
Precision: 0.0000
Recall: 0.0000
F1: 0.0000
ROC_AUC: 0.5000
