In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
df = pd.read_csv('/Users/gabriel/Desktop/marcy/nyc-mod6-project/data/DSNY_Monthly_Tonnage.csv')

# Create recycling ratio & target flag
df['recycling_ratio'] = (
    df['PAPERTONSCOLLECTED'].fillna(0) +
    df['MGPTONSCOLLECTED'].fillna(0)
) / df['REFUSETONSCOLLECTED'].replace({0: np.nan})

df['recycling_ratio'] = df['recycling_ratio'].fillna(0)
df['recycling_underperformance_flag'] = (df['recycling_ratio'] < 0.20).astype(int)

In [3]:
df['total_organics'] = df[
    ['RESORGANICSTONS', 'SCHOOLORGANICTONS', 'LEAVESORGANICTONS',
     'XMASTREETONS', 'OTHERORGANICSTONS']
].fillna(0).sum(axis=1)

numeric_features = [
    'REFUSETONSCOLLECTED',
    'PAPERTONSCOLLECTED',
    'MGPTONSCOLLECTED',
    'total_organics'
]
categorical_features = ['BOROUGH']

X = df[numeric_features + categorical_features]
y = df['recycling_underperformance_flag']

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)


In [4]:
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocess = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

In [5]:
pipe = Pipeline([
    ('preprocess', preprocess),
    ('select', SelectKBest(score_func=f_classif)),
    ('clf', RandomForestClassifier(random_state=42))
])

param_grid = {
    'select__k': [2, 3, 'all'],
    'clf__n_estimators': [100, 300],
    'clf__max_depth': [5, 10, None],
    'clf__min_samples_split': [2, 5],
    'clf__class_weight': ['balanced', None]
}

cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

best = grid.best_estimator_

y_pred = best.predict(X_test)
y_proba = best.predict_proba(X_test)[:, 1]


Fitting 4 folds for each of 72 candidates, totalling 288 fits


In [6]:
results = {
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred, zero_division=0),
    'Recall': recall_score(y_test, y_pred, zero_division=0),
    'F1': f1_score(y_test, y_pred, zero_division=0),
    'ROC_AUC': roc_auc_score(y_test, y_proba)
}

print("\nTuned Random Forest Model Metrics:")
for k, v in results.items():
    print(f"{k}: {v:.4f}")

print("\nBest Hyperparameters:")
print(grid.best_params_)


Tuned Random Forest Model Metrics:
Accuracy: 0.9764
Precision: 0.9789
Recall: 0.9731
F1: 0.9760
ROC_AUC: 0.9977

Best Hyperparameters:
{'clf__class_weight': None, 'clf__max_depth': None, 'clf__min_samples_split': 2, 'clf__n_estimators': 100, 'select__k': 3}


### So that I understand: Hyperparameters control the behavior of the model; the param grid lists the hyperparameters you want to test so the computer can choose the best configuration automatically.

In [None]:
# --- SEPARATE FEATURE ENGINEERING ---
# Create Lag 1 month for refuse
df["refuse_lag1"] = df["REFUSETONSCOLLECTED"].shift(1)

# Create 3-month rolling average for refuse
df["refuse_roll3"] = df["REFUSETONSCOLLECTED"].rolling(window=3).mean()

# Drop NA rows created by shifting/rolling
df = df.dropna(subset=["refuse_lag1", "refuse_roll3"]).reset_index(drop=True)

df.head()

Unnamed: 0,MONTH,BOROUGH,COMMUNITYDISTRICT,REFUSETONSCOLLECTED,PAPERTONSCOLLECTED,MGPTONSCOLLECTED,RESORGANICSTONS,SCHOOLORGANICTONS,LEAVESORGANICTONS,XMASTREETONS,OTHERORGANICSTONS,BOROUGH_ID,recycling_ratio,recycling_underperformance_flag,total_organics,refuse_lag1,refuse_roll3
0,2022 / 01,Bronx,3,1984.9,122.1,155.3,,,,4.2,,2,0.139755,1,4.2,7412.7,4130.1
1,2022 / 01,Bronx,4,3688.2,275.0,341.8,,,,12.4,,2,0.167236,1,12.4,1984.9,4361.933333
2,2022 / 01,Bronx,5,3166.8,223.1,330.2,,,,7.6,,2,0.174719,1,7.6,3688.2,2946.633333
3,2022 / 01,Bronx,6,3039.1,208.7,207.5,,52.7,,9.2,,2,0.136948,1,61.9,3166.8,3298.033333
4,2022 / 01,Bronx,7,3506.5,308.0,479.4,,,,11.7,,2,0.224554,0,11.7,3039.1,3237.466667


### More understanding: shift(1) - Moves the entire column down by one row meaning each month now has the value of the previous month. rolling(3).mean() - Computes average refuse tonnage across the previous 3 months. dropna() - Rolling windows and lag introduce missing values at the top of the dataset