In [1]:
!pip -q install datasets

import numpy as np
import pandas as pd

from datasets import load_dataset

In [2]:
SEED = 42
np.random.seed(SEED)

In [3]:
ds = load_dataset("scikit-learn/adult-census-income")
ds

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

adult.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/32561 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country', 'income'],
        num_rows: 32561
    })
})

In [4]:
train_df = ds["train"].to_pandas()

train_df.shape


(32561, 15)

In [5]:
train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [6]:
target_col = "income"

train_df[target_col].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
income,Unnamed: 1_level_1
<=50K,0.75919
>50K,0.24081


In [7]:
train_df.isna().sum().sort_values(ascending=False).head(15)

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education.num,0
marital.status,0
occupation,0
relationship,0
race,0
sex,0


In [8]:
(train_df == "?").sum().sort_values(ascending=False).head(15)

Unnamed: 0,0
occupation,1843
workclass,1836
native.country,583
fnlwgt,0
education,0
education.num,0
age,0
marital.status,0
relationship,0
sex,0


In [9]:
cat_cols = train_df.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()

# remove target from feature lists
cat_cols = [c for c in cat_cols if c != target_col]
num_cols = [c for c in num_cols if c != target_col]

len(cat_cols), len(num_cols), cat_cols[:10], num_cols[:10]


(8,
 6,
 ['workclass',
  'education',
  'marital.status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'native.country'],
 ['age',
  'fnlwgt',
  'education.num',
  'capital.gain',
  'capital.loss',
  'hours.per.week'])

In [10]:
X_train = train_df.drop(columns=[target_col])
y_train = train_df[target_col]

In [11]:
from sklearn.model_selection import train_test_split

df = ds["train"].to_pandas()

target_col = "income"

X = df.drop(columns=[target_col])
y = df[target_col]

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train.shape, X_val.shape


((26048, 14), (6513, 14))

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

In [13]:
#Numeric pipeline imputing misssing + scale
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

#categorical pipeline impute missin + one-hot
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one-hot", OneHotEncoder(handle_unknown="ignore"))
])

#combine
preprocess = ColumnTransformer(
    transformers=[
             ("num", numeric_transformer, num_cols),
             ("cat", categorical_transformer, cat_cols),
         ]
)

In [14]:
clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=2000))
])

clf

In [15]:
clf.fit(X_train, y_train)


In [16]:
y_pred = clf.predict(X_val)
y_proba = clf.predict_proba(X_val)[:, 1]

In [17]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.91      4945
        >50K       0.74      0.61      0.67      1568

    accuracy                           0.85      6513
   macro avg       0.81      0.77      0.79      6513
weighted avg       0.85      0.85      0.85      6513



In [18]:
confusion_matrix(y_val, y_pred)


array([[4608,  337],
       [ 612,  956]])

In [19]:
sorted(y_train.unique())

['<=50K', '>50K']

In [20]:
clf.named_steps["model"].classes_

array(['<=50K', '>50K'], dtype=object)

In [21]:
pos_label = ">50K"
pos_index = list(clf.named_steps["model"].classes_).index(pos_label)

y_proba_pos = clf.predict_proba(X_val)[:, pos_index]
print("ROC-AUC", roc_auc_score((y_val == pos_label).astype(int), y_proba_pos))

ROC-AUC 0.9043452466932171


In [22]:
from sklearn.model_selection import StratifiedKFold, cross_validate


In [23]:
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)


In [25]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score

pos_label = ">50K"

scoring = {
    "roc_auc": "roc_auc",  # uses predict_proba automatically; no pos_label issue
    "precision": make_scorer(precision_score, pos_label=pos_label),
    "recall": make_scorer(recall_score, pos_label=pos_label),
    "f1": make_scorer(f1_score, pos_label=pos_label),
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_results = cross_validate(
    clf,
    X_train,
    y_train,
    cv=cv,
    scoring=scoring,
    return_train_score=False,
    error_score="raise"  # makes debugging easier if something else breaks
)

cv_results.keys()


dict_keys(['fit_time', 'score_time', 'test_roc_auc', 'test_precision', 'test_recall', 'test_f1'])

In [26]:
for m in ["roc_auc", "precision", "recall", "f1"]:
    scores = cv_results[f"test_{m}"]
    print(f"{m:>9}: mean={scores.mean():.4f}  std={scores.std():.4f}")


  roc_auc: mean=0.9070  std=0.0013
precision: mean=0.7326  std=0.0140
   recall: mean=0.6008  std=0.0110
       f1: mean=0.6601  std=0.0100


In [27]:
from sklearn.metrics import balanced_accuracy_score

scoring["bal_acc"] = make_scorer(balanced_accuracy_score)


In [28]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_curve, classification_report, confusion_matrix

pos_label = ">50K"

# Fit baseline on full training split
clf.fit(X_train, y_train)

# Find positive class index safely
classes = clf.named_steps["model"].classes_
pos_index = list(classes).index(pos_label)

# Probabilities for positive class on validation
y_val_proba = clf.predict_proba(X_val)[:, pos_index]

# Convert y_val to binary (1 = >50K)
y_val_bin = (y_val == pos_label).astype(int)

y_val_proba[:5], y_val_bin[:5]


(array([0.11316739, 0.0013713 , 0.74260739, 0.24513427, 0.02636901]),
 10489    0
 25652    0
 12243    1
 25487    1
 5091     0
 Name: income, dtype: int64)

In [29]:
prec, rec, thr = precision_recall_curve(y_val_bin, y_val_proba)

# thr has length = len(prec)-1
len(prec), len(rec), len(thr)


(6511, 6511, 6510)

In [30]:
from sklearn.metrics import precision_score, recall_score, f1_score

def metrics_at_threshold(t):
    y_pred_t = (y_val_proba >= t).astype(int)
    return {
        "threshold": t,
        "precision": precision_score(y_val_bin, y_pred_t, zero_division=0),
        "recall": recall_score(y_val_bin, y_pred_t, zero_division=0),
        "f1": f1_score(y_val_bin, y_pred_t, zero_division=0),
        "pos_rate": y_pred_t.mean(),  # % predicted as >50K
    }

# pick some candidate thresholds
candidates = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
table = pd.DataFrame([metrics_at_threshold(t) for t in candidates])
table


Unnamed: 0,threshold,precision,recall,f1,pos_rate
0,0.2,0.538703,0.874362,0.666667,0.390757
1,0.3,0.612224,0.779337,0.685746,0.306464
2,0.4,0.679222,0.690051,0.684593,0.244588
3,0.5,0.739366,0.609694,0.668298,0.198526
4,0.6,0.797959,0.498724,0.613815,0.150468
5,0.7,0.848315,0.385204,0.529825,0.10932


In [31]:
target_recall = 0.75
valid = []

for t in np.linspace(0.05, 0.95, 91):
    m = metrics_at_threshold(t)
    if m["recall"] >= target_recall:
        valid.append(m)

pd.DataFrame(valid).sort_values(["precision", "f1"], ascending=False).head(10)


Unnamed: 0,threshold,precision,recall,f1,pos_rate
28,0.33,0.633584,0.753189,0.688228,0.286197
27,0.32,0.62703,0.763393,0.688525,0.293106
26,0.31,0.620513,0.771684,0.687891,0.299401
25,0.3,0.612224,0.779337,0.685746,0.306464
24,0.29,0.602827,0.788903,0.683425,0.315062
23,0.28,0.593928,0.798469,0.681175,0.32366
22,0.27,0.586191,0.80676,0.679012,0.331337
21,0.26,0.581406,0.817602,0.679565,0.338554
20,0.25,0.576752,0.829082,0.680272,0.346077
19,0.24,0.568704,0.836735,0.677161,0.354215


In [32]:
best = None
for t in np.linspace(0.05, 0.95, 91):
    m = metrics_at_threshold(t)
    if (best is None) or (m["f1"] > best["f1"]):
        best = m

best


{'threshold': np.float64(0.3499999999999999),
 'precision': 0.6497747747747747,
 'recall': 0.735969387755102,
 'f1': 0.6901913875598086,
 'pos_rate': np.float64(0.27268539843390144)}

In [33]:
t = best["threshold"]
y_pred_t = np.where(y_val_proba >= t, pos_label, "<=50K")

print("Threshold:", t)
print(classification_report(y_val, y_pred_t))
confusion_matrix(y_val, y_pred_t, labels=["<=50K", ">50K"])


Threshold: 0.3499999999999999
              precision    recall  f1-score   support

       <=50K       0.91      0.87      0.89      4945
        >50K       0.65      0.74      0.69      1568

    accuracy                           0.84      6513
   macro avg       0.78      0.81      0.79      6513
weighted avg       0.85      0.84      0.84      6513



array([[4323,  622],
       [ 414, 1154]])

In [34]:
from sklearn.ensemble import RandomForestClassifier


In [35]:
rf_clf = Pipeline(steps=[
    ("preprocess", preprocess),  # SAME preprocessing (important!)
    ("model", RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1
    ))
])

rf_clf


In [36]:
rf_clf.fit(X_train, y_train)

In [37]:
y_pred_rf = rf_clf.predict(X_val)


In [38]:
print(classification_report(y_val, y_pred_rf))
confusion_matrix(y_val, y_pred_rf)


              precision    recall  f1-score   support

       <=50K       0.88      0.94      0.91      4945
        >50K       0.77      0.60      0.67      1568

    accuracy                           0.86      6513
   macro avg       0.82      0.77      0.79      6513
weighted avg       0.85      0.86      0.85      6513



array([[4662,  283],
       [ 634,  934]])

In [39]:
rf_classes = rf_clf.named_steps["model"].classes_
pos_index = list(rf_classes).index(">50K")

y_proba_rf = rf_clf.predict_proba(X_val)[:, pos_index]

roc_auc_score((y_val == ">50K").astype(int), y_proba_rf)


np.float64(0.9127502011927117)

In [40]:
cv_results_rf = cross_validate(
    rf_clf,
    X_train,
    y_train,
    cv=cv,
    scoring=scoring,
    return_train_score=False
)

for m in ["roc_auc", "precision", "recall", "f1"]:
    scores = cv_results_rf[f"test_{m}"]
    print(f"{m:>9}: mean={scores.mean():.4f}  std={scores.std():.4f}")


  roc_auc: mean=0.9168  std=0.0019
precision: mean=0.7751  std=0.0153
   recall: mean=0.6050  std=0.0076
       f1: mean=0.6795  std=0.0075


In [41]:
import pandas as pd

summary = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest"],
    "ROC-AUC (CV)": [
        f"{cv_results['test_roc_auc'].mean():.4f} ± {cv_results['test_roc_auc'].std():.4f}",
        f"{cv_results_rf['test_roc_auc'].mean():.4f} ± {cv_results_rf['test_roc_auc'].std():.4f}",
    ],
    "Precision (>50K)": [
        f"{cv_results['test_precision'].mean():.4f}",
        f"{cv_results_rf['test_precision'].mean():.4f}",
    ],
    "Recall (>50K)": [
        f"{cv_results['test_recall'].mean():.4f}",
        f"{cv_results_rf['test_recall'].mean():.4f}",
    ],
    "F1 (>50K)": [
        f"{cv_results['test_f1'].mean():.4f}",
        f"{cv_results_rf['test_f1'].mean():.4f}",
    ],
})

summary


Unnamed: 0,Model,ROC-AUC (CV),Precision (>50K),Recall (>50K),F1 (>50K)
0,Logistic Regression,0.9070 ± 0.0013,0.7326,0.6008,0.6601
1,Random Forest,0.9168 ± 0.0019,0.7751,0.605,0.6795
