In [19]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [20]:
# ---- 1) Toy dataset following the schema you gave
# Column order shared by every record below.
cols = ["Id", "name", "sleep", "social_media", "stress_result"]

# One record per person, ordered exactly like `cols`.
data = [
    [1, "Aaron", 8, 1, "Low"],
    [2, "Ben", 6, 2, "Medium"],
    [3, "Cara", 5, 5, "High"],
    [4, "Dina", 7, 2, "Low"],
    [5, "Evan", 6, 3, "Medium"],
    [6, "Faye", 4, 6, "High"],
    [7, "Gus", 8, 0, "Low"],
    [8, "Hana", 5, 4, "High"],
    [9, "Ivan", 7, 2, "Medium"],
    [10, "Jade", 6, 1, "Low"],
    [11, "Kyle", 7, 3, "Medium"],
    [12, "Lina", 5, 6, "High"],
    [13, "Mia", 8, 2, "Low"],
    [14, "Noah", 6, 4, "Medium"],
    [15, "Omar", 5, 5, "High"],
    [16, "Pia", 7, 1, "Low"],
    [17, "Quin", 6, 3, "Medium"],
    [18, "Rita", 4, 7, "High"],
    [19, "Sam", 8, 0, "Low"],
    [20, "Tina", 5, 4, "High"],
    [21, "Uma", 7, 2, "Medium"],
    [22, "Vik", 6, 3, "Low"],
    [23, "Wes", 5, 6, "High"],
    [24, "Xena", 8, 1, "Medium"],
    [25, "Yara", 7, 2, "Low"],
    [26, "Zane", 6, 5, "High"],
    [27, "Alan", 4, 7, "Medium"],
    [28, "Bella", 8, 2, "Low"],
    [29, "Chris", 7, 4, "Medium"],
    [30, "Dana", 5, 5, "High"],
    [31, "Eli", 6, 1, "Low"],
    [32, "Fiona", 7, 3, "Medium"],
    [33, "Gina", 5, 6, "High"],
    [34, "Hugo", 8, 0, "Low"],
    [35, "Ivy", 6, 4, "Medium"],
    [36, "Jon", 5, 5, "High"],
    [37, "Kara", 7, 2, "Low"],
    [38, "Liam", 6, 3, "Medium"],
    [39, "Mona", 4, 7, "High"],
    [40, "Nina", 8, 1, "Low"],
    [41, "Owen", 7, 2, "Medium"],
    [42, "Paul", 6, 5, "High"],
    [43, "Queen", 5, 6, "Medium"],
    [44, "Ralph", 8, 2, "Low"],
    [45, "Sara", 7, 4, "Medium"],
    [46, "Tom", 5, 5, "High"],
    [47, "Ursula", 6, 1, "Low"],
    [48, "Victor", 7, 3, "Medium"],
    [49, "Wendy", 5, 6, "High"],
    [50, "Xander", 8, 0, "Low"],
]

# Assemble the records into a DataFrame with the named columns.
df = pd.DataFrame(data=data, columns=cols)

# Preview the first five rows.
df.head()

Unnamed: 0,Id,name,sleep,social_media,stress_result
0,1,Aaron,8,1,Low
1,2,Ben,6,2,Medium
2,3,Cara,5,5,High
3,4,Dina,7,2,Low
4,5,Evan,6,3,Medium


In [21]:
# y = multiclass target (the stress label)
y = df.loc[:, "stress_result"]

# X = candidate features, exactly the columns you listed and in that order
X = df.loc[:, ["Id", "name", "sleep", "social_media"]]

# Preprocessing groups: which candidate columns are categorical vs. numeric
all_cat = ["name"]
all_num = ["Id", "sleep", "social_media"]

# NOTE(review): "Id" and "name" look like per-row identifiers (every value is
# distinct in the example data), so any accuracy gain attributed to them is
# probably noise on a dataset this small; confirm they are wanted as features.

In [None]:
# ---- 2) Cross-validation splitter reused by every scoring call below:
#         5 stratified folds, shuffled with a fixed seed
cv = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

In [22]:
# Candidate classifiers keyed by the display name used in the final report.
# Insertion order is the order in which they are evaluated and printed.
models = dict(
    LogReg=LogisticRegression(max_iter=2000),
    RandomForest=RandomForestClassifier(random_state=42, n_estimators=200),
    # NOTE(review): probability=True is not needed for accuracy scoring and is
    # believed to make SVC fitting slower; kept in case predict_proba is used elsewhere.
    SVC=SVC(kernel="rbf", random_state=42, probability=True),
    DecisionTree=DecisionTreeClassifier(random_state=42, max_depth=5),
    GradBoost=GradientBoostingClassifier(random_state=42),
)

In [23]:
# --- 3) Scoring helper with automatic preprocessing per chosen columns
def score_cols(estimator, cols):
    """Return the mean cross-validated accuracy of `estimator` on the columns `cols`.

    The notebook-level globals ``X``, ``y``, ``cv``, ``all_num`` and ``all_cat``
    are read directly. Columns listed in ``all_num`` are standardised and
    columns listed in ``all_cat`` are one-hot encoded; both steps live inside
    the Pipeline handed to ``cross_val_score``, so preprocessing is fitted
    together with the classifier rather than on the full data up front.

    Parameters
    ----------
    estimator : scikit-learn classifier
        Used as the final ``"clf"`` step of the pipeline.
    cols : sequence of str
        Names of the columns of ``X`` to evaluate.

    Returns
    -------
    float
        Mean of the per-fold accuracy scores.

    Raises
    ------
    ValueError
        If `cols` is empty, or if it contains a column that appears in neither
        ``all_num`` nor ``all_cat``. Such a column would get no transformer and
        would be dropped by the ColumnTransformer, so the score would silently
        describe a different feature set than the one requested.
    """
    cols = list(cols)  # accept any sequence; X[...] needs a list for multi-column selection
    if not cols:
        raise ValueError("score_cols needs at least one feature column")

    num_cols = [c for c in cols if c in all_num]
    cat_cols = [c for c in cols if c in all_cat]

    unclassified = [c for c in cols if c not in all_num and c not in all_cat]
    if unclassified:
        raise ValueError(
            f"Columns {unclassified} are in neither all_num nor all_cat; "
            "add them to one of the lists so they are actually used"
        )

    transformers = []
    if num_cols:
        transformers.append(("num", StandardScaler(), num_cols))
    if cat_cols:
        # handle_unknown="ignore": categories unseen during fit do not raise at predict time.
        transformers.append(("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols))

    pre = ColumnTransformer(transformers)
    pipe = Pipeline([("pre", pre), ("clf", estimator)])
    return cross_val_score(pipe, X[cols], y, cv=cv, scoring="accuracy").mean()

# --- 4) Simple greedy forward selection (reusable for any model)
def forward_select_for_model(estimator, all_cols, min_improve=0.0, score_fn=None):
    """Greedy forward feature selection driven by a scoring function.

    Starting from no features, each round scores every remaining column added
    to the current selection and keeps the best one. The search stops when no
    candidate beats the current best score by more than `min_improve`, or when
    no columns remain.

    Parameters
    ----------
    estimator : object
        Passed through unchanged to `score_fn`.
    all_cols : iterable
        Candidate column names; copied, never mutated.
    min_improve : float, default 0.0
        A candidate is accepted only if its score is strictly greater than
        ``best + min_improve``.
    score_fn : callable, optional
        ``score_fn(estimator, cols) -> float``. Defaults to the notebook's
        ``score_cols``, looked up only when this argument is omitted.

    Returns
    -------
    (selected, best, history)
        ``selected`` is the list of chosen columns in the order they were
        added, ``best`` is the score of that selection (``0.0`` if nothing was
        selected), and ``history`` is a list of ``(feature, score)`` pairs,
        one per accepted round.

    Notes
    -----
    * Candidates whose score is not finite (e.g. NaN) are skipped. A NaN fails
      every ``<=`` comparison, so without this guard it would be accepted as an
      "improvement" and then poison ``best`` for all later rounds.
    * Ties on score are broken by the larger ``str(feature)``; this matches a
      reverse sort of ``(score, feature)`` for string names but does not raise
      when names are of mixed types.
    * ``best`` is the same score that was used to pick the features, so it is
      an optimistic estimate of performance on unseen data.
    """
    import math  # function-scope import keeps this cell self-contained

    if score_fn is None:
        score_fn = score_cols

    selected, best, remaining = [], 0.0, list(all_cols)
    history = []

    while remaining:
        trials = []
        for feat in remaining:
            acc = score_fn(estimator, selected + [feat])
            if math.isfinite(acc):
                trials.append((acc, feat))

        if not trials:
            break  # every remaining candidate failed to produce a usable score

        # Highest score wins; ties resolved on the feature's string form.
        acc_best, feat_best = max(trials, key=lambda t: (t[0], str(t[1])))
        if acc_best <= best + min_improve:
            break

        selected.append(feat_best)
        remaining.remove(feat_best)
        best = acc_best
        history.append((feat_best, best))

    return selected, best, history

# --- 5) Forward-select once per model, collecting one result tuple each
#        Tuple layout: (model name, best CV accuracy, selected columns, add history)
results = []
for name in models:
    est = models[name]
    outcome = forward_select_for_model(est, list(X.columns), min_improve=0.0)
    selected, best_acc, hist = outcome
    results.append((name, best_acc, selected, hist))

# --- 6) Report: per model, the order features were added and the final score
print("=== Forward Selection Comparison ===")
for name, best_acc, selected, hist in results:
    # Build the whole section for this model, then emit it in one go.
    report = [f"\nModel: {name}", "  Add order (feature -> CV acc):"]
    report.extend(f"   - {feat:>12s} -> {acc:.3f}" for feat, acc in hist)
    report.append(f"  Selected: {selected}")
    report.append(f"  Best 5-fold CV accuracy: {best_acc:.3f}")
    print("\n".join(report))

=== Forward Selection Comparison ===

Model: LogReg
  Add order (feature -> CV acc):
   - social_media -> 0.800
  Selected: ['social_media']
  Best 5-fold CV accuracy: 0.800

Model: RandomForest
  Add order (feature -> CV acc):
   - social_media -> 0.760
   -        sleep -> 0.800
  Selected: ['social_media', 'sleep']
  Best 5-fold CV accuracy: 0.800

Model: SVC
  Add order (feature -> CV acc):
   - social_media -> 0.740
   -        sleep -> 0.820
   -         name -> 0.860
  Selected: ['social_media', 'sleep', 'name']
  Best 5-fold CV accuracy: 0.860

Model: DecisionTree
  Add order (feature -> CV acc):
   - social_media -> 0.760
  Selected: ['social_media']
  Best 5-fold CV accuracy: 0.760

Model: GradBoost
  Add order (feature -> CV acc):
   - social_media -> 0.680
   -         name -> 0.760
  Selected: ['social_media', 'name']
  Best 5-fold CV accuracy: 0.760
