In [None]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import classification_report, roc_auc_score

In [2]:

# Toy binary dataset (all numeric for simplicity)
X, y = make_classification(n_samples=4000, n_features=40, n_informative=8,
                           n_redundant=8, random_state=0)

# 1) Split FIRST  → prevents leakage
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=0
)

# 2) Fit MI feature selector on TRAIN only
k = 15  # number of features to keep (tune this)
selector = SelectKBest(score_func=mutual_info_classif, k=k)
X_train_sel = selector.fit_transform(X_train, y_train)

# 3) Apply SAME mask to test (no refitting on test)
X_test_sel = selector.transform(X_test)

# 4) Train model on reduced features
clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter=2000))
clf.fit(X_train_sel, y_train)

# 5) Evaluate
y_pred = clf.predict(X_test_sel)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91       499
           1       0.92      0.88      0.90       501

    accuracy                           0.91      1000
   macro avg       0.91      0.91      0.91      1000
weighted avg       0.91      0.91      0.91      1000



In [8]:
pipe = Pipeline(steps=[
    ("select", SelectKBest(score_func=mutual_info_classif, k=10)),
    ("scale", StandardScaler()),
    ("clf", LogisticRegression(max_iter=2000))
])
param_grid = {
    "select__k": [5, 10, 15, 20],
    "clf__C": [0.01, 0.1, 1, 10, 100]
}
gs = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)

# Fit the grid search on training data
gs.fit(X_train, y_train)

print("Best params:", gs.best_params_)
print("CV score:", gs.best_score_)

y_proba = gs.predict_proba(X_test)[:, 1]
print("Test ROC AUC:", roc_auc_score(y_test, y_proba))

Best params: {'clf__C': 1, 'select__k': 20}
CV score: 0.8949999999999999
Test ROC AUC: 0.9635798543194174
