In [1]:
# ===============================
# 1. Import Libraries
# ===============================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [5]:
# ===============================
# 2. Load Dataset
# ===============================
# Replace with your own dataset
df = pd.read_csv("./data/dataset.csv")

# Target column name
TARGET = "Label"

X = df.drop(columns=[TARGET])
y = df[TARGET]

print("Original feature count:", X.shape[1])

Original feature count: 1420


In [6]:
# ===============================
# 3. Train-Test Split
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [7]:
# ===============================
# 4. Baseline Model (No Reduction)
# ===============================
baseline_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000))
])

baseline_pipeline.fit(X_train, y_train)
baseline_pred = baseline_pipeline.predict(X_test)

baseline_acc = accuracy_score(y_test, baseline_pred)

In [8]:
# ===============================
# 5. Feature Selection Model
# ===============================
# Select top K important features
K = 100  # change according to your dataset size

fs_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("feature_selection", SelectKBest(score_func=f_classif, k=K)),
    ("clf", LogisticRegression(max_iter=1000))
])

fs_pipeline.fit(X_train, y_train)
fs_pred = fs_pipeline.predict(X_test)

fs_acc = accuracy_score(y_test, fs_pred)

   17   19   20   21   22   23   25   26   27   28   29   31   32   33
   34   35   37   38   39   40   41   43   44   45   46   47   49   50
   51   52   53  217  218  219  220  221  223  224  225  226  227  229
  230  231  232  233  235  236  237  238  239  241  242  243  244  245
  247  248  249  250  251  253  254  255  256  257  259  260  261  262
  263  265  266  267  268  269  864  865  866  867  868  869 1104 1105
 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1176 1177 1178 1179
 1180 1181 1182 1183 1184 1185 1186 1187 1190 1192 1193] are constant.
  f = msb / msw


In [9]:
# ===============================
# 6. PCA Model
# ===============================
# Reduce to K principal components
pca_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=K)),
    ("clf", LogisticRegression(max_iter=1000))
])

pca_pipeline.fit(X_train, y_train)
pca_pred = pca_pipeline.predict(X_test)

pca_acc = accuracy_score(y_test, pca_pred)

In [10]:
# ===============================
# 7. Results Comparison
# ===============================
print("\n===== Model Comparison =====")
print(f"Baseline Accuracy        : {baseline_acc:.4f}")
print(f"Feature Selection Acc    : {fs_acc:.4f}")
print(f"PCA Accuracy             : {pca_acc:.4f}")



===== Model Comparison =====
Baseline Accuracy        : 0.9831
Feature Selection Acc    : 0.9792
PCA Accuracy             : 0.9831
