In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.inspection import permutation_importance
from sklearn.metrics.pairwise import cosine_similarity

In [2]:

# -----------------------------
# 1. Data Collection + Preprocessing
# -----------------------------
def load_data(path="prep.csv", target="classification_yes"):
    dataset = pd.read_csv(path)
    df = pd.get_dummies(dataset, drop_first=True)   # encode categorical
    X = df.drop(target, axis=1)
    y = df[target]
    return X, y


def split_and_scale(X, y, test_size=0.25):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0
    )
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled


In [3]:

# -----------------------------
# 2. Feature Selection
# -----------------------------
def run_feature_selection(X_train, y_train, X_test, y_test):
    # Chi-square
    chi_selector = SelectKBest(score_func=chi2, k="all")
    chi_selector.fit(abs(X_train), y_train)
    chi_scores = chi_selector.scores_

    # Logistic Regression Coeff
    log_reg = LogisticRegression(max_iter=1000, solver="liblinear")
    log_reg.fit(X_train, y_train)
    log_coeff = np.abs(log_reg.coef_[0])

    # Random Forest importance
    rf = RandomForestClassifier(n_estimators=100, random_state=0)
    rf.fit(X_train, y_train)
    rf_importance = rf.feature_importances_

    # Permutation Importance
    perm = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=0)
    perm_importance = perm.importances_mean

    # RFE
    rfe_selector = RFE(estimator=LogisticRegression(max_iter=1000, solver="liblinear"), 
                       n_features_to_select=1)
    rfe_selector.fit(X_train, y_train)
    rfe_ranking = (max(rfe_selector.ranking_) - rfe_selector.ranking_) + 1

    # Combine
    importance_df = pd.DataFrame({
        "Feature": X_train.columns,
        "Chi2": chi_scores,
        "Logistic_Coeff": log_coeff,
        "RandomForest": rf_importance,
        "Permutation": perm_importance,
        "RFE": rfe_ranking
    }).set_index("Feature")

    return importance_df


In [4]:

# -----------------------------
# 3. Define Models
# -----------------------------
def get_models():
    models = {
        "Logistic": LogisticRegression(max_iter=1000, solver="liblinear"),
        "SVMl": SVC(kernel="linear", probability=True),
        "SVMnl": SVC(kernel="rbf", probability=True),
        "KNN": KNeighborsClassifier(n_neighbors=5),
        "Navie": GaussianNB(),
        "Decision": DecisionTreeClassifier(random_state=0),
        "Random": RandomForestClassifier(n_estimators=100, random_state=0)
    }
    return models

In [5]:
# -----------------------------
# 4. Get Feature Importances
# -----------------------------
def get_feature_importances(models, X_train, X_test, y_train, y_test, scaled=True):
    feature_importances = {}

    for name, model in models.items():
        # Scale only for models needing it
        if name in ["Logistic", "SVMl", "SVMnl"]:
            model.fit(X_train, y_train)
        else:
            model.fit(X_train.values, y_train)

        if name in ["Logistic", "SVMl"]:
            importance = np.abs(model.coef_[0])
        elif name in ["Random", "Decision"]:
            importance = model.feature_importances_
        else:  # SVMnl, KNN, Naive → permutation
            r = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=0)
            importance = r.importances_mean

        feature_importances[name] = importance

    return feature_importances

In [6]:
# -----------------------------
# 5. Compare Importances → DataFrame
# -----------------------------
def compare_importances(feature_importances):
    row_models = ["Logistic", "SVC", "Random", "DecisionTree"]
    col_models = ["Logistic", "SVMl", "SVMnl", "KNN", "Navie", "Decision", "Random"]

    matrix = pd.DataFrame(index=row_models, columns=col_models)

    # Map aliases
    mapping = {"SVC": "SVMnl", "DecisionTree": "Decision"}

    for r in row_models:
        r_key = mapping.get(r, r)
        for c in col_models:
            c_key = mapping.get(c, c)
            sim = cosine_similarity(
                [feature_importances[r_key]], [feature_importances[c_key]]
            )[0][0]
            matrix.loc[r, c] = round(sim, 2)

    return matrix


In [7]:
# -----------------------------
# 6. Master Function (Pipeline)
# -----------------------------
def run_pipeline():
    # Load + split
    X, y = load_data()
    X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled = split_and_scale(X, y)

    # Feature Selection
    feature_df = run_feature_selection(pd.DataFrame(X_train, columns=X.columns), y_train,
                                       pd.DataFrame(X_test, columns=X.columns), y_test)

    # Models
    models = get_models()

    # Feature Importances
    feature_importances = get_feature_importances(
        models, pd.DataFrame(X_train_scaled, columns=X.columns),
        pd.DataFrame(X_test_scaled, columns=X.columns),
        y_train, y_test
    )

    # Compare → Final Matrix
    similarity_matrix = compare_importances(feature_importances)
    return feature_df, similarity_matrix




In [8]:
# -----------------------------
# Run
# -----------------------------
feature_df, final_result = run_pipeline()

print("\n--- Feature Selection Results ---")
print(feature_df.head())

print("\n--- Final Similarity DataFrame ---")
print(final_result)




--- Feature Selection Results ---
                Chi2  Logistic_Coeff  RandomForest  Permutation  RFE
Feature                                                             
age        79.874936        0.007350      0.010809        0.007    4
bp         70.347325        0.062078      0.008730        0.004    6
al        150.356757        1.292601      0.053595        0.006   23
su         67.167568        0.187523      0.004155        0.001   17
bgr      1730.291493        0.033503      0.060795        0.003    5

--- Final Similarity DataFrame ---
             Logistic  SVMl SVMnl   KNN Navie Decision Random
Logistic          1.0  0.92  0.73  0.79  0.59     0.51   0.77
SVC              0.73  0.63   1.0  0.73  0.51     0.39   0.53
Random           0.77  0.71  0.53  0.41  0.24     0.63    1.0
DecisionTree     0.51  0.44  0.39  0.21  0.03      1.0   0.63
