# Pipeline for feature importance

The idea of this pipeline is to select significant features from a list of features. Presented with a series of rules for feature evaluation the pipeline will run through all of these and record the performance of the models and the importance of features in determining the decisions of the models. Then we should be able to evaluate that data to decide what to include in our model for general prediction. It could be that from each group of features one is particularly important it could also be that there is interaction between groups that makes this significant.

In [45]:
import pandas as pd
import numpy as np
from os import path
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process.kernels import RBF
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    average_precision_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    accuracy_score,
)
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import shap
from sklearn.preprocessing import StandardScaler
import regex as re

## Load and Process Data

In [46]:
data_file = path.join("..", "data", "zoonosis_dataset_full.csv")
target_column = "label"

In [47]:
def prepare_dataframe_for_ml(df, target_column=None, one_hot_encode=True):
    """
    Prepare a pandas DataFrame for machine learning algorithms.
    - Normalizes numerical features
    - Optionally one-hot encodes categorical features
    - Optionally separates target variable

    Parameters:
    -----------
    df : pandas.DataFrame
        The input DataFrame to prepare
    target_column : str, optional
        Name of the target column to separate
    one_hot_encode : bool, optional
        Whether to one-hot encode categorical features

    Returns:
    --------
    df_processed: pandas.DataFrame
        The processed DataFrame
    """

    # Create a copy of the dataframe to avoid modifying the original
    df_processed = df.copy()

    # Separate target if specified
    y = None
    if target_column and target_column in df_processed.columns:
        y = df_processed[target_column].replace({"nz": 0, "hzoon": 1})
        df_processed = df_processed.drop(columns=[target_column])

    # Identify numerical and categorical columns
    numerical_cols = df_processed.select_dtypes(
        include=["int64", "float64"]
    ).columns.tolist()
    categorical_cols = df_processed.select_dtypes(
        include=["object", "category", "bool"]
    ).columns.tolist()

    # Handle missing values
    df_processed[numerical_cols] = df_processed[numerical_cols].fillna(
        df_processed[numerical_cols].median()
    )
    for col in categorical_cols:
        df_processed[col] = df_processed[col].fillna(df_processed[col].mode()[0])

    # Normalize numerical features
    if numerical_cols:
        scaler = StandardScaler()
        df_processed[numerical_cols] = scaler.fit_transform(
            df_processed[numerical_cols]
        )

    # One-hot encode categorical features
    if categorical_cols and one_hot_encode:
        df_processed = pd.get_dummies(
            df_processed, columns=categorical_cols, drop_first=False
        )

    # If we have a target column, add it back to the processed dataframe
    if target_column and y is not None:
        df_processed[target_column] = y

    return df_processed

In [48]:
data = pd.read_csv(data_file)
processed_data = prepare_dataframe_for_ml(
    data, target_column=target_column
)
y = processed_data["label"]
X = processed_data.drop(columns=["label"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

  y = df_processed[target_column].replace({"nz": 0, "hzoon": 1})


## Dataset Splitting Rules

## Model Definitions

In [49]:
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(np.expm1(y_true), np.expm1(y_pred)))

In [50]:
models = [
    ("XGBoost", XGBClassifier(enable_categorical=True)),
    # ("Random Forest", RandomForestClassifier()),
    # ("Ridge Classifier", RidgeClassifier()),
    # ("Decision Tree", DecisionTreeClassifier()),
    # ("Support Vector Classification", SVC()),
    # ("LightGBM", LGBMClassifier()),
    # ("KNN", KNeighborsClassifier(5, weights="uniform")),
    # ("Naive Bayes", GaussianNB()),
    # ("Neural Network", MLPClassifier()),
    # ("Quadratic Discriminant Analysis", QuadraticDiscriminantAnalysis()),
]

In [51]:
def get_feature_importance(model, X):
    try:
        mdi_importances = pd.Series(
            model.feature_importances_, index=X.columns
        ).sort_values(ascending=True)
        return mdi_importances
    except AttributeError:
        pass


def get_permutation_importance(model, X, y):
    result = permutation_importance(model, X, y, n_repeats=10, random_state=42)
    # Create a Series with feature names and their mean importances
    importances = pd.Series(result.importances_mean, index=X.columns)
    # Sort importances from most to least important
    sorted_importances = importances.sort_values(ascending=False)
    return sorted_importances

In [52]:
def get_results_all_models(models, X_train, X_test, y_train, y_test):
    results = {}
    for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        average_precision = average_precision_score(y_test, y_pred)
        feature_importance = get_feature_importance(model, X_test)
        permutation_importance = get_permutation_importance(model, X_test, y_test)
        results[name] = {
            "accuracy": accuracy,
            "roc_auc": roc_auc,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "average_precision": average_precision,
            "feature_importance": feature_importance,
            "permutation_importance": permutation_importance,
            "columns": ",".join(X_test.columns.to_list()),
        }
    return results

In [53]:
def save_results(results, filename):
    results_to_be_saved = pd.DataFrame.from_dict(data=results, orient="index")
    results_to_be_saved.to_csv(path.join("..", "model_comparison_data", filename))

## Pipeline Level Functions

In [54]:
def convert_results_list_to_dataframe(results):
    results_df = pd.DataFrame.from_dict(data=results, orient="index")
    # data_frame = data_frame.append(results_df, ignore_index=True)
    pass

In [55]:
def get_new_test_train_sets(X, y, columns_to_include, test_size=0.2, random_state=42):
    X_dropped = X[columns_to_include]
    X_train, X_test, y_train, y_test = train_test_split(
        X_dropped, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [56]:
def run_pipeline(rules, models, X, y):
    rule_results = {}
    for rule in rules:
        rule_name = rule
        columns_to_include = rules[rule]
        X_train, X_test, y_train, y_test = get_new_test_train_sets(
            X, y, columns_to_include
        )
        results = get_results_all_models(models, X_train, X_test, y_train, y_test)
        rule_results[rule_name] = results
    return rule_results

In [57]:
def extract_columns_from_rule(rule, columns):
    rule_columns = [c for c in columns if re.fullmatch(rule, c)]
    return rule_columns

In [58]:
def get_rules_dict(rules, columns):
    rules_dict = {}
    for rule in rules:
        rule_name = rule
        rule_columns = extract_columns_from_rule(rule, columns)
        rules_dict[rule_name] = rule_columns
    return rules_dict

## Pipeline

In [59]:
genes = [r"HA", r"NA", r"M1", r"NS1", r"NP", r"PA", r"PB1", r"PB2"]
prefix = [
    r"\b([CTGA]){2}_",
    # r"\b([CTGA]){3}_",
    # r"\b([CTGA]){4}_",
    # r"\b([CTGA]){5}_",
    # r"\b([CTGA]){6}_",
    # r"\bDPC_.*_",
    # r"\bCTDC_.*_",
    # r"\bCTDD_.*_",
    # r"\bCTDT_.*_",
    # r"\bCTriad_.*_",
    # r"\bPAAC_.*_",
]
rules = [p + g for p in prefix for g in genes]

In [60]:
rules_dict = get_rules_dict(rules, X.columns)
rules_dict

{'\\b([CTGA]){2}_HA': ['CG_HA',
  'TC_HA',
  'CA_HA',
  'GA_HA',
  'GT_HA',
  'AC_HA',
  'CC_HA',
  'TT_HA',
  'CT_HA',
  'AT_HA'],
 '\\b([CTGA]){2}_NA': ['TT_NA',
  'GC_NA',
  'CA_NA',
  'GG_NA',
  'GT_NA',
  'TG_NA',
  'AC_NA',
  'AG_NA',
  'CC_NA',
  'AA_NA'],
 '\\b([CTGA]){2}_M1': ['AA_M1',
  'GC_M1',
  'CG_M1',
  'CT_M1',
  'TG_M1',
  'CC_M1',
  'GT_M1',
  'AC_M1',
  'TA_M1',
  'GA_M1'],
 '\\b([CTGA]){2}_NS1': ['GA_NS1',
  'CC_NS1',
  'AC_NS1',
  'GT_NS1',
  'AA_NS1',
  'TG_NS1',
  'CG_NS1',
  'CT_NS1',
  'TT_NS1',
  'AT_NS1'],
 '\\b([CTGA]){2}_NP': ['CC_NP',
  'CA_NP',
  'TA_NP',
  'TC_NP',
  'TG_NP',
  'GG_NP',
  'AT_NP',
  'GT_NP',
  'AC_NP',
  'CG_NP'],
 '\\b([CTGA]){2}_PA': ['AA_PA',
  'AC_PA',
  'TC_PA',
  'CA_PA',
  'AG_PA',
  'TT_PA',
  'GG_PA',
  'TA_PA',
  'GA_PA',
  'TG_PA'],
 '\\b([CTGA]){2}_PB1': ['CA_PB1',
  'TT_PB1',
  'GT_PB1',
  'CG_PB1',
  'GG_PB1',
  'TG_PB1',
  'GC_PB1',
  'AC_PB1',
  'AG_PB1',
  'TA_PB1'],
 '\\b([CTGA]){2}_PB2': ['GA_PB2',
  'CC_PB2',
  'GC_PB

In [61]:

all_rule_results = run_pipeline(rules_dict, models, X, y)

In [62]:
all_rule_results

{'\\b([CTGA]){2}_HA': {'XGBoost': {'accuracy': 0.9823393908369593,
   'roc_auc': np.float64(0.8210923933879077),
   'precision': 0.7307692307692307,
   'recall': 0.6495726495726496,
   'f1': 0.6877828054298643,
   'average_precision': np.float64(0.4851816906117865),
   'feature_importance': GT_HA    0.035541
   TT_HA    0.038490
   GA_HA    0.040188
   AT_HA    0.052798
   CA_HA    0.066613
   CG_HA    0.067579
   TC_HA    0.076414
   CT_HA    0.110518
   AC_HA    0.198739
   CC_HA    0.313120
   dtype: float32,
   'permutation_importance': AC_HA    0.010315
   CT_HA    0.009137
   CC_HA    0.008907
   TC_HA    0.007423
   CG_HA    0.007397
   CA_HA    0.005503
   GA_HA    0.005247
   GT_HA    0.004377
   AT_HA    0.003737
   TT_HA    0.003455
   dtype: float64,
   'columns': 'CG_HA,TC_HA,CA_HA,GA_HA,GT_HA,AC_HA,CC_HA,TT_HA,CT_HA,AT_HA'}},
 '\\b([CTGA]){2}_NA': {'XGBoost': {'accuracy': 0.9810596365497825,
   'roc_auc': np.float64(0.7748754031075932),
   'precision': 0.7471264367816092,