In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from utils import (
    read_data, int_enc, shuffle_Xy, plot_confusion_matrix,
    plot_roc_curves, plot_feature_analysis
)

In [2]:
# Tunable settings, gathered in one place instead of being buried in the pipeline.
SEED = 42             # seed forwarded to PCA so repeated runs give the same components
N_COMPONENTS = 50     # number of principal components kept before the classifier
MAX_ITER = 50000      # iteration cap for LogisticRegression


def make_lr_model(n_components=N_COMPONENTS, max_iter=MAX_ITER, random_state=SEED):
    """Build a fresh, unfitted PCA -> LogisticRegression pipeline.

    A new Pipeline is returned on every call so that each experiment owns its
    estimator; fitting one configuration can then never overwrite the fitted
    state of another.

    Parameters
    ----------
    n_components : int
        Passed to ``PCA(n_components=...)``.
    max_iter : int
        Passed to ``LogisticRegression(max_iter=...)``.
    random_state : int or None
        Passed to ``PCA(random_state=...)``. Per the scikit-learn docs it is
        only consulted by the stochastic SVD solvers, which ``svd_solver="auto"``
        may pick depending on the data shape.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``"pca"`` and ``"lr"``.
    """
    return Pipeline([
        ("pca", PCA(n_components=n_components, random_state=random_state)),
        ("lr", LogisticRegression(max_iter=max_iter)),
    ])


# Kept so that any code referring to `lr_model` still finds a pipeline with
# the same steps; the experiments below use their own independent instances.
lr_model = make_lr_model()

# One entry per experiment: which gene set to load and which model to fit on it.
configs = {
    "LR on GPL96": {
        "genes": "GPL96",
        "model": make_lr_model(),
    },
    "LR on GPL570": {
        "genes": "GPL570",
        "model": make_lr_model(),
    },
}

In [3]:
# For every configured experiment: load the gene-expression table, fit the
# PCA + logistic-regression pipeline on the whole (standardized) dataset and
# plot how much each original feature contributes to the fitted classifier.
for name, config in configs.items():

    genes = config["genes"]
    model = config["model"]
    prefix = name.replace(" ", "")

    # build the input/output locations once so they cannot drift apart
    csv_path = f"data/{genes}Restriction.csv"
    fig_path = f"results/{prefix}Features.png"

    # prepare the dataset
    data = read_data(csv_path)
    data, labels = int_enc(data)

    # grab the feature names from the CSV header (every column except the label)
    feat = pd.read_csv(csv_path, nrows=1)
    feat = feat.columns.tolist()
    feat.remove("label")

    X = data.drop(["label"], axis=1).values
    y = data["label"].values

    # The names come from the raw file while X comes from the processed frame;
    # fail loudly if they disagree rather than plotting mislabelled features.
    if len(feat) != X.shape[1]:
        raise ValueError(
            f"{csv_path}: found {len(feat)} feature names in the CSV header "
            f"but the processed data has {X.shape[1]} feature columns"
        )

    X = StandardScaler().fit_transform(X)
    X, y = shuffle_Xy(X, y)

    # train on whole set
    model.fit(X, y)

    V = model.named_steps["pca"].components_
    W = model.named_steps["lr"].coef_

    # contribution of each feature to each class: project the classifier
    # weights from PCA space back onto the original (standardized) features
    contribution = W @ V # n class x m feat.

    # for each feature, neglecting class
    total_contribution = np.sum(np.abs(contribution), axis=0)

    # make sure the output directory exists before the figure is written
    Path(fig_path).parent.mkdir(parents=True, exist_ok=True)

    plot_feature_analysis(
        total_contribution,
        feat,
        title=f"{name} - Feature Analysis",
        path=fig_path,
    )