In [None]:
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from mlxtend.evaluate import bootstrap
from numpy import format_float_scientific
import pandas as pd

def get_counts(df,LN=False):
    """Build L1-normalized count features and their interaction expansion.

    Parameters
    ----------
    df : pandas.DataFrame
        Design matrix. Every column whose name does NOT start with
        ``"interact"`` is used as a feature, so the caller must make sure
        non-feature columns are not present.
    LN : bool, optional
        Currently unused; kept so existing call sites keep working.

    Returns
    -------
    X : numpy.ndarray
        The selected columns with each row rescaled to unit L1 norm.
    X_interact : numpy.ndarray
        ``PolynomialFeatures(interaction_only=True)`` expansion of ``X``.
    """
    # Select from df.columns: the previous code read X.columns before X was
    # assigned, which raised UnboundLocalError on every call.
    count_cols=[col for col in df.columns if not col.startswith("interact")]
    X=Normalizer(norm='l1').fit_transform(df[count_cols])
    X_interact=PolynomialFeatures(interaction_only=True).fit_transform(X)
    return X, X_interact

def get_intermingling(df,standard_scaler=None,LN=False):
    """Build standardized "interact*" features and their interaction expansion.

    Parameters
    ----------
    df : pandas.DataFrame
        Design matrix. Only columns whose name starts with ``"interact"``
        are used.
    standard_scaler : sklearn.preprocessing.StandardScaler or None, optional
        If ``None`` a new scaler is fitted on ``df``. If an already fitted
        scaler is given, it is applied without refitting (use this for
        held-out data so it is scaled with the training statistics).
    LN : bool, optional
        Currently unused; kept so existing call sites keep working.

    Returns
    -------
    X : numpy.ndarray
        Standardized "interact*" columns.
    X_interact : numpy.ndarray
        ``PolynomialFeatures(interaction_only=True)`` expansion of ``X``.
    standard_scaler : sklearn.preprocessing.StandardScaler
        The scaler that was fitted or passed in.
    """
    # Select from df.columns: the previous code read X.columns before X was
    # assigned, which raised UnboundLocalError on every call.
    adj_cols=[col for col in df.columns if col.startswith("interact")]
    X=df[adj_cols]
    if standard_scaler is None:
        standard_scaler=StandardScaler()
        X=standard_scaler.fit_transform(X)
    else:
        # A supplied scaler was previously ignored and the raw, unscaled
        # frame was returned; apply it without refitting.
        X=standard_scaler.transform(X)
    X_interact=PolynomialFeatures(interaction_only=True).fit_transform(X)
    return X, X_interact, standard_scaler

def return_predictions(X_train,X_test,y_train,y_test,C=0.01):
    """Fit a logistic regression on the training split and score the test split.

    Returns a two-column array with one row per test sample: column 0 is
    ``y_test`` and column 1 is the predicted probability taken from column 1
    of ``predict_proba``.
    """
    # NOTE(review): per the scikit-learn docs, penalty='none' disables
    # regularization, in which case C should have no effect — confirm that an
    # unpenalized fit is what is intended here.
    model=LogisticRegression(random_state=42,C=C,penalty='none',solver='saga')
    model.fit(X_train,y_train)
    positive_proba=model.predict_proba(X_test)[:,1]
    stacked=np.vstack([y_test,positive_proba])
    return stacked.T

def auc(Y):
    """Return ``roc_auc_score`` for a 2-column array.

    Column 0 of ``Y`` is passed as the true labels and column 1 as the
    predicted scores.
    """
    return roc_auc_score(Y[:,0],Y[:,1])

def return_bootstrap_results(Y,fn,round_place=1,n_round=1000):
    """Bootstrap ``fn`` over ``Y`` and format the result as ``"estimate±std_err"``.

    Parameters
    ----------
    Y : array-like
        Data handed to ``bootstrap``.
    fn : callable
        Statistic passed to ``bootstrap`` as ``func``.
    round_place : int, optional
        Precision given to ``format_float_scientific`` for both numbers.
    n_round : int, optional
        Number of bootstrap rounds.

    Returns
    -------
    str
        The rounded point estimate and standard error joined by ``±``. The
        standard error is kept in scientific notation when it rounds to a
        value below 0.001, otherwise it is shown as a plain float.
    """
    # The confidence bounds returned by bootstrap() are not reported.
    original, std_err, _ = bootstrap(Y,
                                     func=fn,
                                     num_rounds=n_round,
                                     ci=0.95,
                                     seed=123)
    estimate=float(format_float_scientific(original,round_place))
    spread=format_float_scientific(std_err,round_place)
    if not float(spread) < 0.001:
        spread=float(spread)
    return "{}±{}".format(estimate,spread)

def get_auc(file):
    """Leave-one-fold-out AUC for six logistic-regression feature sets.

    Parameters
    ----------
    file : str
        Path to a CSV design matrix (first column is the index). It must
        contain a ``y`` label column and a ``fold`` column used as the group
        for ``LeaveOneGroupOut``; every other column is a feature. Columns
        starting with ``"interact"`` are the intermingling features, the
        remaining ones are count features. Missing values are filled with 0.

    Returns
    -------
    dict
        Maps each feature-set name (``'X'``, ``'X_interact'``, ``'X_adj'``,
        ``'X_adj_interact'``, ``'all'``, ``'all_interact'``) to the
        ``"auc±std_err"`` string from ``return_bootstrap_results``, computed
        on the predictions pooled over all held-out folds.
    """
    df=pd.read_csv(file,index_col=0).fillna(0)
    y=df['y'].values
    groups=df['fold']
    # Keep the label and the fold id out of the feature frame: get_counts()
    # takes every non-"interact" column, so leaving them in would leak y into
    # the count features.
    features=df.drop(columns=['y','fold'])
    adj_cols=[col for col in features.columns if col.startswith("interact")]

    logo=LeaveOneGroupOut()
    predictions={'X':[],'X_interact':[],'X_adj':[],"X_adj_interact":[],"all":[],"all_interact":[]}
    for train_index, test_index in logo.split(features, y, groups):
        df_train=features.iloc[train_index]
        df_test=features.iloc[test_index]
        y_train=y[train_index]
        y_test=y[test_index]

        # count features (row-wise normalization, nothing fitted across rows)
        X_count_train,X_count_interact_train=get_counts(df_train)
        X_count_test,X_count_interact_test=get_counts(df_test)

        # intermingling features: fit the scaler on the training fold only and
        # apply that same scaler to the held-out fold. Previously a fresh
        # scaler was fitted on the test fold.
        X_adj_train,X_adj_interact_train,ss=get_intermingling(df_train,standard_scaler=None)
        X_adj_test=ss.transform(df_test[adj_cols])
        X_adj_interact_test=PolynomialFeatures(interaction_only=True).fit_transform(X_adj_test)

        # combined features and their full second-order expansion. These get
        # their own names so they no longer overwrite the count-interaction
        # matrices used by the 'X_interact' model.
        X_train=np.hstack([X_count_train,X_adj_train])
        X_test=np.hstack([X_count_test,X_adj_test])
        second_order=PolynomialFeatures(interaction_only=False)
        X_all_interact_train=second_order.fit_transform(X_train)
        X_all_interact_test=second_order.transform(X_test)

        # models
        predictions['X'].append(return_predictions(X_count_train,X_count_test,y_train,y_test))
        predictions['X_interact'].append(return_predictions(X_count_interact_train,X_count_interact_test,y_train,y_test))
        predictions['X_adj'].append(return_predictions(X_adj_train,X_adj_test,y_train,y_test))
        predictions['X_adj_interact'].append(return_predictions(X_adj_interact_train,X_adj_interact_test,y_train,y_test))
        predictions['all'].append(return_predictions(X_train,X_test,y_train,y_test))
        predictions['all_interact'].append(return_predictions(X_all_interact_train,X_all_interact_test,y_train,y_test))

    # Pool the (y_true, y_score) rows from every held-out fold per model.
    for k in predictions:
        predictions[k]=np.vstack(predictions[k])

    return {k: return_bootstrap_results(pooled,auc,round_place=1) for k,pooled in predictions.items()}

In [None]:
# Evaluate every feature set on design.matrix.csv; the returned dict of
# formatted AUC strings is this cell's last expression.
get_auc("design.matrix.csv")