In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline 
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn import set_config
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn import tree
from sklearn.cluster import SpectralBiclustering
import math


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [2]:
# Notebook logging setup: keep an append-mode log file next to the notebook.
import sys
import logging

# Opened in "a+" mode so repeated runs append to the same file. The handle is
# not closed in this cell; it stays referenced by the objects assigned below.
nblog = open("bic_prediction.log", "a+")
# NOTE(review): presumably relies on the kernel's stdout/stderr stream objects
# honouring an `echo` attribute to mirror cell output into the file -- confirm
# against the ipykernel version in use.
sys.stdout.echo = nblog
sys.stderr.echo = nblog

# Point the first handler of the IPython application logger at the same file
# and set that logger's level to INFO.
get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

%autosave 5

Autosaving every 5 seconds


In [3]:
# --- Load the dataset for the selected time window ---------------------------
tw = 4  # time-window identifier; part of the input and output file names
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept because other cells may read it.
dir = 'lisbon'
df = pd.read_csv(f'../data/{dir}/conversion_ad/{tw}tw_no_norm.csv')

# Target vector.
y = df['Evolution'].copy()

# Predictors: everything except identifiers, bookkeeping columns and the target.
# `drop` without inplace returns a new frame, so `df` itself is left untouched.
all_features = df.drop(columns=['Code', 'Group', 'BBA', 'CSFdate', 'comentarios',
                                'Conversion', 'data', 'tempofollowup', 'Evolution'])

# Encode sex as 0/1. The result is assigned back to the column instead of
# calling replace(inplace=True) on the column selection: that chained in-place
# form acts on an intermediate object, triggers a pandas warning, and leaves
# the frame unchanged once Copy-on-Write is enabled.
all_features['sexo'] = all_features['sexo'].replace({'M': 0, 'F': 1})

# Every column other than the two categorical ones is treated as numerical.
numerical_features = [feature for feature in all_features.columns if feature not in ['sexo', 'Cluster']]
all_features

Unnamed: 0,idade,sexo,CSFdatatTau,CSFdatapTau,CSFdataabeta42,MMSE,Cluster,NRP2,APOA1,FETUA,...,B4GA1,KV127,NAR3,MYO6,MANBA,SODM,FIBG,CNTP4,HV349,A2AP
0,76,1,618,95,552,27,0,0.000155,0.00331,0.000652,...,0.00184,9.05e-06,7e-05,5.9e-05,0.00021,7.9e-05,0.00122,0.000135,0.000127,0.000954
1,77,0,563,77,349,30,1,2e-05,0.00434,0.000902,...,0.00201,1.46e-06,0.000161,3.4e-05,0.00125,4.3e-05,0.00257,0.000411,9.6e-05,0.00055
2,72,0,1201,135,399,22,0,3.1e-05,0.00223,0.000419,...,0.00258,6.54e-06,0.000165,3.7e-05,0.000333,0.000297,0.00173,0.000242,7.7e-05,0.000539
3,61,1,680,104,389,26,1,0.000145,0.00514,0.00117,...,0.00202,2.54e-05,0.000173,4.9e-05,0.00112,8.4e-05,0.0023,0.000868,0.000151,0.000663
4,63,1,200,26,324,22,1,1.2e-05,0.00313,0.000769,...,0.00154,7.52e-07,0.000116,0.000372,0.000826,0.000284,0.00124,0.000973,0.000117,0.000697
5,59,1,540,74,504,24,0,3.2e-05,0.0037,0.000844,...,0.00223,3.09e-06,0.000169,5.2e-05,0.000279,9.8e-05,0.00143,0.000432,0.000141,0.000695
6,74,1,404,68,855,23,1,1.9e-05,0.00487,0.00105,...,0.00229,9.84e-06,0.000239,2.9e-05,0.000466,7.4e-05,0.00277,0.000298,0.0001,0.00112
7,73,0,770,103,540,23,1,2e-05,0.00445,0.00107,...,0.00146,1.4e-06,0.000107,4.9e-05,0.000207,7.6e-05,0.00243,0.000317,0.00016,0.00113
8,61,1,1080,110,475,25,1,1.3e-05,0.00443,0.000885,...,0.00125,1.01e-05,9.1e-05,2.9e-05,0.000761,5.3e-05,0.00235,0.000205,0.000158,0.000899
9,71,0,702,86,558,26,0,1.3e-05,0.00451,0.000796,...,0.00193,8.91e-06,0.000169,3.8e-05,0.000262,7e-05,0.00231,0.000928,9.9e-05,0.000514


In [4]:
class DynamicSMOTENC(BaseEstimator):
    """Over-sampler that chooses between SMOTENC and SMOTE from the columns of X.

    Each call to `fit` or `fit_resample` inspects the columns of the X it
    receives: if any of them is listed in `categorical_columns`, a SMOTENC
    sampler configured with those column positions is used, otherwise a plain
    SMOTE sampler is used.

    Parameters
    ----------
    sampling_strategy : forwarded unchanged to SMOTE / SMOTENC.
    random_state : forwarded unchanged to SMOTE / SMOTENC.
    categorical_columns : collection of column names treated as categorical.
        Defaults to ('sexo', 'Cluster'), the names that were previously
        hard-coded in this class.

    Attributes set by fit / fit_resample
    ------------------------------------
    cat_indices : list of positional indices of the categorical columns in X.
    smote : the SMOTE or SMOTENC instance built for the most recent call.
    """

    def __init__(self, sampling_strategy=0.95, random_state=42,
                 categorical_columns=('sexo', 'Cluster')):
        # Parameters are stored exactly as received so that scikit-learn's
        # clone() / get_params() round-trip keeps working.
        self.sampling_strategy = sampling_strategy
        self.random_state = random_state
        self.categorical_columns = categorical_columns
        self.smote = None

    def _make_sampler(self, X):
        """Build, store and return the sampler appropriate for the columns of X."""
        # Inputs without a `columns` attribute (e.g. plain arrays) carry no
        # column names, so none can match and plain SMOTE is selected.
        columns = getattr(X, 'columns', ())
        self.cat_indices = [i for i, column in enumerate(columns)
                            if column in self.categorical_columns]

        if self.cat_indices:
            self.smote = SMOTENC(categorical_features=self.cat_indices,
                                 categorical_encoder=OneHotEncoder(sparse_output=False),
                                 sampling_strategy=self.sampling_strategy,
                                 random_state=self.random_state)
        else:
            self.smote = SMOTE(sampling_strategy=self.sampling_strategy,
                               random_state=self.random_state)
        return self.smote

    def fit(self, X, y=None):
        """Select the sampler for X, call its `fit(X, y)` and return self."""
        self._make_sampler(X).fit(X, y)
        return self

    def fit_resample(self, X, y=None):
        """Select the sampler for X and return its `fit_resample(X, y)` result."""
        return self._make_sampler(X).fit_resample(X, y)


In [5]:
# Candidate classifiers and the hyper-parameter grid explored for each of them.
# Grid keys use the ``classification__`` prefix, i.e. the name under which the
# classifier is registered as a pipeline step in the grid-search cell.

_logistic_regression_grid = {
    'classification__C': [0.1, 0.5, 1, 5, 10],
    'classification__class_weight': [None, 'balanced'],
    'classification__penalty': ['l1', 'l2'],
    'classification__random_state': [42],
}

_random_forest_grid = {
    'classification__n_estimators': [10, 50, 100, 200],
    'classification__max_depth': [None, 10, 15, 20, 25],
    'classification__class_weight': [None, 'balanced'],
    'classification__random_state': [42],
}

_xgboost_grid = {
    'classification__n_estimators': [10, 50, 100, 300],
    'classification__max_depth': [5, 2, 10, 15, 20, 30],
    'classification__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1],
    'classification__objective': ['binary:logistic'],
    'classification__random_state': [42],
}

_svc_grid = {
    'classification__C': [0.1, 0.5, 1, 10, 5, 20],
    'classification__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    # Always enabled in the grid; the AUC scorer defined in a later cell asks
    # for probability estimates.
    'classification__probability': [True],
    'classification__class_weight': [None, 'balanced'],
    'classification__random_state': [42],
}

_naive_bayes_grid = {
    'classification__force_alpha': [True, False],
    'classification__norm': [True, False],
}

_decision_tree_grid = {
    'classification__max_depth': [None, 10, 15, 20, 25],
    'classification__class_weight': [None, 'balanced'],
    'classification__random_state': [42],
}

# Insertion order is the order in which the classifiers get evaluated.
# NOTE(review): the key 'XGBgClassifier' looks like a typo for 'XGBClassifier';
# it is kept verbatim because it is the label printed and stored in the results.
classifiers = {
    'LogisticRegression': {
        'model': LogisticRegression(solver='liblinear'),
        'params': _logistic_regression_grid,
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': _random_forest_grid,
    },
    'XGBgClassifier': {
        'model': XGBClassifier(),
        'params': _xgboost_grid,
    },
    'SVC': {
        'model': SVC(),
        'params': _svc_grid,
    },
    'NB': {
        'model': ComplementNB(),
        'params': _naive_bayes_grid,
    },
    'DTClassifier': {
        'model': tree.DecisionTreeClassifier(),
        'params': _decision_tree_grid,
    },
}

In [6]:
def tn(y_true, y_pred):
    """Entry [0, 0] of the confusion matrix (true negatives)."""
    matrix = confusion_matrix(y_true, y_pred)
    return matrix[0, 0]


def fp(y_true, y_pred):
    """Entry [0, 1] of the confusion matrix (false positives)."""
    matrix = confusion_matrix(y_true, y_pred)
    return matrix[0, 1]


def fn(y_true, y_pred):
    """Entry [1, 0] of the confusion matrix (false negatives)."""
    matrix = confusion_matrix(y_true, y_pred)
    return matrix[1, 0]


def tp(y_true, y_pred):
    """Entry [1, 1] of the confusion matrix (true positives)."""
    matrix = confusion_matrix(y_true, y_pred)
    return matrix[1, 1]


def sens(y_true, y_pred):
    """Sensitivity: tp / (fn + tp)."""
    true_pos = tp(y_true, y_pred)
    false_neg = fn(y_true, y_pred)
    return true_pos / (false_neg + true_pos)


def spec(y_true, y_pred):
    """Specificity: tn / (fp + tn)."""
    true_neg = tn(y_true, y_pred)
    false_pos = fp(y_true, y_pred)
    return true_neg / (false_pos + true_neg)

import inspect

# Sensitivity scorer (built from the hard class predictions)
sensitivity_scorer = make_scorer(sens)

# Specificity scorer
specificity_scorer = make_scorer(spec)

# AUC scorer, computed from predict_proba output.
# `needs_proba=True` is deprecated in scikit-learn 1.4 and removed in 1.6 in
# favour of `response_method`. The keyword is chosen from make_scorer's own
# signature so the cell runs unchanged on both older and newer releases.
if 'response_method' in inspect.signature(make_scorer).parameters:
    auc_scorer = make_scorer(roc_auc_score, response_method='predict_proba')
else:
    auc_scorer = make_scorer(roc_auc_score, needs_proba=True)

# accuracy scorer
accuracy_scorer = make_scorer(accuracy_score)

In [7]:
class Biclustering(BaseEstimator, TransformerMixin):
    """Describe every sample by its distance to spectral-bicluster patterns.

    `fit` runs scikit-learn's SpectralBiclustering on X, pairs row cluster k
    with column cluster k to form candidate biclusters, discards those with
    fewer than two rows or two columns, and stores for each remaining
    bicluster its *pattern*: the column-wise mean of the bicluster's rows,
    restricted to the bicluster's columns.

    `transform` returns one feature per bicluster ("bic_0", "bic_1", ...):
    the Euclidean distance between the sample (restricted to the bicluster's
    columns) and the pattern, divided by the number of columns in the pattern.

    Parameters
    ----------
    n_clusters, method, svd_method, random_state :
        forwarded unchanged to SpectralBiclustering.

    Attributes
    ----------
    biclusters : list of [row_labels, column_labels] pairs kept after filtering.
    biclustering : the fitted SpectralBiclustering instance (set by `fit`).
    patterns_ : list of pandas Series, one per entry of `biclusters`, holding
        the pattern learned from the data passed to `fit`.
    """

    def __init__(self, n_clusters = 3, method = 'bistochastic',svd_method = 'randomized', random_state=42):
        self.n_clusters = n_clusters
        self.method = method
        self.svd_method = svd_method
        self.random_state = random_state
        self.biclusters = []

    def fit(self, X, y=None):
        """Find biclusters in X and learn one pattern per bicluster.

        X may be a DataFrame or anything `pd.DataFrame` accepts; y is ignored.
        Returns self.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # Clear biclusters to prevent accumulation across repeated fits
        self.biclusters = []

        self.biclustering = SpectralBiclustering(
            n_clusters=self.n_clusters,
            method=self.method,
            svd_method=self.svd_method,
            random_state=self.random_state)
        self.biclustering.fit(X)
        self.post_processing_bicluster(X)
        self.filter_trivial()

        # Patterns are learned here, from the fitting data only. Row labels of
        # the fitting data are meaningless for any other X, so `transform`
        # must not look rows up again in the data it is given.
        self.patterns_ = [X.loc[rows, cols].mean(axis=0)
                          for rows, cols in self.biclusters]
        return self

    def get_number_bics(self):
        """Return the number of biclusters currently stored."""
        return len(self.biclusters)

    def filter_trivial(self):
        """Keep only biclusters that have at least two rows and two columns."""
        self.biclusters = [bic for bic in self.biclusters
                           if len(bic[0]) >= 2 and len(bic[1]) >= 2]

    def post_processing_bicluster(self,X):
        """Append one [rows, columns] pair per cluster label to `self.biclusters`.

        Row cluster k (as index labels of X) is paired with column cluster k
        (as column labels of X). X must be the data the underlying
        SpectralBiclustering was fitted on.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        row_labels = self.biclustering.row_labels_
        column_labels = self.biclustering.column_labels_
        n_biclusters = max(list(row_labels) + list(column_labels)) + 1

        rows = [[] for _ in range(n_biclusters)]
        cols = [[] for _ in range(n_biclusters)]
        for label, row_name in zip(row_labels, X.index):
            rows[label].append(row_name)
        for label, column_name in zip(column_labels, X.columns):
            cols[label].append(column_name)

        self.biclusters.extend([r, c] for r, c in zip(rows, cols))

    def transform(self, X,y = None):
        """Return the distance of every row of X to each learned pattern.

        Returns a DataFrame indexed like X with columns "bic_0", "bic_1", ...
        (one per stored bicluster; zero columns if none survived filtering).

        Only the feature frame is returned. Pipelines hand the return value of
        `transform` directly to the next step as its X, so returning an
        `(X, y)` tuple would pass a tuple to the classifier. `y` is accepted
        for signature compatibility and ignored.

        Raises AttributeError if called before `fit`.
        """
        if not hasattr(self, 'patterns_'):
            raise AttributeError(
                "Biclustering instance is not fitted yet; call fit before transform.")
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        names = ["bic_" + str(i) for i in range(len(self.patterns_))]
        distances = []
        for pattern in self.patterns_:
            # Restrict X to the bicluster's columns, in the pattern's order.
            diff = (X.loc[:, pattern.index].to_numpy(dtype=float)
                    - pattern.to_numpy(dtype=float))
            # nansum mirrors the NaN-skipping sum of the pandas formulation.
            distances.append(np.sqrt(np.nansum(diff ** 2, axis=1)) / len(pattern))

        if distances:
            data = np.column_stack(distances)
        else:
            data = np.empty((len(X), 0))
        return pd.DataFrame(data, index=X.index, columns=names)

In [None]:
# Cross-validation procedure: stratified 5-fold, repeated `n_repeats` times
# (currently a single repetition).
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=42)
all_scores = []
k_scores = []
# One summary dict per classifier
results = []
set_config(transform_output="pandas")
# Min-max scale the numerical columns; pass the categorical ones through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', 'passthrough', ['sexo', 'Cluster'])
    ], remainder='passthrough', verbose_feature_names_out=False, sparse_threshold=0)

# Biclustering hyper-parameters searched for every classifier.
bic_params = {'biclustering__n_clusters' : list(range(3,36)),
              'biclustering__method' : ['bistochastic', 'scale', 'log'],
              'biclustering__svd_method' : ['randomized', 'arpack']}

scoring = {'AUC': auc_scorer, 'Sensitivity': sensitivity_scorer,
           'Specificity': specificity_scorer, 'Accuracy': accuracy_scorer}


def _latex_escape(text):
    """Return str(text) with characters that are special in LaTeX text escaped."""
    return str(text).translate(str.maketrans({
        '_': r'\_', '{': r'\{', '}': r'\}', '#': r'\#', '%': r'\%', '&': r'\&'}))


for name, clf in classifiers.items():
    print(name)
    pipeline = ImbPipeline([('scaler', preprocessor),
                            ('SMOTE', DynamicSMOTENC(sampling_strategy=0.8, random_state = 42)),
                            ('biclustering', Biclustering()),
                            ('classification', clf['model'])])

    # Merge into a fresh dict so the global `classifiers` grids are not
    # modified as a side effect of running this cell.
    param_grid = {**clf['params'], **bic_params}

    # Perform grid search with cross-validation; the model is refit on AUC
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, refit = 'AUC', n_jobs=-1)
    grid_search.fit(all_features, y)

    # Index of the best parameter combination (based on AUC score)
    best_index = grid_search.best_index_
    # Cross-validated metrics of that combination
    best_sensitivity = grid_search.cv_results_['mean_test_Sensitivity'][best_index]
    best_std_sensitivity = grid_search.cv_results_['std_test_Sensitivity'][best_index]
    best_specificity = grid_search.cv_results_['mean_test_Specificity'][best_index]
    best_std_specificity = grid_search.cv_results_['std_test_Specificity'][best_index]
    best_accuracy = grid_search.cv_results_['mean_test_Accuracy'][best_index]
    best_std_accuracy = grid_search.cv_results_['std_test_Accuracy'][best_index]
    # Best estimator (pipeline) from the grid search
    best_model = grid_search.best_estimator_

    # The classifier's input features are the bicluster distances produced by
    # the refit pipeline's biclustering step, named "bic_<i>" by its transform.
    n_bics = best_model.named_steps['biclustering'].get_number_bics()
    bic_feature_names = ["bic_" + str(i) for i in range(n_bics)]

    # Store results ('#features' and 'features' are read by the report below)
    results.append({
    'classifier': name,
    'best_params': grid_search.best_params_,
    '#features': n_bics,
    'features': bic_feature_names,
    'best_auc': grid_search.best_score_,
    'best_std_auc' :  grid_search.cv_results_['std_test_AUC'][best_index],
    'best_sensitivity': best_sensitivity,
    'best_std_sensitivity': best_std_sensitivity,
    'best_specificity': best_specificity,
    'best_std_specificity': best_std_specificity,
    'best_accuracy': best_accuracy,
    'best_std_accuracy': best_std_accuracy,
    })
    print('best_std_auc :',  grid_search.cv_results_['std_test_AUC'][best_index])

# Find the best classifier based on AUC
best_result = max(results, key=lambda x: x['best_auc'])
best = f"Best Classifier: {best_result['classifier']} \n" + f"Number of features: {best_result['#features']}\n" + "Features:" + str(best_result['features']) +"\n" + \
f"Best Params: {best_result['best_params']}" +"\n" + f"Best AUC: {best_result['best_auc']:.4f}" +"\n" + f"Best Sens: {best_result['best_sensitivity']:.4f}" +"\n"+ \
f"Best Spec: {best_result['best_specificity']:.4f}" + "\n"  + f"Best Accuracy: {best_result['best_accuracy']:.4f}" + "\n"

# Parameter and feature names contain underscores and the params repr contains
# braces; both are escaped so the generated table compiles in LaTeX.
latex_classifier = _latex_escape(best_result['classifier'])
latex_params = _latex_escape(best_result['best_params'])
latex_features = _latex_escape(', '.join(best_result['features']))

# Create the LaTeX table as a string
latex_table = f"""
\\begin{{table}}[htbp]
\\centering
\\caption{{Best Classifier Performance and Selected Features}}
\\begin{{tabular}}{{|l|l|}}
\\hline
\\textbf{{Best Classifier}} & {latex_classifier} \\\\
\\hline
\\textbf{{Classifier Parameters}} & {latex_params} \\\\
\\hline
\\textbf{{Number of Features}} & {best_result['#features']} \\\\
\\hline
\\textbf{{Feature List}} & {latex_features} \\\\
\\hline
\\textbf{{Best AUC}} & {best_result['best_auc']:.2f} $\\pm$ {best_result['best_std_auc']:.2f} \\\\
\\hline
\\textbf{{Best Sensitivity}} & {best_result['best_sensitivity']:.2f} $\\pm$ {best_result['best_std_sensitivity']:.2f} \\\\
\\hline
\\textbf{{Best Specificity}} & {best_result['best_specificity']:.2f} $\\pm$ {best_result['best_std_specificity']:.2f} \\\\
\\hline
\\textbf{{Best Accuracy}} & {best_result['best_accuracy']:.2f} $\\pm$ {best_result['best_std_accuracy']:.2f} \\\\
\\hline
\\end{{tabular}}
\\label{{tab:best_classifier}}
\\end{{table}}
"""
print(best)
#print(latex_table)

with open(f'{tw}tw_biclustering.txt', 'w') as f:
    f.write(latex_table)

LogisticRegression
