In [10]:
import pandas as pd
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline 
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn import set_config
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV, LeaveOneOut
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn import tree
from sklearn.cluster import SpectralBiclustering
import math
from collections import defaultdict
from sklearn.model_selection import train_test_split


In [11]:
import sys
import logging

nblog = open("bic_prediction.log", "a+")
sys.stdout.echo = nblog
sys.stderr.echo = nblog

get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

%autosave 5

Autosaving every 5 seconds


In [12]:
tw = 2
dir ='lisbon'
df = pd.read_csv('../data/'+dir+'/conversion_ad/{}tw_no_norm.csv'.format(tw))
all_features = df.copy()
y = df['Evolution'].copy()
all_features.drop(columns = ['Code','Group', 'BBA', 'CSFdate', 'comentarios', 'Conversion', 'data', 'tempofollowup', 'Evolution', 'Cluster'], inplace = True) # drop unwanted columns
all_features['sexo'].replace({'M' : 0, 'F': 1}, inplace = True)
numerical_features = [feature for feature in all_features.columns if feature not in ['sexo']]
all_features

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_features['sexo'].replace({'M' : 0, 'F': 1}, inplace = True)
  all_features['sexo'].replace({'M' : 0, 'F': 1}, inplace = True)


Unnamed: 0,idade,sexo,CSFdatatTau,CSFdatapTau,CSFdataabeta42,MMSE,NRP2,APOA1,FETUA,A1AG1,...,B4GA1,KV127,NAR3,MYO6,MANBA,SODM,FIBG,CNTP4,HV349,A2AP
0,76,1,618,95,552,27,0.000155,0.00331,0.000652,0.000652,...,0.00184,9.05e-06,7e-05,5.9e-05,0.00021,7.9e-05,0.00122,0.000135,0.000127,0.000954
1,77,0,563,77,349,30,2e-05,0.00434,0.000902,0.000974,...,0.00201,1.46e-06,0.000161,3.4e-05,0.00125,4.3e-05,0.00257,0.000411,9.6e-05,0.00055
2,72,0,1201,135,399,22,3.1e-05,0.00223,0.000419,0.000876,...,0.00258,6.54e-06,0.000165,3.7e-05,0.000333,0.000297,0.00173,0.000242,7.7e-05,0.000539
3,61,1,680,104,389,26,0.000145,0.00514,0.00117,0.00147,...,0.00202,2.54e-05,0.000173,4.9e-05,0.00112,8.4e-05,0.0023,0.000868,0.000151,0.000663
4,63,1,200,26,324,22,1.2e-05,0.00313,0.000769,0.00166,...,0.00154,7.52e-07,0.000116,0.000372,0.000826,0.000284,0.00124,0.000973,0.000117,0.000697
5,59,1,540,74,504,24,3.2e-05,0.0037,0.000844,0.000875,...,0.00223,3.09e-06,0.000169,5.2e-05,0.000279,9.8e-05,0.00143,0.000432,0.000141,0.000695
6,74,1,404,68,855,23,1.9e-05,0.00487,0.00105,0.0011,...,0.00229,9.84e-06,0.000239,2.9e-05,0.000466,7.4e-05,0.00277,0.000298,0.0001,0.00112
7,73,0,770,103,540,23,2e-05,0.00445,0.00107,0.00171,...,0.00146,1.4e-06,0.000107,4.9e-05,0.000207,7.6e-05,0.00243,0.000317,0.00016,0.00113
8,61,1,1080,110,475,25,1.3e-05,0.00443,0.000885,0.00161,...,0.00125,1.01e-05,9.1e-05,2.9e-05,0.000761,5.3e-05,0.00235,0.000205,0.000158,0.000899
9,71,0,702,86,558,26,1.3e-05,0.00451,0.000796,0.00117,...,0.00193,8.91e-06,0.000169,3.8e-05,0.000262,7e-05,0.00231,0.000928,9.9e-05,0.000514


In [13]:
#classifiers to evaluate
# Define classifiers and their parameter grids
classifiers = {
    'LogisticRegression': {
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'classification__C': [0.001, 0.01, 0.1, 1, 10, 100],
            'classification__class_weight' : [None, 'balanced'],
            'classification__penalty': ['l1', 'l2'], 'classification__random_state' : [42]
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'classification__n_estimators': [10,50, 100, 200],
            'classification__max_depth': [None, 10, 15,20],
            'classification__max_features' : ['sqrt', 'log2',None],
            'classification__class_weight' : [None, 'balanced'], 'classification__random_state' : [42]
        }
    },
    'XGBgClassifier': {
        'model': XGBClassifier(),
        'params': {
            'classification__n_estimators': [10, 50, 100,300],
            'classification__max_depth': [5,10,20,30],
            'classification__learning_rate': [0.0001, 0.001, 0.01, 0.1,1],
            'classification__objective': ['binary:logistic'], 'classification__random_state' : [42]
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'classification__C': [0.1, 1, 10, 5],
            'classification__kernel': ['linear','poly','rbf', 'sigmoid'],
            'classification__probability': [True],
            'classification__class_weight' : [None, 'balanced'], 'classification__random_state' : [42]
        }
    },
    'NB': {
        'model': GaussianNB(),
        'params': {'classification__var_smoothing' : [10**-9, 10**-8, 10**-10]}
    },
    'DTClassifier': {
    'model': tree.DecisionTreeClassifier(),
    'params': {'classification__max_depth': [None, 10, 20],
              'classification__class_weight' : [None, 'balanced'],
              'classification__random_state' : [42]}
    }
}

In [14]:
def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]


def sens(y_true, y_pred): return tp(y_true, y_pred) / \
    (fn(y_true, y_pred) + tp(y_true, y_pred))


def spec(y_true, y_pred): return tn(y_true, y_pred) / \
    (fp(y_true, y_pred) + tn(y_true, y_pred))

sensitivity_scorer = make_scorer(sens)

# Specificity scorer
specificity_scorer = make_scorer(spec)

# AUC scorer
auc_scorer = make_scorer(roc_auc_score, needs_proba=True)

# accuracy scorer
accuracy_scorer = make_scorer(accuracy_score)



In [23]:
class Biclustering(BaseEstimator,TransformerMixin):
    def __init__(self, n_clusters = 3, method = 'bistochastic',svd_method = 'randomized', random_state=42):
        self.n_clusters = n_clusters
        self.method = method
        self.svd_method = svd_method
        self.random_state = random_state
        self.biclusters = []
    def fit(self, X, y):
        print('ola')
        # Clear biclusters to prevent accumulation
        self.biclusters = []
        self.x_train = X
        self.biclustering = SpectralBiclustering(
            n_clusters=self.n_clusters, 
            method=self.method, 
            svd_method=self.svd_method, 
            random_state=self.random_state)
        
        #biclusters of the positive class
        print(y)
        positive_indices = [i for i, val in enumerate(y) if val > 0]
        print(positive_indices)
        print(X.loc[positive_indices, :])
        self.biclustering.fit(X.loc[positive_indices, :])
        positive_biclusters = self.post_processing_bicluster(X[positive_indices, :])
        positive_biclusters = self.filter_trivial(positive_biclusters)

        # biclusters of the negative class
        negative_indices = [i for i, val in enumerate(y) if val == 0]
        self.biclustering.fit(X.loc[negative_indices, :])
        negative_indices = self.post_processing_bicluster(X[negative_indices, :])
        negative_indices = self.filter_trivial(negative_indices)
        # concat biclusters
        self.biclusters = positive_biclusters + negative_indices
        print(self.biclusters)
        # Check if biclusters were created
        if len(self.biclusters) == 0:
            print("Warning: No biclusters were found.")
        else:
            print(f"Found {len(self.biclusters)} biclusters.")
        return self

    def get_number_bics(self):
        return len(self.biclusters)

    def filter_trivial(self, biclusters):
        print(biclusters)
        return [bic for bic in biclusters if len(bic[0]) >= 2 and len(bic[1]) >= 2]
        #print(self.biclusters)

    def post_processing_bicluster(self,X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        # Extracting the biclusters
        n_biclusters = max(list(self.biclustering.row_labels_) + list(self.biclustering.column_labels_)) + 1
        rows = defaultdict(list)
        cols = defaultdict(list)

        row_labels = self.biclustering.row_labels_
        col_labels = self.biclustering.column_labels_

        for i, label in enumerate(row_labels):
           rows[label].append(X.index[i])

        for j, label in enumerate(col_labels):
            cols[label].append(X.columns[j])
        
        # Build biclusters
        return [[rows[i], cols[i]] for i in range(n_biclusters)]
        
        
    def transform(self, X):
        
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        # Initialize dictionary for distances

        distance_matrix = []
        if len(self.biclusters) == 0:
            print('Warning: No biclusters found, returning zero matrix')
            return np.zeros(((X.shape[0], 1)))
        #col_name = []
        for b in range(len(self.biclusters)):
            P = self.x_train.loc[self.biclusters[b][0],self.biclusters[b][1]] #patterns in the train set
            # Check if P is empty or has NaNs
            if P.empty or P.isnull().values.any():
                continue
            y_labels = P.columns
            P = P.mean(axis = 0)
            if len(P) == 0:
                continue
            # Precompute row submatrix 
            X_submatrix = X.loc[:, y_labels]
            # Make sure shapes match before subtraction
            if X_submatrix.shape[1] != len(P):
                continue  # Skip if shapes are incompatible
            # Compute the distance for each row
            diff = (X_submatrix - P) ** 2
            distance_matrix.append(np.sqrt(diff.sum(axis=1)) / len(P))
            #col_name.append('bic_' + str(b)) 

        if len(distance_matrix) == 0:
            print("Warning: No valid distances computed, returning zero matrix")
            return np.zeros(((X.shape[0], 1)))
        distance_matrix = np.array(distance_matrix).T
        
        if len(distance_matrix[1]) == 1: #if one feature
            distance_matrix = distance_matrix.reshape(-1, 1)
        #distance_matrix = pd.DataFrame(distance_matrix, index = col_name).T
        
        return distance_matrix
    
    def fit_transform(self, X, y):
        return self.fit(X,y).transform(X)
    
    def get_biclusters(self):
        # Initialize an empty list to store each bicluster's data as a row
        bicluster_rows = []
        
        for b in range(len(self.biclusters)):
            # Dictionary to hold data for the current bicluster
            bicluster_data = {'ID': b, 'Pattern': [], 'n_samples': []}

            # Loop through the features (columns) for this bicluster
            for feature in self.biclusters[b][1]:
                # Get the data for the rows and current feature in the bicluster
                feature_values = self.x_train.loc[self.biclusters[b][0], feature]
                
                # Calculate the min and max values
                min_val = feature_values.min()
                max_val = feature_values.max()
                
                # Append the pattern in the format 'feature = [min, max]'
                bicluster_data['Pattern'].append(f'{feature} = [{min_val:.2f}, {max_val:.2f}]')
            bicluster_data['n_samples'].append(len(feature_values))
            # Append the bicluster data to the list of rows
            bicluster_rows.append(bicluster_data)
        
        # Convert the list of bicluster data into a DataFrame
        bicluster_table = pd.DataFrame(bicluster_rows)
        
        # Display the table
        print(bicluster_table)
        return bicluster_table

In [24]:
# Define the cross-validation procedure (5-fold cross-validation with 10 repetitions)
#cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=42)
cv = LeaveOneOut()
all_scores = []
k_scores = []
# List to store results
results = []
#print(all_features)
set_config(transform_output="pandas")
# Define the ColumnTransformer to apply MinMaxScaler only to numerical columns
#preprocessor = ColumnTransformer(
#    transformers=[
#        ('num', MinMaxScaler(), numerical_features),  # Custom scaler for numerical features
#        ('cat', 'passthrough', ['sexo'])            # Pass categorical columns unchanged
#    ], remainder='passthrough', verbose_feature_names_out=False, sparse_threshold=0)     #remainder='passthrough'  # Ensure other columns are passed through if not specified


for name, clf in classifiers.items():
    print(name)
    pipeline = ImbPipeline([#('scaler', preprocessor),
                            ('biclustering', Biclustering()),
                            ('classification', clf['model'])])
    
    bic_params = {'biclustering__n_clusters' : list(range(2,36)), 
                  'biclustering__method' : ['bistochastic', 'scale', 'log'],
                  'biclustering__svd_method' : ['randomized', 'arpack']}
    
    clf['params'].update(bic_params)  

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(pipeline, clf['params'], cv=cv, scoring={'AUC': auc_scorer, 'Sensitivity': sensitivity_scorer, 'Specificity': specificity_scorer, 'Accuracy': accuracy_scorer}, refit = 'AUC', n_jobs=-1)
    grid_search.fit(all_features, y)
    # Get the index of the best model (based on AUC score)
    best_index = grid_search.best_index_
     # Extract the sensitivity and specificity for the best model
    best_sensitivity = grid_search.cv_results_['mean_test_Sensitivity'][best_index]
    best_std_sensitivity = grid_search.cv_results_['std_test_Sensitivity'][best_index]
    best_specificity = grid_search.cv_results_['mean_test_Specificity'][best_index]
    best_std_specificity = grid_search.cv_results_['std_test_Specificity'][best_index]
    best_accuracy = grid_search.cv_results_['mean_test_Accuracy'][best_index]
    best_std_accuracy = grid_search.cv_results_['std_test_Accuracy'][best_index]
    # Get the best estimator (pipeline) from the grid search
    best_model = grid_search.best_estimator_
   
    # Store results
    results.append({
    'classifier': name,
    'best_params': grid_search.best_params_,
    'best_auc': grid_search.best_score_,
    'best_std_auc' :  grid_search.cv_results_['std_test_AUC'][best_index],
    'best_sensitivity': best_sensitivity,
    'best_std_sensitivity': best_std_sensitivity,
    'best_specificity': best_specificity,
    'best_std_specificity': best_std_specificity,
    'best_accuracy': best_accuracy,
    'best_std_accuracy': best_std_accuracy,
    })
    print('best_auc :',  grid_search.best_score_)

# Find the best classifier based on accuracy
best_result = max(results, key=lambda x: x['best_auc'])
best = f"Best Classifier: {best_result['classifier']} \n" + \
f"Best Params: {best_result['best_params']}" +"\n" + f"Best AUC: {best_result['best_auc']:.4f}" +"\n" + f"Best Sens: {best_result['best_sensitivity']:.4f}" +"\n"+ \
f"Best Spec: {best_result['best_specificity']:.4f}" + "\n"  + f"Best Accuracy: {best_result['best_accuracy']:.4f}" + "\n"


# Create the LaTeX table as a string
latex_table = f"""
\\begin{{table}}[htbp]
\\centering
\\caption{{Best Classifier Performance and Biclustering {tw}tw}}
\\begin{{tabular}}{{|l|l|}}
\\hline
\\textbf{{Best Classifier}} & {best_result['classifier']} \\\\
\\hline
\\textbf{{Classifier Parameters}} & {best_result['best_params']} \\\\
\\hline
\\textbf{{Best AUC}} & {best_result['best_auc']:.2f} $\\pm$ {best_result['best_std_auc']:.2f} \\\\
\\hline
\\textbf{{Best Sensitivity}} & {best_result['best_sensitivity']:.2f} $\\pm$ {best_result['best_std_sensitivity']:.2f} \\\\
\\hline
\\textbf{{Best Specificity}} & {best_result['best_specificity']:.2f} $\\pm$ {best_result['best_std_specificity']:.2f} \\\\
\\hline
\\textbf{{Best Accuracy}} & {best_result['best_accuracy']:.2f} $\\pm$ {best_result['best_std_accuracy']:.2f} \\\\
\\hline
\\end{{tabular}}
\\label{{tab:best_classifier}}
\\end{{table}}
"""
print(best)
#print(latex_table)

with open(f'{tw}tw_biclustering.txt', 'w') as f:
    f.write(latex_table)

LogisticRegression


KeyboardInterrupt: 

In [79]:
set_config(transform_output="pandas")
for k in range(2,6):
    print(k)
    train_set = pd.read_csv('../data/lisbon/conversion_ad/{}tw_no_norm.csv'.format(k))
    y_train = train_set['Evolution'].copy()
    train_set.drop(columns = ['Code','Group', 'BBA', 'CSFdate', 'comentarios', 'Conversion', 'data', 'tempofollowup', 'Evolution', 'Cluster'], inplace = True) # drop unwanted columns
    train_set.loc[:, 'sexo'] = train_set['sexo'].replace({'M' : 0, 'F': 1})
    #X_train, X_test, y_train, y_test = train_test_split(train_set, y_train, test_size=0.2, random_state=42)
    X_train = train_set	
    test_set = pd.read_csv('../data/coimbra/conversion_ad/{}tw_no_norm.csv'.format(k))
    y_test = test_set['Evolution'].copy()
    X_test = test_set[train_set.columns]
    X_test.loc[:, 'sexo'] = X_test['sexo'].replace({'M' : 0, 'F': 1})
    numerical_features = [i for i in X_train.columns if i !='Sexo']

    #test_set['sexo'].replace({'M' : 0, 'F': 1}, inplace = True)

    if k == 2:
        biclustering = Biclustering(method='bistochastic', n_clusters = 32, svd_method='arpack')
        clf = RandomForestClassifier(class_weight=None, max_depth = None, max_features = 'log2', n_estimators = 50, random_state=42)
        k_neighbors = 5
    elif k == 3:
        biclustering = Biclustering(method='bistochastic', n_clusters = 20, svd_method='randomized')
        clf = XGBClassifier(learning_rate=1, max_depth = 5,n_estimators = 300, objective = 'binary:logistic', random_state=42)
        k_neighbors = 5
    elif k == 4: 
        biclustering = Biclustering(method='log', n_clusters = 33, svd_method='randomized')
        clf = XGBClassifier(learning_rate=1, max_depth = 5,n_estimators = 50, objective = 'binary:logistic', random_state=42)
        k_neighbors = 5
    elif k == 5:
        biclustering = Biclustering(method='log', n_clusters = 32, svd_method='arpack')
        clf = RandomForestClassifier(class_weight=None, max_depth = 10, max_features = 'log2', n_estimators = 10, random_state=42)
        k_neighbors = 3

    smote = SMOTENC(k_neighbors=k_neighbors, sampling_strategy=0.8,categorical_encoder = OneHotEncoder(sparse_output = False), categorical_features = ['sexo'], random_state = 42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    scaler = MinMaxScaler()
    X_train[numerical_features]= scaler.fit_transform(X_train[numerical_features])
    X_test[numerical_features] = scaler.transform(X_test[numerical_features])

    X_train = biclustering.fit_transform(X_train)
    X_test = biclustering.transform(X_test)
    bicluster_table = biclustering.get_biclusters()

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
    sensitivity = sens(y_test, y_pred)
    specificity = spec(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    results_str = f'AUC: {auc} \n Sensitivity: {sensitivity} \n Specificity: {specificity} \n Accuracy: {accuracy} \n'
    print(results_str)
    latex_table = f"""
        \\begin{{table}}[htbp]
        \\centering
        \\caption{{Best Classifier Performance and Biclustering {k}tw}}
        \\begin{{tabular}}{{|l|l|}}
        \\hline
        \\textbf{{AUC}} & {auc:.2f}\\\\
        \\hline
        \\textbf{{Sensitivity}} & {sensitivity:.2f} \\\\
        \\hline
        \\textbf{{Specificity}} & {specificity:.2f} \\\\
        \\hline
        \\textbf{{Accuracy}} & {accuracy:.2f} \\\\
        \\hline
        \\end{{tabular}}
        \\label{{tab:best_classifier}}
        \\end{{table}}
        """
    with open(f'{k}tw_biclustering_coimbra_test.txt', 'w') as f:
        f.write(latex_table)
    with open(f'{k}tw_biclusters.txt', 'w') as f:
        f.write(bicluster_table.to_latex(index=False))

2


  train_set.loc[:, 'sexo'] = train_set['sexo'].replace({'M' : 0, 'F': 1})
  X_test.loc[:, 'sexo'] = X_test['sexo'].replace({'M' : 0, 'F': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[numerical_features] = scaler.transform(X_test[numerical_features])


Found 11 biclusters.
    ID                                            Pattern n_samples
0    0  [CSFdataabeta42 = [0.24, 0.50], SLIK4 = [0.09,...       [3]
1    1  [HRG = [0.29, 1.00], CERU = [0.59, 1.00], FHR1...       [3]
2    2  [ALS = [0.44, 0.45], SE6L1 = [0.67, 0.72], CO9...       [5]
3    3  [ZA2G = [0.15, 0.15], LTBP1 = [1.00, 1.00], FS...       [2]
4    4  [C1QT1 = [0.58, 0.60], TPP1 = [0.18, 0.19], C1...       [2]
5    5  [OMD = [0.02, 0.53], SCG1 = [0.13, 0.27], MA1C...       [2]
6    6  [PLXB2 = [0.04, 0.15], GRIA4 = [0.21, 1.00], P...       [2]
7    7  [idade = [0.67, 0.67], MMSE = [0.11, 0.11], AP...       [2]
8    8         [CLC11 = [0.06, 0.06], VWF = [0.04, 0.04]]       [2]
9    9  [C1RL = [0.10, 0.27], FHR2 = [0.06, 0.10], CHR...       [2]
10  10  [NEO1 = [0.18, 0.61], VGF = [0.11, 0.30], GLU2...       [3]
AUC: 0.39560439560439564 
 Sensitivity: 0.0 
 Specificity: 1.0 
 Accuracy: 0.35 

3


  train_set.loc[:, 'sexo'] = train_set['sexo'].replace({'M' : 0, 'F': 1})
  X_test.loc[:, 'sexo'] = X_test['sexo'].replace({'M' : 0, 'F': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[numerical_features] = scaler.transform(X_test[numerical_features])


Found 11 biclusters.
    ID                                            Pattern n_samples
0    0  [PLXB2 = [0.30, 1.00], ALS = [0.46, 1.00], PON...       [3]
1    1  [ZA2G = [0.07, 0.18], CO6 = [0.07, 0.50], ACTS...       [2]
2    2  [A1AG1 = [0.25, 0.26], ANT3 = [0.79, 0.80], OM...       [2]
3    3  [LFNG = [0.06, 0.07], HBB = [0.31, 0.32], HBA ...       [2]
4    4  [KCC2A = [0.18, 0.64], PEBP1 = [0.40, 0.68], P...       [4]
5    5  [IBP7 = [0.66, 0.87], GLU2B = [0.38, 0.40], SA...       [2]
6    6  [KV37 = [0.31, 0.54], A1BG = [0.64, 0.65], GDI...       [2]
7    7  [sexo = [0.00, 1.00], GRIA4 = [0.08, 0.53], SP...       [4]
8    8  [COCA1 = [0.00, 0.61], KLKB1 = [0.03, 0.49], C...       [4]
9    9  [A2GL = [0.25, 0.53], NID1 = [0.26, 0.43], EGF...       [3]
10  10  [APOA1 = [0.54, 0.78], APOA4 = [0.74, 1.00], A...       [2]
AUC: 0.421875 
 Sensitivity: 0.3125 
 Specificity: 0.75 
 Accuracy: 0.4 

4


  train_set.loc[:, 'sexo'] = train_set['sexo'].replace({'M' : 0, 'F': 1})
  X_test.loc[:, 'sexo'] = X_test['sexo'].replace({'M' : 0, 'F': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[numerical_features] = scaler.transform(X_test[numerical_features])


Found 11 biclusters.
    ID                                            Pattern n_samples
0    0  [NID1 = [0.44, 0.48], DCC = [0.05, 0.09], EFNB...       [2]
1    1  [ANT3 = [0.78, 0.87], ICAM5 = [0.19, 0.61], KN...       [2]
2    2  [LFNG = [0.08, 0.08], FSTL5 = [0.75, 0.79], FR...       [2]
3    3  [ZA2G = [0.44, 0.44], LTBP1 = [0.35, 0.35], DH...       [2]
4    4  [COMP = [0.16, 0.64], PLXB2 = [0.33, 0.43], AL...       [2]
5    5  [MMSE = [0.44, 0.44], OMD = [0.16, 0.19], KV10...       [2]
6    6  [APOA4 = [0.32, 0.45], TTHY = [0.19, 0.33], PG...       [2]
7    7  [CSFdataabeta42 = [0.05, 0.07], EGFLA = [0.09,...       [2]
8    8  [GRIA4 = [0.58, 0.88], C1QT1 = [0.35, 0.43], C...       [2]
9    9  [CERU = [0.31, 0.62], HEMO = [0.21, 0.52], A1B...       [2]
10  10  [FETUA = [0.43, 0.58], SORL = [0.13, 0.41], C1...       [2]
AUC: 0.6111111111111112 
 Sensitivity: 1.0 
 Specificity: 0.0 
 Accuracy: 0.9 

5


  train_set.loc[:, 'sexo'] = train_set['sexo'].replace({'M' : 0, 'F': 1})
  X_test.loc[:, 'sexo'] = X_test['sexo'].replace({'M' : 0, 'F': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[numerical_features] = scaler.transform(X_test[numerical_features])


Found 9 biclusters.
   ID                                            Pattern n_samples
0   0  [ANT3 = [0.34, 0.40], CERU = [0.00, 0.63], CBP...       [5]
1   1  [APOL1 = [0.08, 0.49], ZA2G = [0.16, 0.22], PL...       [3]
2   2  [VGF = [0.20, 0.22], ACTS = [0.83, 0.84], DSG2...       [3]
3   3  [idade = [0.05, 0.67], MMSE = [0.11, 0.44], NR...       [3]
4   4  [KV106 = [0.24, 0.24], CBLN4 = [0.49, 0.49], F...       [2]
5   5  [APOA4 = [0.74, 1.00], COCA1 = [0.34, 0.38], E...       [2]
6   6  [IBP6 = [0.99, 1.00], MDHC = [0.76, 0.77], CBP...       [2]
7   7        [CLC11 = [0.03, 0.35], ALBU = [0.43, 0.96]]       [4]
8   8  [A2GL = [0.37, 1.00], SLIK4 = [0.33, 0.58], CA...       [2]
AUC: 0.4473684210526315 
 Sensitivity: 0.8947368421052632 
 Specificity: 0.0 
 Accuracy: 0.85 

