In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import LeaveOneOut, GridSearchCV
from collections import Counter
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score, classification_report
from sklearn import tree
from statannot import add_stat_annotation
from itertools import combinations
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression

# DataSet

In [3]:
## Original datasets
# NOTE(review): the CSV path is missing from this call; pd.read_csv() raises a
# TypeError without its first argument. Restore the path before running.
df = pd.read_csv()

# Replace spaces in the column names with underscores so the "_"-token matching
# below can split every name consistently.
columns = [name.replace(" ", "_") for name in df.columns]
df.columns = columns

# Keep only the rows whose diagnosis is one of the three classes used below.
df = df.loc[df["Diagnosis_simplified"].isin(["HB", "HCC", "NOS"])]

# NOTE(review): `subset` is not defined in any visible cell, so this line fails
# on a fresh kernel. It presumably holds the candidate feature columns of `df`;
# define it explicitly above this line — TODO confirm its intended contents.
subset["Diagnosis_simplified"] = df["Diagnosis_simplified"]

# Every column of `subset` except the last one is treated as a candidate variable.
variables = subset.columns.tolist()[:-1]
array_split = [name.split("_") for name in variables]

# Sort the variables into two groups according to the "bx" / "SR" token in
# their underscore-separated name ("bx" wins when both tokens are present).
variables_bx = []
variables_SR = []
for name, tokens in zip(variables, array_split):
    if "bx" in tokens:
        variables_bx.append(name)
    elif "SR" in tokens:
        variables_SR.append(name)

# Make sure the label column travels with the "bx" features.
# (The original guard misspelled the name as "Diagnosis_simplifed", so the
# membership test could never match.)
if "Diagnosis_simplified" not in variables_bx:
    variables_bx.append("Diagnosis_simplified")

# Complete cases only: rows with a missing value in any selected column are dropped.
df_nano_bx = df[variables_bx].dropna()


### Data anonymization

In [5]:
# Map every original column name of df_nano_bx to an anonymised alias of the
# form "BSC_BM<n>", where n is the 1-based position of the column.
# NOTE(review): the mapping covers *all* columns of df_nano_bx, so the
# "Diagnosis_simplified" label column also receives an alias — confirm this is intended.
dict_genes = {
    col: f"BSC_BM{position + 1}"
    for position, col in enumerate(df_nano_bx.columns)
}

# Apply the whole mapping with one non-in-place rename instead of one
# `inplace=True` rename per column.
df_nano_bx = df_nano_bx.rename(columns=dict_genes)


### Functions

In [6]:
def Logistic_Regression_fun(X_data,Y_data):
    """Score a class-balanced logistic regression with leave-one-out cross-validation.

    Every sample is held out once. For each split the features are standardised
    with statistics learned from the training samples only, an L2-regularised
    logistic regression (C=0.1, balanced class weights) is fitted, and the
    prediction for the held-out sample is recorded. The weighted F1, precision
    and recall are then computed once over all held-out predictions.

    Parameters
    ----------
    X_data : array-like of shape (n_samples, n_features)
        Feature matrix (indexed as a 2-D array).
    Y_data : array-like of shape (n_samples,)
        Class label of every sample.

    Returns
    -------
    tuple
        ``(score_f1, score_precision, score_recall, y_true, y_pred)`` where the
        three scores are weighted averages and ``y_true`` / ``y_pred`` are lists
        holding the true and predicted label of each held-out sample, in
        split order.
    """
    # Work on the arguments that were passed in (the previous version silently
    # read the notebook-level globals `X` and `y` instead).
    X_arr = np.asarray(X_data)
    y_arr = np.asarray(Y_data)

    # create loocv procedure
    cv = LeaveOneOut()
    y_true, y_pred = [], []
    for train_ix, test_ix in cv.split(X_arr):
        # split data
        X_train, X_test = X_arr[train_ix, :], X_arr[test_ix, :]
        y_train, y_test = y_arr[train_ix], y_arr[test_ix]
        # Fit the scaler on the training fold only, so the held-out sample does
        # not leak into the preprocessing statistics.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        # fit model
        model = LogisticRegression(class_weight="balanced", solver='lbfgs', C = 0.1, penalty="l2", max_iter = 100000)
        model.fit(X_train, y_train)
        # Each leave-one-out test fold holds exactly one sample.
        yhat = model.predict(X_test)
        y_true.append(y_test[0])
        y_pred.append(yhat[0])

    # Score once, over the complete set of held-out predictions. zero_division=0
    # is used for all three metrics so an unpredicted class scores 0 consistently.
    score_f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    score_precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    score_recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)

    return(score_f1, score_precision, score_recall, y_true, y_pred)
        
def combinations_fun(features):
    """Return every non-empty combination of ``features`` as a list of names.

    Combinations are produced in order of increasing size (singletons first,
    the full set last). Any "(" or ")" characters are stripped from each name.
    """
    cleaned_combinations = []
    # Starting at size 1 skips the single empty combination of size 0.
    for size in range(1, len(features) + 1):
        for combo in combinations(features, size):
            cleaned_combinations.append(
                [name.replace(")", "").replace("(", "") for name in combo]
            )
    return cleaned_combinations

def confusion_matrix_fun(df):
    """Draw annotated confusion-matrix heatmaps for the first rows of ``df``.

    The figure is a 2x2 grid, so at most four rows are plotted (row-major
    order). Each cell of a heatmap is annotated with its count and its share of
    all samples in that matrix. The finished figure is written to
    'Confusion_matrix.jpg' in the current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "y_true" and "y_pred" (per-row sequences of
        labels) and "Biomarker" (used as the panel title). Rows are read by
        position.
    """
    fig, axes = plt.subplots(2, 2, figsize=(10, 10), squeeze=False)
    n_panels = min(df.shape[0], axes.size)

    # axes.flat walks the grid row by row: (0, 0), (0, 1), (1, 0), (1, 1).
    for i, ax in zip(range(n_panels), axes.flat):
        y_true = df["y_true"].iloc[i]
        y_pred = df["y_pred"].iloc[i]

        # Derive the classes from the row being plotted (not from a notebook
        # global) and hand them to confusion_matrix so that the matrix layout,
        # the annotation grid and the tick labels always agree.
        classes = sorted(set(y_true) | set(y_pred))
        cf_matrix = confusion_matrix(y_true, y_pred, labels=classes)

        group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
        group_percentages = ["{0:.2%}".format(value) for value in
                             cf_matrix.flatten()/np.sum(cf_matrix)]
        labels = [f"{v1}\n{v2}\n" for v1, v2 in
                  zip(group_counts, group_percentages)]
        labels = np.asarray(labels).reshape(cf_matrix.shape)

        matrix = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues', ax=ax)
        title = matrix.set_title(df["Biomarker"].iloc[i], fontsize=10)
        matrix.set_xlabel('\nPredicted Category')
        matrix.set_ylabel('Actual Category ')
        ## Tick labels - same sorted order as the matrix rows/columns
        matrix.xaxis.set_ticklabels(classes)
        matrix.yaxis.set_ticklabels(classes)
        title.set_y(1.2)

    # Lay out and save once, after all panels are drawn, instead of rewriting
    # the image file on every iteration. Nothing is saved for an empty frame.
    if n_panels:
        fig.subplots_adjust(right =1.6, top=1.2)
        fig.tight_layout()
        fig.savefig('Confusion_matrix.jpg')

def boxplots_fun(bm_list):
    """Draw one box plot per biomarker, grouped by diagnosis, on a 2x5 grid.

    For every column name in ``bm_list`` the values in the notebook-level
    DataFrame ``df`` are plotted against "Diagnosis_simplified", per-group
    sample counts are written onto the axes, and pairwise Mann-Whitney
    annotations are added for the pairs HB/HCC, HB/NOS and HCC/NOS.

    Parameters
    ----------
    bm_list : iterable of str
        Column names of ``df`` to plot. The grid has ten positions, so only the
        first ten entries are drawn (``zip`` stops at the shorter sequence).

    Notes
    -----
    Reads the module-level ``df``; nothing is returned and the figure is not saved.
    """
    fig, axes = plt.subplots(2, 5, figsize=(20, 10))
    # (row, column) position of each panel in the grid, in row-major order.
    lista_prueba = [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]]
    for i,num in zip(bm_list, lista_prueba):
        # Complete cases for this biomarker: diagnosis plus the biomarker column.
        a = df[["Diagnosis_simplified", i]]
        a = a.dropna()
        # Build a "Diagnosis: ..." summary string from the per-class counts.
        # It is currently unused because the set_xlabel call below is commented out.
        values = a["Diagnosis_simplified"].value_counts().to_string()
        values = values.replace("     ", ':') 
        values =  values.replace("\n", " " )
        label = "Diagnosis:  " +str(values)
        # Diagnosis labels of the complete cases, sorted by value.
        x_ordered = a["Diagnosis_simplified"].sort_values()
        # NOTE(review): x is a Series taken from `a` while y is looked up in `df`;
        # how seaborn aligns the two depends on its version — verify.
        ax = sns.boxplot(x=x_ordered,  y=i,  data=df, ax=axes[num[0], num[1]])
        # Disabled: would show the count summary built above as the x-axis label.
        #ax.set_xlabel(label)
        # Despite the name, `medians` holds the per-class sample counts.
        medians = a["Diagnosis_simplified"].value_counts()
        for xtick in ax.get_xticks():
            # Writes the count as text at x = tick position, y = the count itself.
            # NOTE(review): `medians[xtick]` looks the count up with the integer
            # tick value, while value_counts() is indexed by diagnosis label and
            # ordered by frequency — confirm the counts land on the intended boxes.
            ax.text(xtick,medians[xtick],medians[xtick], 
                    horizontalalignment='center', verticalalignment="top", size='medium', in_layout=True, color='black',weight='semibold')
        # Pairwise Mann-Whitney tests between the three diagnosis groups, drawn as stars.
        add_stat_annotation(ax, data=df, x = "Diagnosis_simplified", y=i,
                            box_pairs=[("HB", "HCC"), ("HB", "NOS"),("HCC", "NOS")],
                            test='Mann-Whitney', text_format='star', loc='outside', verbose=2)

    plt.tight_layout()

### Top biomarkers. Feature ranking based on logistic regression

In [None]:
# Number of best single biomarkers carried forward into the combination search.
# The search below evaluates every non-empty subset, i.e. 2**N - 1 models.
N_TOP_BIOMARKERS = 10

df_subsets = pd.DataFrame()
features_list = []
list_of_dicts = []
dictionary = {}

# Rank each candidate biomarker on its own by the leave-one-out weighted F1 of
# a one-feature model separating "HB" from everything else.
# NOTE(review): `df_top_LFC` is not defined in any visible cell; every column
# except its last one is treated as a candidate — confirm where it is created.
for var in tqdm(df_top_LFC.columns.tolist()[:-1]):
    # Complete cases for this biomarker only.
    a = df[["Diagnosis_simplified", var]].dropna()
    # X and y stay notebook-level names because other cells may read them.
    X = a.drop('Diagnosis_simplified', axis=1).to_numpy()
    # Collapse NOS and HCC into a single "Other" class. The relabelling is done
    # in pandas so that no array returned by to_numpy() (possibly a read-only
    # view of the frame) is mutated in place.
    y = a['Diagnosis_simplified'].replace({"NOS": "Other", "HCC": "Other"}).to_numpy()
    features_list.append([var, Logistic_Regression_fun(X, y)[0]])

df_top_bk = pd.DataFrame(features_list, columns=["Biomarker", "F1_score"])
top_bk = df_top_bk.sort_values(["F1_score"], ascending= False).head(N_TOP_BIOMARKERS)
top_bk = top_bk["Biomarker"].tolist()

# All non-empty subsets of the top biomarkers.
top_bk_combinations = combinations_fun(top_bk)


# Evaluate every biomarker combination with leave-one-out logistic regression
# and collect one result record per combination.
for x in tqdm(top_bk_combinations):
    # Ensure the label column is selected exactly once, as the last column
    # (the combination list itself is updated, as before).
    if "Diagnosis_simplified" in x:
        x.remove("Diagnosis_simplified")
    x.append("Diagnosis_simplified")

    # Complete cases across all biomarkers of this combination.
    a = df[x].dropna()
    if a.shape[0] == 0:
        # No usable sample: skip instead of recording a result computed from
        # the previous iteration's leftover X, y, N and values.
        continue

    # Per-diagnosis sample counts (before the classes are merged), formatted
    # as "label:count" pairs separated by spaces.
    diagnosis_counts = a["Diagnosis_simplified"].value_counts()
    values = " ".join(f"{label}:{count}" for label, count in diagnosis_counts.items())
    N = len(a)

    feature_frame = a.drop('Diagnosis_simplified', axis=1)
    name = feature_frame.columns.tolist()
    # Report the anonymised aliases rather than the original column names.
    name_bsc = [dict_genes[gene] for gene in name]

    # X and y stay notebook-level names because other cells may read them.
    X = feature_frame.to_numpy()
    # Collapse NOS and HCC into a single "Other" class without mutating an
    # array returned by to_numpy() in place.
    y = a['Diagnosis_simplified'].replace({"NOS": "Other", "HCC": "Other"}).to_numpy()

    # Run the cross-validation once and unpack all five results (it used to be
    # re-run separately for every field of the record).
    cv_f1, cv_precision, cv_recall, cv_true, cv_pred = Logistic_Regression_fun(X, y)
    dictionary = {'Biomarker': name_bsc, 'f1_score': cv_f1,
                  'Precision_score': cv_precision,
                  'Recall_score': cv_recall,
                  "N": N, "Diagnosis": values, "y_true": cv_true,
                  "y_pred": cv_pred}
    list_of_dicts.append(dictionary)


# Keep the ten best combinations by weighted F1 and show their summary columns.
df_subsets = pd.DataFrame(list_of_dicts)
df_subsets = df_subsets.sort_values(["f1_score"], ascending= False).head(10)
df_subsets[["Biomarker", "f1_score",'Precision_score', 'Recall_score',  "N", "Diagnosis"]]



### Confusion Matrix

In [None]:
# Renumber the rows 0..n-1 (discarding the old index) and plot the confusion
# matrices of the selected biomarker combinations.
df_subsets = df_subsets.reset_index(drop=True)
confusion_matrix_fun(df_subsets)