In [1]:
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler

np.random.seed(42)
random_state = 42

scaler = StandardScaler()

# Load dataset
data_ssgsea = pd.read_csv('ssgsea_10.csv')
X_ssgsea = data_ssgsea.iloc[:, 1:]
y_ssgsea = data_ssgsea.iloc[:, 0]

X_train_ssgsea_raw, X_test_ssgsea_raw, y_train_ssgsea, y_test_ssgsea = train_test_split(
    X_ssgsea, y_ssgsea, test_size=0.2, random_state=random_state
)

# Standardize the data
X_train_ssgsea = scaler.fit_transform(X_train_ssgsea_raw)
X_test_ssgsea = scaler.transform(X_test_ssgsea_raw)



  from pandas.core import (


In [2]:
scaler = StandardScaler()

data_ppi = pd.read_csv('ppi_10.csv')
X_ppi = data_ppi.iloc[:, 1:]
y_ppi = data_ppi.iloc[:, 0]

X_train_ppi_raw, X_test_ppi_raw, y_train_ppi, y_test_ppi = train_test_split(X_ppi, y_ppi, test_size=0.2, random_state = random_state)

X_train_ppi = scaler.fit_transform(X_train_ppi_raw)
X_test_ppi = scaler.transform(X_test_ppi_raw)



In [3]:
scaler = StandardScaler()

data_wgcna = pd.read_csv('wgcna_10.csv')
X_wgcna = data_wgcna.iloc[:, 1:]
y_wgcna = data_wgcna.iloc[:, 0]

X_train_wgcna_raw, X_test_wgcna_raw, y_train_wgcna, y_test_wgcna = train_test_split(X_wgcna, y_wgcna, test_size=0.2, random_state = random_state)

X_train_wgcna = scaler.fit_transform(X_train_wgcna_raw)
X_test_wgcna = scaler.transform(X_test_wgcna_raw)



In [4]:
import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from itertools import product

# Define models for each domain
ssgsea_models = ['logistic', 'gnb']
ppi_models = ['rf', 'svm']
wgcna_models = ['rf', 'svm']

# Model path template
model_path_template = "./basemodel/{domain}/{model}_{domain}.joblib"

# Load all models
models = {}

# Dynamically load models from all domains
for domain in ['ssgsea', 'ppi', 'wgcna']:
    for model_name in locals()[f'{domain}_models']:  # Get model list for each domain by variable name
        model_path = model_path_template.format(domain=domain, model=model_name)
        models[f'{model_name}_{domain}'] = joblib.load(model_path)

# Generate all combinations of models across domains
all_combinations = product(ssgsea_models, ppi_models, wgcna_models)

best_auc = 0
best_model = None
best_combination = None

# Iterate through each model combination
for combination in all_combinations:
    # Store predicted probabilities for each domain
    predictions_train = []
    predictions_test = []

    for domain, model_name in zip(['ssgsea', 'ppi', 'wgcna'], combination):
        model_key = f'{model_name}_{domain}'  # Combine domain and model name dynamically
        model = models[model_key]

        # Train the model and obtain predicted probabilities
        if domain == 'ssgsea':
            model.fit(X_train_ssgsea, y_train_ssgsea.squeeze())
            pred_train_ssgsea = model.predict_proba(X_train_ssgsea)[:, 1]
            pred_test_ssgsea = model.predict_proba(X_test_ssgsea)[:, 1]
        elif domain == 'ppi':
            model.fit(X_train_ppi, y_train_ppi.squeeze())
            pred_train_ppi = model.predict_proba(X_train_ppi)[:, 1]
            pred_test_ppi = model.predict_proba(X_test_ppi)[:, 1]
        elif domain == 'wgcna':
            model.fit(X_train_wgcna, y_train_wgcna.squeeze())
            pred_train_wgcna = model.predict_proba(X_train_wgcna)[:, 1]
            pred_test_wgcna = model.predict_proba(X_test_wgcna)[:, 1]

        # Append predictions from each domain
        predictions_train.append(locals()[f'pred_train_{domain}'])
        predictions_test.append(locals()[f'pred_test_{domain}'])

    # Combine predicted probabilities as new features
    X_train_combined = np.column_stack(predictions_train)
    X_test_combined = np.column_stack(predictions_test)

    # Train a logistic regression meta-model
    lr_model = LogisticRegression()
    lr_model.fit(X_train_combined, y_train_ssgsea.squeeze())  # Use ssgsea labels as ground truth

    # Predict using the trained meta-model
    y_pred_meta = lr_model.predict_proba(X_train_combined)[:, 1]

    # Compute AUC score
    auc_score = roc_auc_score(y_train_ssgsea.squeeze(), y_pred_meta)

    print(f"AUC score for combination {combination}: {auc_score}")

    # Update the best model
    if auc_score > best_auc:
        best_auc = auc_score
        best_model = lr_model
        best_combination = combination

# Save the best model
joblib.dump(best_model, 'best_model.joblib')

# Output the best combination and AUC score
print(f"The best model combination is: {best_combination}")
print(f"The best AUC score is: {best_auc}")


AUC score for combination ('logistic', 'rf', 'rf'): 0.984051724137931
AUC score for combination ('logistic', 'rf', 'svm'): 0.9616379310344827
AUC score for combination ('logistic', 'svm', 'rf'): 0.9599137931034483
AUC score for combination ('logistic', 'svm', 'svm'): 0.840948275862069
AUC score for combination ('gnb', 'rf', 'rf'): 0.9806034482758621
AUC score for combination ('gnb', 'rf', 'svm'): 0.9549568965517241
AUC score for combination ('gnb', 'svm', 'rf'): 0.9476293103448277
AUC score for combination ('gnb', 'svm', 'svm'): 0.8310344827586207
The best model combination is: ('logistic', 'rf', 'rf')
The best AUC score is: 0.984051724137931
