Random Forest

In [3]:
import os
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef, roc_auc_score, average_precision_score

# Hyper-parameter grid for the Random Forest sweep: every combination of the
# three lists below is trained and evaluated.
nums_estimators = [50]
criteria = ['gini']
methods_max_features = ['sqrt']

# Input locations: class labels, (biotech, small-molecule) index pairs, and
# the two per-drug feature matrices.
data_dir = '../Data/'
labels_file_path = f'{data_dir}31_Y_ratio1.txt'
combinations_file_path = f'{data_dir}31_XIndex_ratio1.txt'
small_molecule_matrix_file = f'{data_dir}total_small_drugs.txt'
biotech_matrix_file = f'{data_dir}total_biotech_drugs.txt'

# Load labels
def read_label_file(file_path):
    """Read one integer class label per line from a text file.

    Blank or whitespace-only lines (for example a stray trailing newline) are
    skipped instead of crashing on ``int('')``.

    Args:
        file_path: Path of the label file to read.

    Returns:
        A 1-D integer NumPy array with the labels in file order.

    Raises:
        ValueError: If a non-blank line is not a valid integer.
    """
    with open(file_path, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        # dtype=int keeps an empty file from producing a float array.
        return np.array([int(text) for text in stripped_lines if text], dtype=int)

# Load combinations
def read_combinations_file(file_path):
    """Read (biotech index, small-molecule index) pairs from a text file.

    Every non-blank line must hold exactly two whitespace-separated integers.
    Blank lines are skipped rather than failing during tuple unpacking.

    Args:
        file_path: Path of the index-pair file to read.

    Returns:
        A list of ``(biotech_idx, small_idx)`` tuples in file order.

    Raises:
        ValueError: If a non-blank line does not contain exactly two integers.
    """
    combinations = []
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                continue
            biotech_idx, small_idx = map(int, fields)
            combinations.append((biotech_idx, small_idx))
    return combinations

labels = read_label_file(labels_file_path)
combinations = read_combinations_file(combinations_file_path)

# Load the per-drug feature matrices. ndmin=2 keeps a single-row or
# single-column file two-dimensional so row indexing below stays valid.
small_molecule_total_matrix = np.loadtxt(small_molecule_matrix_file, dtype=float, ndmin=2)
biotech_total_matrix = np.loadtxt(biotech_matrix_file, dtype=float, ndmin=2)

# Build one feature row per pair: small-molecule features followed by biotech
# features. A single fancy-indexing gather replaces the per-pair Python loop.
# Column 0 of pair_indices is the biotech index, column 1 the small-molecule one.
pair_indices = np.array(combinations, dtype=int).reshape(-1, 2)
features = np.concatenate(
    (
        small_molecule_total_matrix[pair_indices[:, 1]],
        biotech_total_matrix[pair_indices[:, 0]],
    ),
    axis=1,
)

# Validate dimensions with an explicit exception; a bare assert would be
# stripped when Python runs with -O.
if features.shape[0] != len(labels):
    raise ValueError("Mismatch between number of features and labels")

results_directory = '../RF_results'
RF_results = results_directory + '/RF'
# makedirs creates results_directory as a parent, so one call is enough.
os.makedirs(RF_results, exist_ok=True)

# Prepare multi-class task
targets = labels
num_classes = 32  # Explicitly setting the number of classes

num_folds = 10
# A fixed seed makes the shuffled folds reproducible from run to run.
kfolder = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
folds = list(kfolder.split(features, targets))

# Define compute_metrics function
def compute_metrics(y_true, y_pred, y_probs, num_classes):
    """Compute micro-, macro- and weighted-averaged classification metrics.

    Per-class precision/recall/F1 and the class-frequency weights are always
    evaluated over the full label set ``0 .. num_classes - 1``. This keeps them
    the same length as the per-class AUPR list even when a class is missing
    from this split, so the weighted sums cannot fail on mismatched shapes.

    Args:
        y_true: 1-D array of non-negative integer class labels.
        y_pred: 1-D array of predicted class labels.
        y_probs: 2-D array of class scores; column ``i`` is used as the score
            for class ``i``.
        num_classes: Number of classes (columns of ``y_probs``).

    Returns:
        A dict with keys "Micro", "Macro" and "Weighted", each mapping a
        metric name to its value. The AUC entries are NaN when
        ``roc_auc_score`` raises ``ValueError``.
    """
    y_true = np.asarray(y_true)
    y_probs = np.asarray(y_probs)
    class_labels = np.arange(num_classes)
    metrics = {"Micro": {}, "Macro": {}, "Weighted": {}}

    # MCC has no averaging variants, so the same value is reported under
    # every scheme; compute it once.
    mcc = matthews_corrcoef(y_true, y_pred)

    # Micro metrics
    metrics["Micro"]["Accuracy"] = accuracy_score(y_true, y_pred)
    metrics["Micro"]["Precision"], metrics["Micro"]["Recall"], metrics["Micro"]["F1"], _ = \
        precision_recall_fscore_support(y_true, y_pred, average='micro')
    metrics["Micro"]["MCC"] = mcc

    # Handle potential ValueError in AUC computation
    try:
        metrics["Micro"]["AUC"] = roc_auc_score(y_true, y_probs, average="micro", multi_class="ovr")
        metrics["Macro"]["AUC"] = roc_auc_score(y_true, y_probs, average="macro", multi_class="ovr")
        metrics["Weighted"]["AUC"] = roc_auc_score(y_true, y_probs, average="weighted", multi_class="ovr")
    except ValueError:
        metrics["Micro"]["AUC"] = np.nan
        metrics["Macro"]["AUC"] = np.nan
        metrics["Weighted"]["AUC"] = np.nan

    metrics["Micro"]["AUPR"] = average_precision_score(y_true, y_probs, average="micro")

    # Per-class scores; labels= forces one entry per class, with
    # zero_division=0 filling in classes that never occur in this split.
    precision_per_class, recall_per_class, f1_per_class, _ = \
        precision_recall_fscore_support(
            y_true, y_pred, labels=class_labels, average=None, zero_division=0
        )
    # One-vs-rest AUPR: class i against everything else.
    aupr_per_class = [
        average_precision_score((y_true == i).astype(int), y_probs[:, i])
        for i in class_labels
    ]

    # Macro metrics
    metrics["Macro"]["Precision"] = np.mean(precision_per_class)
    metrics["Macro"]["Recall"] = np.mean(recall_per_class)
    metrics["Macro"]["F1"] = np.mean(f1_per_class)
    metrics["Macro"]["MCC"] = mcc
    metrics["Macro"]["AUPR"] = np.mean(aupr_per_class)

    # Weighted metrics; minlength pads absent trailing classes with weight 0.
    class_weights = np.bincount(y_true, minlength=num_classes) / len(y_true)
    metrics["Weighted"]["Precision"] = np.sum(precision_per_class * class_weights)
    metrics["Weighted"]["Recall"] = np.sum(recall_per_class * class_weights)
    metrics["Weighted"]["F1"] = np.sum(f1_per_class * class_weights)
    metrics["Weighted"]["MCC"] = mcc
    metrics["Weighted"]["AUPR"] = np.average(aupr_per_class, weights=class_weights)

    return metrics

# Training and Evaluation
# Averaging schemes reported by compute_metrics, in output order.
averaging_schemes = ("Micro", "Macro", "Weighted")

# Cross-validate every Random Forest configuration in the grid, writing
# per-fold metrics and a mean ± std summary for each configuration.
for num_estimators in nums_estimators:
    for criterion in criteria:
        for method in methods_max_features:
            directory = f'{RF_results}/{num_estimators} - {criterion} - {method}'
            os.makedirs(directory, exist_ok=True)

            all_metrics = {fmt: [] for fmt in averaging_schemes}

            for fold_num, (train_indices, test_indices) in enumerate(folds, start=1):
                fold_directory = f'{directory}/fold_{fold_num}'
                os.makedirs(fold_directory, exist_ok=True)
                print(f'Processing Fold {fold_num}/{num_folds} ({(fold_num/num_folds)*100:.1f}%)')

                X_train, y_train = features[train_indices], targets[train_indices]
                X_test, y_test = features[test_indices], targets[test_indices]

                classifier = RandomForestClassifier(
                    n_estimators=num_estimators,
                    criterion=criterion,
                    max_features=method,
                    class_weight="balanced",
                    random_state=42,  # fixed seed so reruns give the same forest
                )
                classifier.fit(X_train, y_train)
                y_hat = classifier.predict(X_test)
                class_probs = classifier.predict_proba(X_test)

                metrics = compute_metrics(y_test, y_hat, class_probs, num_classes)
                for fmt in averaging_schemes:
                    with open(f'{fold_directory}/metrics_{fmt}.txt', 'w') as fold_log_file:
                        fold_log_file.write(f"\n{fmt} Metrics for Fold {fold_num}:\n")
                        for metric, value in metrics[fmt].items():
                            fold_log_file.write(f"  {metric}: {value:.4f}\n")
                    all_metrics[fmt].append(metrics[fmt])

            # Aggregate each metric across folds (mean and population std).
            avg_metrics, std_metrics = {}, {}
            for fmt in averaging_schemes:
                per_fold = all_metrics[fmt]
                avg_metrics[fmt] = {
                    metric: np.mean([fold_metrics[metric] for fold_metrics in per_fold])
                    for metric in per_fold[0]
                }
                std_metrics[fmt] = {
                    metric: np.std([fold_metrics[metric] for fold_metrics in per_fold])
                    for metric in per_fold[0]
                }

            # The summary contains the non-ASCII "±", so pin the encoding
            # instead of depending on the platform default.
            with open(directory + '/average_metrics.txt', 'w', encoding='utf-8') as avg_file:
                # Report the real fold count rather than a hard-coded 10.
                avg_file.write(f"\nAverage Metrics Across {num_folds} Folds:\n")
                for fmt in averaging_schemes:
                    avg_file.write(f"\n{fmt} Metrics:\n")
                    for metric, value in avg_metrics[fmt].items():
                        avg_file.write(f"  {metric}: {value:.4f} ± {std_metrics[fmt][metric]:.4f}\n")




Processing Fold 1/10 (10.0%)
Processing Fold 2/10 (20.0%)
Processing Fold 3/10 (30.0%)
Processing Fold 4/10 (40.0%)
Processing Fold 5/10 (50.0%)
Processing Fold 6/10 (60.0%)
Processing Fold 7/10 (70.0%)
Processing Fold 8/10 (80.0%)
Processing Fold 9/10 (90.0%)
Processing Fold 10/10 (100.0%)


XGBoost

In [None]:
import os
import csv
import time
import torch
import itertools
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef, roc_auc_score, average_precision_score, balanced_accuracy_score, confusion_matrix

# Hyper-parameter grid for the XGBoost sweep: every combination of the five
# lists below is trained and evaluated.
n_estimators_list = [100]
max_depths = [3]
learning_rates = [0.1]
subsampling_rates = [0.5]
colsample_bytrees = [0.5]

# Input locations: class labels, (biotech, small-molecule) index pairs, and
# the two per-drug feature matrices.
data_dir = '../Data/'
labels_file_path = f'{data_dir}31_Y_ratio1.txt'
combinations_file_path = f'{data_dir}31_XIndex_ratio1.txt'
small_molecule_matrix_file = f'{data_dir}total_small_drugs.txt'
biotech_matrix_file = f'{data_dir}total_biotech_drugs.txt'


# Load labels
def read_label_file(file_path):
    """Read one integer class label per line from a text file.

    Blank or whitespace-only lines (for example a stray trailing newline) are
    skipped instead of crashing on ``int('')``.

    Args:
        file_path: Path of the label file to read.

    Returns:
        A 1-D integer NumPy array with the labels in file order.

    Raises:
        ValueError: If a non-blank line is not a valid integer.
    """
    with open(file_path, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        # dtype=int keeps an empty file from producing a float array.
        return np.array([int(text) for text in stripped_lines if text], dtype=int)

# Load combinations
def read_combinations_file(file_path):
    """Read (biotech index, small-molecule index) pairs from a text file.

    Every non-blank line must hold exactly two whitespace-separated integers.
    Blank lines are skipped rather than failing during tuple unpacking.

    Args:
        file_path: Path of the index-pair file to read.

    Returns:
        A list of ``(biotech_idx, small_idx)`` tuples in file order.

    Raises:
        ValueError: If a non-blank line does not contain exactly two integers.
    """
    combinations = []
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                continue
            biotech_idx, small_idx = map(int, fields)
            combinations.append((biotech_idx, small_idx))
    return combinations

labels = read_label_file(labels_file_path)
combinations = read_combinations_file(combinations_file_path)

# Load the per-drug feature matrices. ndmin=2 keeps a single-row or
# single-column file two-dimensional so row indexing below stays valid.
small_molecule_total_matrix = np.loadtxt(small_molecule_matrix_file, dtype=float, ndmin=2)
biotech_total_matrix = np.loadtxt(biotech_matrix_file, dtype=float, ndmin=2)

# Build one feature row per pair: small-molecule features followed by biotech
# features. A single fancy-indexing gather replaces the per-pair Python loop.
# Column 0 of pair_indices is the biotech index, column 1 the small-molecule one.
pair_indices = np.array(combinations, dtype=int).reshape(-1, 2)
features = np.concatenate(
    (
        small_molecule_total_matrix[pair_indices[:, 1]],
        biotech_total_matrix[pair_indices[:, 0]],
    ),
    axis=1,
)

# Validate dimensions with an explicit exception; a bare assert would be
# stripped when Python runs with -O.
if features.shape[0] != len(labels):
    raise ValueError("Mismatch between number of features and labels")

results_directory = '../XGB_results'
XGB_results = results_directory + '/XGB'
# makedirs creates results_directory as a parent, so one call is enough.
os.makedirs(XGB_results, exist_ok=True)

# Prepare multi-class task
targets = labels
num_classes = 32  # Explicitly setting the number of classes

num_folds = 10
# A fixed seed makes the shuffled folds reproducible from run to run.
kfolder = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
folds = list(kfolder.split(features, targets))


# Define compute_metrics function
def compute_metrics(y_true, y_pred, y_probs, num_classes):
    """Compute micro-, macro- and weighted-averaged classification metrics.

    Per-class precision/recall/F1 and the class-frequency weights are always
    evaluated over the full label set ``0 .. num_classes - 1``. This keeps them
    the same length as the per-class AUPR list even when a class is missing
    from this split, so the weighted sums cannot fail on mismatched shapes.

    Args:
        y_true: 1-D array of non-negative integer class labels.
        y_pred: 1-D array of predicted class labels.
        y_probs: 2-D array of class scores; column ``i`` is used as the score
            for class ``i``.
        num_classes: Number of classes (columns of ``y_probs``).

    Returns:
        A dict with keys "Micro", "Macro" and "Weighted", each mapping a
        metric name to its value. The AUC entries are NaN when
        ``roc_auc_score`` raises ``ValueError``.
    """
    y_true = np.asarray(y_true)
    y_probs = np.asarray(y_probs)
    class_labels = np.arange(num_classes)
    metrics = {"Micro": {}, "Macro": {}, "Weighted": {}}

    # MCC has no averaging variants, so the same value is reported under
    # every scheme; compute it once.
    mcc = matthews_corrcoef(y_true, y_pred)

    # Micro metrics
    metrics["Micro"]["Accuracy"] = accuracy_score(y_true, y_pred)
    metrics["Micro"]["Precision"], metrics["Micro"]["Recall"], metrics["Micro"]["F1"], _ = \
        precision_recall_fscore_support(y_true, y_pred, average='micro')
    metrics["Micro"]["MCC"] = mcc

    # Handle potential ValueError in AUC computation
    try:
        metrics["Micro"]["AUC"] = roc_auc_score(y_true, y_probs, average="micro", multi_class="ovr")
        metrics["Macro"]["AUC"] = roc_auc_score(y_true, y_probs, average="macro", multi_class="ovr")
        metrics["Weighted"]["AUC"] = roc_auc_score(y_true, y_probs, average="weighted", multi_class="ovr")
    except ValueError:
        metrics["Micro"]["AUC"] = np.nan
        metrics["Macro"]["AUC"] = np.nan
        metrics["Weighted"]["AUC"] = np.nan

    metrics["Micro"]["AUPR"] = average_precision_score(y_true, y_probs, average="micro")

    # Per-class scores; labels= forces one entry per class, with
    # zero_division=0 filling in classes that never occur in this split.
    precision_per_class, recall_per_class, f1_per_class, _ = \
        precision_recall_fscore_support(
            y_true, y_pred, labels=class_labels, average=None, zero_division=0
        )
    # One-vs-rest AUPR: class i against everything else.
    aupr_per_class = [
        average_precision_score((y_true == i).astype(int), y_probs[:, i])
        for i in class_labels
    ]

    # Macro metrics
    metrics["Macro"]["Precision"] = np.mean(precision_per_class)
    metrics["Macro"]["Recall"] = np.mean(recall_per_class)
    metrics["Macro"]["F1"] = np.mean(f1_per_class)
    metrics["Macro"]["MCC"] = mcc
    metrics["Macro"]["AUPR"] = np.mean(aupr_per_class)

    # Weighted metrics; minlength pads absent trailing classes with weight 0.
    class_weights = np.bincount(y_true, minlength=num_classes) / len(y_true)
    metrics["Weighted"]["Precision"] = np.sum(precision_per_class * class_weights)
    metrics["Weighted"]["Recall"] = np.sum(recall_per_class * class_weights)
    metrics["Weighted"]["F1"] = np.sum(f1_per_class * class_weights)
    metrics["Weighted"]["MCC"] = mcc
    metrics["Weighted"]["AUPR"] = np.average(aupr_per_class, weights=class_weights)

    return metrics



# Averaging schemes reported by compute_metrics, in output order.
averaging_schemes = ("Micro", "Macro", "Weighted")

# itertools.product visits the grid in the same order as five nested loops
# (last list varies fastest) without the deep indentation.
hyperparameter_grid = itertools.product(
    n_estimators_list, max_depths, learning_rates, subsampling_rates, colsample_bytrees
)

# Cross-validate every XGBoost configuration in the grid, writing per-fold
# metrics and a mean ± std summary for each configuration.
for n_estimators, max_depth, learning_rate, subsample, colsample_bytree in hyperparameter_grid:
    config = f"n{n_estimators}_d{max_depth}_lr{learning_rate}_sub{subsample}_col{colsample_bytree}"
    directory = XGB_results + '/' + config
    os.makedirs(directory, exist_ok=True)

    all_metrics = {fmt: [] for fmt in averaging_schemes}

    for fold_num, (train_indices, test_indices) in enumerate(folds, start=1):
        fold_directory = f'{directory}/fold_{fold_num}'
        os.makedirs(fold_directory, exist_ok=True)
        print(f'Processing Fold {fold_num}/{num_folds} ({(fold_num/num_folds)*100:.1f}%)')

        X_train, y_train = features[train_indices], targets[train_indices]
        X_test, y_test = features[test_indices], targets[test_indices]

        classifier = xgb.XGBClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            use_label_encoder=False,
            eval_metric="mlogloss",
            objective="multi:softprob",  # Ensures correct probability outputs for multi-class
            num_class=num_classes,
            random_state=42
        )
        classifier.fit(X_train, y_train)
        y_hat = classifier.predict(X_test)
        class_probs = classifier.predict_proba(X_test)

        metrics = compute_metrics(y_test, y_hat, class_probs, num_classes)
        for fmt in averaging_schemes:
            with open(f'{fold_directory}/metrics_{fmt}.txt', 'w') as fold_log_file:
                fold_log_file.write(f"\n{fmt} Metrics for Fold {fold_num}:\n")
                for metric, value in metrics[fmt].items():
                    fold_log_file.write(f"  {metric}: {value:.4f}\n")
            all_metrics[fmt].append(metrics[fmt])

    # Aggregate each metric across folds (mean and population std).
    avg_metrics, std_metrics = {}, {}
    for fmt in averaging_schemes:
        per_fold = all_metrics[fmt]
        avg_metrics[fmt] = {
            metric: np.mean([fold_metrics[metric] for fold_metrics in per_fold])
            for metric in per_fold[0]
        }
        std_metrics[fmt] = {
            metric: np.std([fold_metrics[metric] for fold_metrics in per_fold])
            for metric in per_fold[0]
        }

    # The summary contains the non-ASCII "±", so pin the encoding instead of
    # depending on the platform default.
    with open(directory + '/average_metrics.txt', 'w', encoding='utf-8') as avg_file:
        # Report the real fold count rather than a hard-coded 10.
        avg_file.write(f"\nAverage Metrics Across {num_folds} Folds:\n")
        for fmt in averaging_schemes:
            avg_file.write(f"\n{fmt} Metrics:\n")
            for metric, value in avg_metrics[fmt].items():
                avg_file.write(f"  {metric}: {value:.4f} ± {std_metrics[fmt][metric]:.4f}\n")


Processing Fold 1/10 (10.0%)


SVM

In [None]:
import os
import csv
import time
import torch
import itertools
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef, roc_auc_score, average_precision_score, balanced_accuracy_score, confusion_matrix

# Hyper-parameter grid for the SVM sweep: every (kernel, C) combination of the
# two lists below is trained and evaluated.
kernels = ['linear']
Cs = [10]

# Input locations: class labels, (biotech, small-molecule) index pairs, and
# the two per-drug feature matrices.
data_dir = '../Data/'
labels_file_path = f'{data_dir}31_Y_ratio1.txt'
combinations_file_path = f'{data_dir}31_XIndex_ratio1.txt'
small_molecule_matrix_file = f'{data_dir}total_small_drugs.txt'
biotech_matrix_file = f'{data_dir}total_biotech_drugs.txt'
# Load labels
def read_label_file(file_path):
    """Read one integer class label per line from a text file.

    Blank or whitespace-only lines (for example a stray trailing newline) are
    skipped instead of crashing on ``int('')``.

    Args:
        file_path: Path of the label file to read.

    Returns:
        A 1-D integer NumPy array with the labels in file order.

    Raises:
        ValueError: If a non-blank line is not a valid integer.
    """
    with open(file_path, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        # dtype=int keeps an empty file from producing a float array.
        return np.array([int(text) for text in stripped_lines if text], dtype=int)

# Load combinations
def read_combinations_file(file_path):
    """Read (biotech index, small-molecule index) pairs from a text file.

    Every non-blank line must hold exactly two whitespace-separated integers.
    Blank lines are skipped rather than failing during tuple unpacking.

    Args:
        file_path: Path of the index-pair file to read.

    Returns:
        A list of ``(biotech_idx, small_idx)`` tuples in file order.

    Raises:
        ValueError: If a non-blank line does not contain exactly two integers.
    """
    combinations = []
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                continue
            biotech_idx, small_idx = map(int, fields)
            combinations.append((biotech_idx, small_idx))
    return combinations

labels = read_label_file(labels_file_path)
combinations = read_combinations_file(combinations_file_path)

# Load the per-drug feature matrices. ndmin=2 keeps a single-row or
# single-column file two-dimensional so row indexing below stays valid.
small_molecule_total_matrix = np.loadtxt(small_molecule_matrix_file, dtype=float, ndmin=2)
biotech_total_matrix = np.loadtxt(biotech_matrix_file, dtype=float, ndmin=2)

# Build one feature row per pair: small-molecule features followed by biotech
# features. A single fancy-indexing gather replaces the per-pair Python loop.
# Column 0 of pair_indices is the biotech index, column 1 the small-molecule one.
pair_indices = np.array(combinations, dtype=int).reshape(-1, 2)
features = np.concatenate(
    (
        small_molecule_total_matrix[pair_indices[:, 1]],
        biotech_total_matrix[pair_indices[:, 0]],
    ),
    axis=1,
)

# Validate dimensions with an explicit exception; a bare assert would be
# stripped when Python runs with -O.
if features.shape[0] != len(labels):
    raise ValueError("Mismatch between number of features and labels")

results_directory = '../SVM_results'
SVM_results = results_directory + '/SVM'
# makedirs creates results_directory as a parent, so one call is enough.
os.makedirs(SVM_results, exist_ok=True)

# Prepare multi-class task
targets = labels
num_classes = 32  # Explicitly setting the number of classes

num_folds = 10
# A fixed seed makes the shuffled folds reproducible from run to run.
kfolder = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
folds = list(kfolder.split(features, targets))

# Define compute_metrics function
def compute_metrics(y_true, y_pred, y_probs, num_classes):
    """Compute micro-, macro- and weighted-averaged classification metrics.

    Per-class precision/recall/F1 and the class-frequency weights are always
    evaluated over the full label set ``0 .. num_classes - 1``. This keeps them
    the same length as the per-class AUPR list even when a class is missing
    from this split, so the weighted sums cannot fail on mismatched shapes.

    Args:
        y_true: 1-D array of non-negative integer class labels.
        y_pred: 1-D array of predicted class labels.
        y_probs: 2-D array of class scores; column ``i`` is used as the score
            for class ``i``.
        num_classes: Number of classes (columns of ``y_probs``).

    Returns:
        A dict with keys "Micro", "Macro" and "Weighted", each mapping a
        metric name to its value. The AUC entries are NaN when
        ``roc_auc_score`` raises ``ValueError``.
    """
    y_true = np.asarray(y_true)
    y_probs = np.asarray(y_probs)
    class_labels = np.arange(num_classes)
    metrics = {"Micro": {}, "Macro": {}, "Weighted": {}}

    # MCC has no averaging variants, so the same value is reported under
    # every scheme; compute it once.
    mcc = matthews_corrcoef(y_true, y_pred)

    # Micro metrics
    metrics["Micro"]["Accuracy"] = accuracy_score(y_true, y_pred)
    metrics["Micro"]["Precision"], metrics["Micro"]["Recall"], metrics["Micro"]["F1"], _ = \
        precision_recall_fscore_support(y_true, y_pred, average='micro')
    metrics["Micro"]["MCC"] = mcc

    # Handle potential ValueError in AUC computation
    try:
        metrics["Micro"]["AUC"] = roc_auc_score(y_true, y_probs, average="micro", multi_class="ovr")
        metrics["Macro"]["AUC"] = roc_auc_score(y_true, y_probs, average="macro", multi_class="ovr")
        metrics["Weighted"]["AUC"] = roc_auc_score(y_true, y_probs, average="weighted", multi_class="ovr")
    except ValueError:
        metrics["Micro"]["AUC"] = np.nan
        metrics["Macro"]["AUC"] = np.nan
        metrics["Weighted"]["AUC"] = np.nan

    metrics["Micro"]["AUPR"] = average_precision_score(y_true, y_probs, average="micro")

    # Per-class scores; labels= forces one entry per class, with
    # zero_division=0 filling in classes that never occur in this split.
    precision_per_class, recall_per_class, f1_per_class, _ = \
        precision_recall_fscore_support(
            y_true, y_pred, labels=class_labels, average=None, zero_division=0
        )
    # One-vs-rest AUPR: class i against everything else.
    aupr_per_class = [
        average_precision_score((y_true == i).astype(int), y_probs[:, i])
        for i in class_labels
    ]

    # Macro metrics
    metrics["Macro"]["Precision"] = np.mean(precision_per_class)
    metrics["Macro"]["Recall"] = np.mean(recall_per_class)
    metrics["Macro"]["F1"] = np.mean(f1_per_class)
    metrics["Macro"]["MCC"] = mcc
    metrics["Macro"]["AUPR"] = np.mean(aupr_per_class)

    # Weighted metrics; minlength pads absent trailing classes with weight 0.
    class_weights = np.bincount(y_true, minlength=num_classes) / len(y_true)
    metrics["Weighted"]["Precision"] = np.sum(precision_per_class * class_weights)
    metrics["Weighted"]["Recall"] = np.sum(recall_per_class * class_weights)
    metrics["Weighted"]["F1"] = np.sum(f1_per_class * class_weights)
    metrics["Weighted"]["MCC"] = mcc
    metrics["Weighted"]["AUPR"] = np.average(aupr_per_class, weights=class_weights)

    return metrics


# Averaging schemes reported by compute_metrics, in output order.
averaging_schemes = ("Micro", "Macro", "Weighted")

# Cross-validate every (kernel, C) configuration, writing per-fold metrics and
# a mean ± std summary for each configuration. itertools.product iterates in
# the same order as the equivalent nested loops.
for kernel, C in itertools.product(kernels, Cs):
    directory = f'{SVM_results}/{kernel} - {C}'
    os.makedirs(directory, exist_ok=True)

    all_metrics = {fmt: [] for fmt in averaging_schemes}

    for fold_num, (train_indices, test_indices) in enumerate(folds, start=1):
        fold_directory = f'{directory}/fold_{fold_num}'
        os.makedirs(fold_directory, exist_ok=True)
        print(f'Processing Fold {fold_num}/{num_folds} ({(fold_num/num_folds)*100:.1f}%)')

        X_train, y_train = features[train_indices], targets[train_indices]
        X_test, y_test = features[test_indices], targets[test_indices]

        classifier = SVC(
            kernel=kernel,
            C=C,
            probability=True,
            decision_function_shape="ovr",
            class_weight="balanced",
            # probability=True shuffles data internally for calibration;
            # a fixed seed keeps the probability estimates reproducible.
            random_state=42,
        )
        classifier.fit(X_train, y_train)
        y_hat = classifier.predict(X_test)
        class_probs = classifier.predict_proba(X_test)

        metrics = compute_metrics(y_test, y_hat, class_probs, num_classes)
        for fmt in averaging_schemes:
            with open(f'{fold_directory}/metrics_{fmt}.txt', 'w') as fold_log_file:
                fold_log_file.write(f"\n{fmt} Metrics for Fold {fold_num}:\n")
                for metric, value in metrics[fmt].items():
                    fold_log_file.write(f"  {metric}: {value:.4f}\n")
            all_metrics[fmt].append(metrics[fmt])

    # Aggregate each metric across folds (mean and population std).
    avg_metrics, std_metrics = {}, {}
    for fmt in averaging_schemes:
        per_fold = all_metrics[fmt]
        avg_metrics[fmt] = {
            metric: np.mean([fold_metrics[metric] for fold_metrics in per_fold])
            for metric in per_fold[0]
        }
        std_metrics[fmt] = {
            metric: np.std([fold_metrics[metric] for fold_metrics in per_fold])
            for metric in per_fold[0]
        }

    # The summary contains the non-ASCII "±", so pin the encoding instead of
    # depending on the platform default.
    with open(directory + '/average_metrics.txt', 'w', encoding='utf-8') as avg_file:
        # Report the real fold count rather than a hard-coded 10.
        avg_file.write(f"\nAverage Metrics Across {num_folds} Folds:\n")
        for fmt in averaging_schemes:
            avg_file.write(f"\n{fmt} Metrics:\n")
            for metric, value in avg_metrics[fmt].items():
                avg_file.write(f"  {metric}: {value:.4f} ± {std_metrics[fmt][metric]:.4f}\n")



linear - 10
Processing Fold 1/10 (10.0%)
