In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (accuracy_score, confusion_matrix, roc_curve, auc)
import matplotlib.pyplot as plt
import seaborn as sns
import autosklearn.classification
import joblib

In [2]:
import autosklearn.regression
import autosklearn.classification
import autosklearn.classification as classifier
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import (accuracy,
                                 f1,
                                 roc_auc,
                                 precision,
                                 average_precision,
                                 recall,
                                 log_loss,
                                 r2,
                                 mean_squared_error,
                                 mean_absolute_error,
                                 )
import autosklearn.regression
from sklearn.utils.fixes import _joblib_parallel_args
from sklearn.model_selection import train_test_split, StratifiedKFold

In [1]:
# === Load expression data ===
expr = pd.read_csv("BRCA_VST_Normalized_Matrix.csv", index_col=0)
meta = pd.read_csv("BRCA_Metadata_Final.csv", index_col=0)
genes_df = pd.read_excel("Common_Genes_BRCA.xlsx", header=None)
selected_genes = genes_df.iloc[:, 0].dropna().tolist()

# === Filter matrix ===
expr = expr.loc[expr.index.str.upper().isin([g.upper() for g in selected_genes])]
expr.index = expr.index.str.upper()
expr = expr.loc[[g.upper() for g in selected_genes]]  # preserve order
X = expr.T

# === Labels ===
meta['Group'] = meta['sample_type'].replace({
    "Solid Tissue Normal": "Normal",
    "Primary Tumor": "Tumor"
})
y = meta.loc[X.index, 'Group'].values

In [2]:
# === Encode & scale ===
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # Normal=0, Tumor=1

In [3]:
# === Train/test split ===
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

In [4]:
# === Apply SMOTE on training data ===
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [14]:
# === AutoSklearn ===
skf = StratifiedKFold(n_splits=3)
clf = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=18000,
    memory_limit=2048,
    resampling_strategy=skf,
    ensemble_kwargs={'ensemble_size': 3},
    metric=autosklearn.metrics.roc_auc
)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [15]:
clf.fit(X_train_resampled, y_train_resampled, X_test=X_test, y_test=y_test)

[ERROR] [2025-07-25 17:33:12,227:Client-AutoML(1):e8468cba-695b-11f0-8f27-8c859067870a] (' Dummy prediction failed with run state StatusType.CRASHED and additional output: {\'error\': \'Result queue is empty\', \'exit_status\': "<class \'pynisher.limit_function_call.AnythingException\'>", \'subprocess_stdout\': \'\', \'subprocess_stderr\': \'Process pynisher function call:\\nTraceback (most recent call last):\\n  File "/opt/anaconda3/envs/Autosklearn/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap\\n    self.run()\\n  File "/opt/anaconda3/envs/Autosklearn/lib/python3.8/multiprocessing/process.py", line 108, in run\\n    self._target(*self._args, **self._kwargs)\\n  File "/opt/anaconda3/envs/Autosklearn/lib/python3.8/site-packages/pynisher/limit_function_call.py", line 108, in subprocess_func\\n    resource.setrlimit(resource.RLIMIT_AS, (mem_in_b, mem_in_b))\\nValueError: current limit exceeds maximum limit\\n\', \'exitcode\': 1, \'configuration_origin\': \'DUMMY\'}.'

ValueError: (' Dummy prediction failed with run state StatusType.CRASHED and additional output: {\'error\': \'Result queue is empty\', \'exit_status\': "<class \'pynisher.limit_function_call.AnythingException\'>", \'subprocess_stdout\': \'\', \'subprocess_stderr\': \'Process pynisher function call:\\nTraceback (most recent call last):\\n  File "/opt/anaconda3/envs/Autosklearn/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap\\n    self.run()\\n  File "/opt/anaconda3/envs/Autosklearn/lib/python3.8/multiprocessing/process.py", line 108, in run\\n    self._target(*self._args, **self._kwargs)\\n  File "/opt/anaconda3/envs/Autosklearn/lib/python3.8/site-packages/pynisher/limit_function_call.py", line 108, in subprocess_func\\n    resource.setrlimit(resource.RLIMIT_AS, (mem_in_b, mem_in_b))\\nValueError: current limit exceeds maximum limit\\n\', \'exitcode\': 1, \'configuration_origin\': \'DUMMY\'}.',)

In [None]:
# === Predict ===
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [None]:
# === ROC Curve ===
y_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc = auc(fpr, tpr)

In [None]:
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {roc:.2f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.tight_layout()
plt.savefig("BRCA_ROC_Curve.png", dpi=600)
plt.close()

In [None]:
# === Confusion Matrix Plot ===
plt.figure(figsize=(4, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Normal', 'Tumor'], yticklabels=['Normal', 'Tumor'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig("BRCA_Confusion_Matrix.png", dpi=600)
plt.close()

In [None]:
# === Metrics ===
if cm.shape == (2, 2):
    TN, FP, FN, TP = cm.ravel()
    metrics = {
        'Accuracy': acc,
        'Predictive Value (Normal)': TN / (TN + FN) if (TN + FN) else 0,
        'Predictive Value (Tumor)': TP / (TP + FP) if (TP + FP) else 0,
        'Precision (Normal)': TN / (TN + FP) if (TN + FP) else 0,
        'Precision (Tumor)': TP / (TP + FP) if (TP + FP) else 0,
        'Recall (Normal)': TN / (TN + FN) if (TN + FN) else 0,
        'Recall (Tumor)': TP / (TP + FN) if (TP + FN) else 0,
        'F1 Score (Normal)': (2 * TN / (TN + FP) * TN / (TN + FN)) /
                             (TN / (TN + FP) + TN / (TN + FN)) if (TN + FP + TN + FN) else 0,
        'F1 Score (Tumor)': (2 * TP / (TP + FP) * TP / (TP + FN)) /
                            (TP / (TP + FP) + TP / (TP + FN)) if (TP + FP + TP + FN) else 0,
        'F1 Macro Average': (
            ((2 * TN / (TN + FP) * TN / (TN + FN)) /
             (TN / (TN + FP) + TN / (TN + FN)) +
             (2 * TP / (TP + FP) * TP / (TP + FN)) /
             (TP / (TP + FP) + TP / (TP + FN)))
        ) / 2 if all([TN + FP + TN + FN, TP + FP + TP + FN]) else 0,
        'Sensitivity': TP / (TP + FN) if (TP + FN) != 0 else 0,
        'Specificity': TN / (TN + FP) if (TN + FP) != 0 else 0,
        'ROC AUC': roc,
    }
else:
    metrics = {'Error': 'Confusion matrix not 2x2 – check labels.'}

In [None]:
metrics_df = pd.DataFrame([metrics])
metrics_df.to_excel("BRCA_AutoSklearn_Metrics.xlsx", index=False)

# === Save model ===
joblib.dump(clf, "BRCA_AutoSklearn_Model_SMOTE.joblib")