## Import

In [None]:
!pip install lifelines
!pip install scikit-learn
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.regularizers import l1_l2
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from google.colab import drive
# Mount Google Drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Read Data

In [None]:
import os
import sys

from google.colab import drive

# Connect to Google Drive so that files stored there can be read
drive.mount('/content/drive')

# Make custom modules stored in the Colab Notebooks folder importable later.
# Guarded so that re-running this cell does not append the same entry again.
module_dir = '/content/drive/My Drive/Colab Notebooks'
if module_dir not in sys.path:
    sys.path.append(module_dir)

# Folder holding the .npz archives and CSV files read by the next cell
read_path = '/content/drive/My Drive/AIIM/Final/'  # modify this line according to your path

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the labeled and unlabeled .npz archives
import numpy as np

# SECURITY NOTE: allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code. Only load archives that come from a trusted source.
data_l = np.load(os.path.join(read_path, "breast_all.npz"), allow_pickle=True)
data_ul = np.load(os.path.join(read_path, "breast_unlabeled.npz"), allow_pickle=True)

print("Keys: ", data_l.files, '\n')
print("Keys: ", data_ul.files, '\n')

# Build the CSV paths with os.path.join (like the .npz paths above) so that
# read_path works with or without a trailing slash.
train_data_path = os.path.join(read_path, "train_data.csv")
test_data_path = os.path.join(read_path, "test_data.csv")

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

Keys:  ['x_train', 'c_train', 'y_train', 'o_train', 'e_train', 'x_test', 'c_test', 'y_test', 'o_test', 'e_test', 'gene_name', 'clinical_feature'] 

Keys:  ['x_w_full', 'c_w_full', 'x_n_full', 'c_n_full'] 



In [None]:
# Labeled cohort: genetic matrix (x_*), clinical matrix (c_*) and labels (y_*)
# for the train and test partitions stored in the labeled archive.
x_train, c_train, y_train = (data_l[key] for key in ("x_train", "c_train", "y_train"))
x_test, c_test, y_test = (data_l[key] for key in ("x_test", "c_test", "y_test"))

# Unlabeled cohort: only the "w" arrays of the unlabeled archive are used here
x_ul, c_ul = data_ul["x_w_full"], data_ul["c_w_full"]

# Column metadata: names of the gene columns and of the clinical columns
gene_name, clinical_feature = data_l["gene_name"], data_l["clinical_feature"]

## Semi-Supervised Learning (SSL)

### Basic SSL with a 10-Iteration Limit

**Model**
```
base_model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(n_estimators=100, random_state=42))
])
```

**Explanation**
- `("scaler", StandardScaler())`
    
    Standardizes the features

- `("clf", RandomForestClassifier(n_estimators=100, random_state=42))`

  - Random Forest Classifier
  - `n_estimators=100`: Specifies the number of decision trees in the forest.
  - `random_state=42`: Ensures reproducibility of results by fixing the random seed.


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Combine gene expression and clinical data
def combine_features(gene_data, clinical_data):
    """Return one feature matrix: the gene columns followed by the clinical columns."""
    return np.hstack((gene_data, clinical_data))

# Convert labels from True/False to 1/0
y_train = np.array([1 if label else 0 for label in y_train])

# Combine labeled data
X = combine_features(x_train, c_train)
y = y_train

# Split labeled data into a training set and a held-out test set
X_train, X_test, y_train_split, y_test_split = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create a pipeline with scaling and a classifier
base_model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(n_estimators=100, random_state=42))
])

# Self-training: repeatedly fit on the labeled pool, then move every unlabeled
# sample whose top class probability reaches the threshold into the labeled pool.
max_iterations = 10
confidence_threshold = 0.9
X_labeled = X_train.copy()
y_labeled = y_train_split.copy()
X_unlabeled = combine_features(x_ul, c_ul)
pseudo_labels = []

for iteration in range(max_iterations):
    print(f"Iteration {iteration + 1}...")

    # Train the model on the labeled dataset
    base_model.fit(X_labeled, y_labeled)

    # scikit-learn rejects 0-sample inputs, so stop cleanly once every
    # unlabeled sample has been consumed instead of crashing in predict_proba.
    if X_unlabeled.shape[0] == 0:
        print("Unlabeled pool exhausted. Stopping...")
        break

    # Predict probabilities on the unlabeled dataset
    probs = base_model.predict_proba(X_unlabeled)
    pseudo_labels = np.argmax(probs, axis=1)  # Predicted labels
    confidence_scores = np.max(probs, axis=1)  # Max probabilities

    # Select confident predictions
    confident_indices = np.where(confidence_scores >= confidence_threshold)[0]
    if len(confident_indices) == 0:
        print("No confident predictions in this iteration. Stopping...")
        break

    # Move confident predictions from the unlabeled pool to the labeled dataset
    X_labeled = np.vstack((X_labeled, X_unlabeled[confident_indices]))
    y_labeled = np.hstack((y_labeled, pseudo_labels[confident_indices]))
    X_unlabeled = np.delete(X_unlabeled, confident_indices, axis=0)

    print(f"Added {len(confident_indices)} pseudo-labeled samples.")
else:
    # The iteration budget ran out right after a batch was added, so the model
    # has not been trained on that last batch yet. Refit so the evaluated model
    # matches the labeled pool that is written to disk below.
    base_model.fit(X_labeled, y_labeled)

# Evaluate on split test set
y_test_pred = base_model.predict(X_test)
test_accuracy = accuracy_score(y_test_split, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Combine original and pseudo-labeled data
labeled_data = pd.DataFrame(np.hstack((X_train, y_train_split.reshape(-1, 1))),
                            columns=[*gene_name, *clinical_feature, 'Label'])

# Rows beyond len(X_train) in the labeled pool are the pseudo-labeled additions
pseudo_labeled_data = pd.DataFrame(
    np.hstack((X_labeled[len(X_train):], y_labeled[len(X_train):].reshape(-1, 1))),
    columns=[*gene_name, *clinical_feature, 'Label']
)

# Ensure labels are stored as 0/1 (hstack with float features turns them into floats)
labeled_data['Label'] = labeled_data['Label'].astype(int)
pseudo_labeled_data['Label'] = pseudo_labeled_data['Label'].astype(int)

combined_data = pd.concat([labeled_data, pseudo_labeled_data], ignore_index=True)

# Save to CSV
output_file = "/content/drive/My Drive/AIIM/Final/combined_labeled_data.csv"
combined_data.to_csv(output_file, index=False)
print(f"Combined labeled data saved to {output_file}")


Iteration 1...
Added 18 pseudo-labeled samples.
Iteration 2...
Added 15 pseudo-labeled samples.
Iteration 3...
Added 13 pseudo-labeled samples.
Iteration 4...
Added 14 pseudo-labeled samples.
Iteration 5...
Added 21 pseudo-labeled samples.
Iteration 6...
Added 25 pseudo-labeled samples.
Iteration 7...
Added 29 pseudo-labeled samples.
Iteration 8...
Added 33 pseudo-labeled samples.
Iteration 9...
Added 25 pseudo-labeled samples.
Iteration 10...
Added 16 pseudo-labeled samples.
Test Accuracy: 0.6989
Combined labeled data saved to /content/drive/My Drive/AIIM/Final/combined_labeled_data.csv


### Basic SSL with No Iteration Limit (USE THIS ONE)

**Model**
```
base_model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(n_estimators=100, random_state=42))
])
```

**Explanation**
- `("scaler", StandardScaler())`
    
    Standardizes the features

- `("clf", RandomForestClassifier(n_estimators=100, random_state=42))`

  - Random Forest Classifier
  - `n_estimators=100`: Specifies the number of decision trees in the forest.
  - `random_state=42`: Ensures reproducibility of results by fixing the random seed.


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Combine gene expression and clinical data
def combine_features(gene_data, clinical_data):
    """Return one feature matrix: the gene columns followed by the clinical columns."""
    return np.hstack((gene_data, clinical_data))

# Convert labels from True/False to 1/0
y_train = np.array([1 if label else 0 for label in y_train])

# Combine labeled data
X = combine_features(x_train, c_train)
y = y_train

# Split labeled data into a training set and a held-out test set
X_train, X_test, y_train_split, y_test_split = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create a pipeline with scaling and a classifier
base_model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(n_estimators=100, random_state=42))
])

# Self-training: repeatedly fit on the labeled pool, then move every unlabeled
# sample whose top class probability reaches the threshold into the labeled pool.
confidence_threshold = 0.9
X_labeled = X_train.copy()
y_labeled = y_train_split.copy()
X_unlabeled = combine_features(x_ul, c_ul)
confidence_labeled = np.ones(len(y_labeled))  # Confidence for labeled data is 1
pseudo_labels = []
confidence_scores_pseudo = []

total_added = 0  # Counter to track total number of pseudo-labeled data added

while True:
    print("Starting new iteration...")

    # Train the model on the labeled dataset
    base_model.fit(X_labeled, y_labeled)

    # scikit-learn rejects 0-sample inputs, so stop cleanly once every
    # unlabeled sample has been consumed instead of crashing in predict_proba.
    if X_unlabeled.shape[0] == 0:
        print("Unlabeled pool exhausted. Stopping...")
        break

    # Predict probabilities on the unlabeled dataset
    probs = base_model.predict_proba(X_unlabeled)
    pseudo_labels = np.argmax(probs, axis=1)  # Predicted labels
    confidence_scores = np.max(probs, axis=1)  # Max probabilities

    # Select confident predictions
    confident_indices = np.where(confidence_scores >= confidence_threshold)[0]
    if len(confident_indices) == 0:
        print("No confident predictions left. Stopping...")
        break

    # Move confident predictions from the unlabeled pool to the labeled dataset,
    # remembering the confidence each pseudo-label was assigned with
    X_labeled = np.vstack((X_labeled, X_unlabeled[confident_indices]))
    y_labeled = np.hstack((y_labeled, pseudo_labels[confident_indices]))
    confidence_labeled = np.hstack((confidence_labeled, confidence_scores[confident_indices]))
    X_unlabeled = np.delete(X_unlabeled, confident_indices, axis=0)

    total_added += len(confident_indices)  # Update the total added counter

    print(f"Added {len(confident_indices)} pseudo-labeled samples.")

# Output the total number of pseudo-labeled data added
print(f"Total pseudo-labeled data added: {total_added}")

# Combine original and pseudo-labeled data (original rows get confidence 1)
labeled_data = pd.DataFrame(np.hstack((X_train, y_train_split.reshape(-1, 1), np.ones((len(y_train_split), 1)))),
                            columns=[*gene_name, *clinical_feature, 'Label', 'Confidence'])

# Rows beyond len(X_train) in the labeled pool are the pseudo-labeled additions
pseudo_labeled_data = pd.DataFrame(
    np.hstack((X_labeled[len(X_train):], y_labeled[len(X_train):].reshape(-1, 1), confidence_labeled[len(X_train):].reshape(-1, 1))),
    columns=[*gene_name, *clinical_feature, 'Label', 'Confidence']
)

# Ensure labels are stored as 0/1 (hstack with float features turns them into floats)
labeled_data['Label'] = labeled_data['Label'].astype(int)
pseudo_labeled_data['Label'] = pseudo_labeled_data['Label'].astype(int)

combined_data = pd.concat([labeled_data, pseudo_labeled_data], ignore_index=True)

# Save to CSV
output_file = "/content/drive/My Drive/AIIM/Final/combined_labeled_data_w_confidence.csv"
combined_data.to_csv(output_file, index=False)
print(f"Combined labeled data saved to {output_file}")


Starting new iteration...
Added 18 pseudo-labeled samples.
Starting new iteration...
Added 15 pseudo-labeled samples.
Starting new iteration...
Added 13 pseudo-labeled samples.
Starting new iteration...
Added 14 pseudo-labeled samples.
Starting new iteration...
Added 21 pseudo-labeled samples.
Starting new iteration...
Added 25 pseudo-labeled samples.
Starting new iteration...
Added 29 pseudo-labeled samples.
Starting new iteration...
Added 33 pseudo-labeled samples.
Starting new iteration...
Added 25 pseudo-labeled samples.
Starting new iteration...
Added 16 pseudo-labeled samples.
Starting new iteration...
Added 17 pseudo-labeled samples.
Starting new iteration...
Added 19 pseudo-labeled samples.
Starting new iteration...
Added 9 pseudo-labeled samples.
Starting new iteration...
Added 14 pseudo-labeled samples.
Starting new iteration...
Added 10 pseudo-labeled samples.
Starting new iteration...
Added 6 pseudo-labeled samples.
Starting new iteration...
Added 7 pseudo-labeled samples.


### Gradient Boosting - XGBoost

- with Hyperparameter Tuning
- The results were poor.
- Discarded (do not use).

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from scipy.stats import uniform, randint

# Combine gene expression and clinical data
def combine_features(gene_data, clinical_data):
    """Return one feature matrix: the gene columns followed by the clinical columns."""
    return np.hstack((gene_data, clinical_data))

# Convert labels from True/False to 1/0
y_train = np.array([1 if label else 0 for label in y_train])

# Combine labeled data
X = combine_features(x_train, c_train)
y = y_train

# Split labeled data into a training set and a held-out test set
X_train, X_test, y_train_split, y_test_split = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale the features (fit on the training split only, then reuse for the rest)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Prepare the unlabeled data
X_unlabeled = combine_features(x_ul, c_ul)
X_unlabeled = scaler.transform(X_unlabeled)

# Define the hyperparameter distributions for RandomizedSearchCV.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
param_dist = {
    'n_estimators': randint(100, 500),               # Number of boosting rounds
    'max_depth': randint(3, 10),                      # Depth of trees
    'learning_rate': uniform(1e-3, 1e-1),              # Learning rate
    'subsample': uniform(0.6, 0.2),                   # Subsample ratio
    'colsample_bytree': uniform(0.6, 0.2),            # Column sampling
    'scale_pos_weight': uniform(1, 10),               # Class imbalance weight
}

# Initialize the XGBoost model
model = XGBClassifier(
    # use_label_encoder=False,
    eval_metric="logloss"
)

# Set up RandomizedSearchCV with cross-validation
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=50,  # Number of parameter settings to try
    scoring='accuracy',
    cv=3,        # 3-fold cross-validation
    random_state=42,
    verbose=2
)

# Fit the RandomizedSearchCV to the training data
random_search.fit(X_train, y_train_split)

# Get the best parameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Output the best parameters
print("Best parameters found: ", best_params)

# Self-training: repeatedly fit on the labeled pool, then move every unlabeled
# sample whose top class probability reaches the threshold into the labeled pool.
confidence_threshold = 0.9
X_labeled = X_train.copy()
y_labeled = y_train_split.copy()
confidence_labeled = np.ones(len(y_labeled))  # Confidence for labeled data is 1
pseudo_labels = []
confidence_scores_pseudo = []

total_added = 0  # Counter to track total number of pseudo-labeled data added

while True:
    print("Starting new iteration...")

    # Train the best model on the labeled dataset
    best_model.fit(X_labeled, y_labeled)

    # Stop cleanly once every unlabeled sample has been consumed rather than
    # asking the model to predict on an empty matrix.
    if X_unlabeled.shape[0] == 0:
        print("Unlabeled pool exhausted. Stopping...")
        break

    # Predict probabilities on the unlabeled dataset
    probs = best_model.predict_proba(X_unlabeled)
    pseudo_labels = np.argmax(probs, axis=1)  # Predicted labels
    confidence_scores = np.max(probs, axis=1)  # Max probabilities

    # Select confident predictions
    confident_indices = np.where(confidence_scores >= confidence_threshold)[0]
    if len(confident_indices) == 0:
        print("No confident predictions left. Stopping...")
        break

    # Move confident predictions from the unlabeled pool to the labeled dataset,
    # remembering the confidence each pseudo-label was assigned with
    X_labeled = np.vstack((X_labeled, X_unlabeled[confident_indices]))
    y_labeled = np.hstack((y_labeled, pseudo_labels[confident_indices]))
    confidence_labeled = np.hstack((confidence_labeled, confidence_scores[confident_indices]))
    X_unlabeled = np.delete(X_unlabeled, confident_indices, axis=0)

    total_added += len(confident_indices)  # Update the total added counter

    print(f"Added {len(confident_indices)} pseudo-labeled samples.")

# Output the total number of pseudo-labeled data added
print(f"Total pseudo-labeled data added: {total_added}")

# Combine original and pseudo-labeled data (original rows get confidence 1).
# NOTE: X_train and X_labeled hold standardized features in this cell, so the
# CSV written below contains scaled values, not the raw measurements.
labeled_data = pd.DataFrame(np.hstack((X_train, y_train_split.reshape(-1, 1), np.ones((len(y_train_split), 1)))),
                            columns=[*gene_name, *clinical_feature, 'Label', 'Confidence'])

# Rows beyond len(X_train) in the labeled pool are the pseudo-labeled additions
pseudo_labeled_data = pd.DataFrame(
    np.hstack((X_labeled[len(X_train):], y_labeled[len(X_train):].reshape(-1, 1), confidence_labeled[len(X_train):].reshape(-1, 1))),
    columns=[*gene_name, *clinical_feature, 'Label', 'Confidence']
)

# Ensure labels are stored as 0/1 (hstack with float features turns them into floats)
labeled_data['Label'] = labeled_data['Label'].astype(int)
pseudo_labeled_data['Label'] = pseudo_labeled_data['Label'].astype(int)

combined_data = pd.concat([labeled_data, pseudo_labeled_data], ignore_index=True)

# Save to CSV
output_file = "/content/drive/My Drive/AIIM/Final/combined_labeled_data_xgboost_tuned.csv"
combined_data.to_csv(output_file, index=False)
print(f"Combined labeled data saved to {output_file}")


Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END colsample_bytree=0.6749080237694725, learning_rate=0.09607143064099162, max_depth=5, n_estimators=171, scale_pos_weight=6.986584841970366, subsample=0.6312037280884872; total time=   0.5s
[CV] END colsample_bytree=0.6749080237694725, learning_rate=0.09607143064099162, max_depth=5, n_estimators=171, scale_pos_weight=6.986584841970366, subsample=0.6312037280884872; total time=   1.5s
[CV] END colsample_bytree=0.6749080237694725, learning_rate=0.09607143064099162, max_depth=5, n_estimators=171, scale_pos_weight=6.986584841970366, subsample=0.6312037280884872; total time=   3.2s
[CV] END colsample_bytree=0.6311989040672406, learning_rate=0.006808361216819946, max_depth=7, n_estimators=199, scale_pos_weight=2.428668179219408, subsample=0.7301776945897706; total time=   0.8s
[CV] END colsample_bytree=0.6311989040672406, learning_rate=0.006808361216819946, max_depth=7, n_estimators=199, scale_pos_weight=2.428668179219408, 

### Co-Training

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from torch.autograd import Variable
from sklearn.preprocessing import MinMaxScaler

In [None]:
class VAE(nn.Module):
    """Fully connected variational autoencoder with one 400-unit hidden layer per side.

    The encoder maps an input vector to the mean and log-variance of a diagonal
    Gaussian over the latent space; the decoder maps a latent vector back to the
    input space through a sigmoid, so reconstructions lie in [0, 1].

    Args:
        input_dim: Number of features in each input vector.
        latent_dim: Size of the latent space.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        # Encoder: input -> hidden -> (mean, log-variance)
        self.fc1 = nn.Linear(input_dim, 400)
        self.fc21 = nn.Linear(400, latent_dim)
        self.fc22 = nn.Linear(400, latent_dim)
        # Decoder: latent -> hidden -> reconstruction
        self.fc3 = nn.Linear(latent_dim, 400)
        self.fc4 = nn.Linear(400, input_dim)

    def encode(self, x):
        """Return ``(mu, logvar)`` of the approximate posterior for ``x``."""
        hidden = F.relu(self.fc1(x))
        mu = self.fc21(hidden)
        logvar = self.fc22(hidden)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Draw ``z = mu + eps * sigma`` with ``eps`` sampled from a standard normal."""
        sigma = torch.exp(0.5 * logvar)
        noise = torch.randn_like(sigma)
        return mu + noise * sigma

    def decode(self, z):
        """Map a latent vector to a reconstruction squashed into [0, 1]."""
        hidden = F.relu(self.fc3(z))
        pre_activation = self.fc4(hidden)
        return torch.sigmoid(pre_activation)

    def forward(self, x):
        """Encode, sample a latent vector and decode; returns ``(recon, mu, logvar)``."""
        mu, logvar = self.encode(x)
        latent = self.reparameterize(mu, logvar)
        return self.decode(latent), mu, logvar

    def loss_function(self, recon_x, x, mu, logvar):
        """Return summed reconstruction BCE plus KL divergence to a unit Gaussian."""
        reconstruction_term = F.binary_cross_entropy(recon_x, x, reduction='sum')
        # Closed-form KL divergence between N(mu, sigma^2) and N(0, I)
        kl_term = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return reconstruction_term + kl_term

In [None]:
class ClassifierWithVariationalDropout(nn.Module):
    """Two-layer MLP classifier (input -> 128 -> output) with dropout on the hidden layer.

    The dropout rate is stored on the module as a parameter so it stays in
    ``parameters()`` and ``state_dict()``, but it is NOT trained: ``forward``
    reads it with ``.item()``, which detaches it from the autograd graph, so it
    can never receive a gradient. It is therefore marked ``requires_grad=False``.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output logits (classes).
        dropout_rate: Probability of zeroing a hidden unit during training;
            must lie in [0, 1].

    Raises:
        ValueError: If ``dropout_rate`` is outside [0, 1].
    """

    def __init__(self, input_dim, output_dim, dropout_rate=0.5):
        super().__init__()
        # Fail at construction time instead of at the first forward pass
        if not 0.0 <= dropout_rate <= 1.0:
            raise ValueError(
                f"dropout_rate must be between 0 and 1, got {dropout_rate}"
            )
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)
        # float() keeps integer rates (0 or 1) from producing an integer tensor
        self.dropout_rate = torch.nn.Parameter(
            torch.tensor([float(dropout_rate)]), requires_grad=False
        )  # Fixed (non-trainable) dropout rate

    def forward(self, x):
        """Return raw class logits for ``x``; dropout is active only in training mode."""
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=self.dropout_rate.item(), training=self.training)
        return self.fc2(x)


In [None]:
#### DATA PREPARATION ####

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

# Normalize gene and clinical data using MinMaxScaler
scaler_gene = MinMaxScaler()
X_gene_scaled = scaler_gene.fit_transform(x_train)  # Normalize labeled gene data
X_gene_ul_scaled = scaler_gene.transform(x_ul)                # Unlabeled data

scaler_clinical = MinMaxScaler()
X_clinical_scaled = scaler_clinical.fit_transform(c_train)  # Normalize labeled clinical data

# Convert labels to integers (1/0).
# Comparing against the string 'True' only works when the labels are stored as
# text; for boolean or already-converted 0/1 labels that comparison is False
# everywhere, which silently turns every target into class 0.
labels_raw = np.asarray(y_train)
if labels_raw.dtype.kind in "OUS":
    # Text / object labels: compare their string form against 'True'
    Y_train = (labels_raw.astype(str) == 'True').astype(int)
else:
    # Boolean or numeric labels: any non-zero value is the positive class
    Y_train = labels_raw.astype(bool).astype(int)

# A single-class target makes the classification loss trivially zero
if np.unique(Y_train).size < 2:
    raise ValueError(
        "Label conversion produced a single class; check the encoding of y_train."
    )

# Convert to PyTorch tensors
X_gene_tensor = torch.tensor(X_gene_scaled, dtype=torch.float32)
X_clinical_tensor = torch.tensor(X_clinical_scaled, dtype=torch.float32)
Y_tensor = torch.tensor(Y_train, dtype=torch.long)  # Use long for classification targets

# Split labeled data into training and testing sets.
# All three tensors go through ONE call so the gene rows, clinical rows and
# labels are guaranteed to stay aligned sample-by-sample.
(X_gene_train, X_gene_test,
 X_clinical_train, X_clinical_test,
 Y_train, Y_test) = train_test_split(
    X_gene_tensor, X_clinical_tensor, Y_tensor, test_size=0.2, random_state=42
)

# Create DataLoaders for training and validation
train_data_gene = TensorDataset(X_gene_train, Y_train)
test_data_gene = TensorDataset(X_gene_test, Y_test)

train_data_clinical = TensorDataset(X_clinical_train, Y_train)
test_data_clinical = TensorDataset(X_clinical_test, Y_test)

train_loader_gene = DataLoader(train_data_gene, batch_size=32, shuffle=True)
test_loader_gene = DataLoader(test_data_gene, batch_size=32, shuffle=False)

train_loader_clinical = DataLoader(train_data_clinical, batch_size=32, shuffle=True)
test_loader_clinical = DataLoader(test_data_clinical, batch_size=32, shuffle=False)


In [None]:
#### TRAINING ####
# Trains two independent models, one per data view:
#   1. a VAE on the gene features together with a classifier on its latent mean;
#   2. a classifier directly on the clinical features.

# Initialize models
vae = VAE(input_dim=X_gene_train.shape[1], latent_dim=10)  # VAE for gene data
classifier_gene = ClassifierWithVariationalDropout(input_dim=10, output_dim=2)  # Classifier for VAE latent space
classifier_clinical = ClassifierWithVariationalDropout(input_dim=X_clinical_train.shape[1], output_dim=2)  # Classifier for clinical data

# Optimizers: one Adam instance updates the VAE and the gene classifier jointly,
# a second Adam instance updates the clinical classifier.
# NOTE(review): lr=0.1 is far above the Adam default of 1e-3 — confirm it is intended.
vae_optimizer = torch.optim.Adam(list(vae.parameters()) + list(classifier_gene.parameters()), lr=0.1)
classifier_optimizer = torch.optim.Adam(classifier_clinical.parameters(), lr=0.0001)


# Training Loop for VAE + Classifier on Gene Data
for epoch in range(100):  # Adjust epochs as needed
    vae.train()
    classifier_gene.train()
    for batch_idx, (data, target) in enumerate(train_loader_gene):
        vae_optimizer.zero_grad()

        # Forward pass through VAE and classifier
        recon_batch, mu, logvar = vae(data)
        # The classifier sees the latent mean `mu`, not a sampled latent vector
        output = classifier_gene(mu)  # Use latent space representation

        # Compute VAE loss and classification loss
        vae_loss = vae.loss_function(recon_batch, data, mu, logvar)
        classification_loss = torch.nn.functional.cross_entropy(output, target)

        # Total loss: unweighted sum of the two terms.
        # NOTE(review): the VAE loss is summed over the batch while cross_entropy
        # averages by default, so the VAE term likely dominates — confirm.
        loss = vae_loss + classification_loss
        loss.backward()
        vae_optimizer.step()

    # Reports the losses of the LAST mini-batch of the epoch, not an epoch average.
    # NOTE(review): a classification loss of exactly 0.0 usually means the targets
    # contain a single class — verify the label encoding used to build the loaders.
    print(f"Epoch {epoch} | VAE Loss: {vae_loss.item()} | Classification Loss: {classification_loss.item()}")

print('##---------------------------------------------------------##')

# Training Loop for Classifier on Clinical Data
for epoch in range(100):  # Adjust epochs as needed
    classifier_clinical.train()
    for batch_idx, (data, target) in enumerate(train_loader_clinical):
        classifier_optimizer.zero_grad()

        # Forward pass through classifier
        output = classifier_clinical(data)

        # Compute classification loss
        classification_loss = torch.nn.functional.cross_entropy(output, target)

        classification_loss.backward()
        classifier_optimizer.step()

    # Reports the loss of the LAST mini-batch of the epoch, not an epoch average
    print(f"Epoch {epoch} | Clinical Classification Loss: {classification_loss.item()}")



Epoch 0 | VAE Loss: 1121.533447265625 | Classification Loss: 0.0
Epoch 1 | VAE Loss: 963.0816040039062 | Classification Loss: 0.0
Epoch 2 | VAE Loss: 1478.345947265625 | Classification Loss: 0.0
Epoch 3 | VAE Loss: 1292.28125 | Classification Loss: 0.0
Epoch 4 | VAE Loss: 899.00341796875 | Classification Loss: 0.0
Epoch 5 | VAE Loss: 819.0283813476562 | Classification Loss: 0.0
Epoch 6 | VAE Loss: 771.4208374023438 | Classification Loss: 0.0
Epoch 7 | VAE Loss: 523.0353393554688 | Classification Loss: 0.0
Epoch 8 | VAE Loss: 350.8897705078125 | Classification Loss: 0.0
Epoch 9 | VAE Loss: 298.42315673828125 | Classification Loss: 0.0
Epoch 10 | VAE Loss: 315.5010681152344 | Classification Loss: 0.0
Epoch 11 | VAE Loss: 749.5999755859375 | Classification Loss: 0.0
Epoch 12 | VAE Loss: 687.9379272460938 | Classification Loss: 0.0
Epoch 13 | VAE Loss: 602.1236572265625 | Classification Loss: 0.0
Epoch 14 | VAE Loss: 472.87994384765625 | Classification Loss: 0.0
Epoch 15 | VAE Loss: 374.14

In [None]:
# Function for pseudo-labeling
def pseudo_labeling(model_gene, model_clinical, unlabeled_data_gene, unlabeled_data_clinical, threshold=0.8, return_labels=False):
    """Select the unlabeled samples that each model classifies with high confidence.

    Both models are switched to eval mode and run without gradient tracking.
    Their outputs are treated as raw class logits (the classifiers in this
    notebook return logits), so a softmax is applied before the top class
    probability is compared with ``threshold``. Thresholding the raw logits
    would compare unbounded scores against a probability cut-off.

    The two views are filtered independently, so the returned tensors may have
    different lengths and are not row-aligned with each other.

    Args:
        model_gene: Model producing class logits for ``unlabeled_data_gene``.
        model_clinical: Model producing class logits for ``unlabeled_data_clinical``.
        unlabeled_data_gene: Tensor of unlabeled samples for the gene model.
        unlabeled_data_clinical: Tensor of unlabeled samples for the clinical model.
        threshold: Minimum top-class probability (exclusive) to keep a sample.
        return_labels: If True, also return the predicted class of every kept sample.

    Returns:
        ``(pseudo_labeled_gene, pseudo_labeled_clinical)`` by default. With
        ``return_labels=True``: ``(pseudo_labeled_gene, pseudo_labeled_clinical,
        pseudo_labels_gene, pseudo_labels_clinical)``.
    """
    model_gene.eval()
    model_clinical.eval()

    def _select_confident(model, samples):
        # Convert logits to probabilities, then keep rows above the threshold
        with torch.no_grad():
            probabilities = torch.softmax(model(samples), dim=1)
            confidence, predicted = torch.max(probabilities, 1)
        confident_mask = confidence > threshold
        return samples[confident_mask], predicted[confident_mask]

    # Pseudo-labeling for gene model
    pseudo_labeled_gene, pseudo_labels_gene = _select_confident(model_gene, unlabeled_data_gene)

    # Pseudo-labeling for clinical model
    pseudo_labeled_clinical, pseudo_labels_clinical = _select_confident(model_clinical, unlabeled_data_clinical)

    if return_labels:
        return pseudo_labeled_gene, pseudo_labeled_clinical, pseudo_labels_gene, pseudo_labels_clinical
    return pseudo_labeled_gene, pseudo_labeled_clinical
