## Import

In [88]:
!pip install lifelines
!pip install scikit-learn
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.regularizers import l1_l2
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score



## Read Data

In [89]:
from google.colab import drive
import os

# connect to goodle drive
from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks') # to import our custom module from this path later

read_path = '/content/drive/My Drive/AIIM/Final/SSL data/' # modify this line according to your path

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [90]:
import numpy as np
import pandas as pd

TRAIN_DATA_PATH = read_path + 'train_data.csv'
VAL_DATA_PATH = read_path + 'val_data.csv'
UNLABELED_DATA_PATH = read_path + 'unlabelled_data.csv'

train_data = pd.read_csv(TRAIN_DATA_PATH)
val_data = pd.read_csv(VAL_DATA_PATH)
unlabeled_data = pd.read_csv(UNLABELED_DATA_PATH)

train_data.columns

Index(['ESR1', 'PGR', 'ERBB2', 'MKI67', 'PLAU', 'ELAVL1', 'EGFR', 'BTRC',
       'FBXO6', 'SHMT2', 'KRAS', 'SRPK2', 'YWHAQ', 'PDHA1', 'EWSR1', 'ZDHHC17',
       'ENO1', 'DBN1', 'PLK1', 'GSK3B', 'Age', 'Menopausal State', 'Size',
       'Radio Therapy', 'Chemotherapy', 'Hormone Therapy',
       'Neoplasm Histologic Grade', 'Cellularity', 'Surgery-breast conserving',
       'Surgery-mastectomy', 'Label', 'DssTime', 'Event', 'auto_id'],
      dtype='object')

In [91]:
# Data Preprocessing

x_train = train_data.drop(columns=['Label', 'DssTime', 'Event'])
y_train = train_data['Label']

x_test = val_data.drop(columns=['Label', 'DssTime', 'Event'])
y_test = val_data['Label']

x_ul = unlabeled_data

# Add 'ID' column
# x_train.insert(0, 'ID', range(1, len(x_train) + 1))
# x_ul.insert(0, 'ID', range(len(x_train) + 1, len(x_train) + len(x_ul) + 1))

In [None]:
x_test

Unnamed: 0,ESR1,PGR,ERBB2,MKI67,PLAU,ELAVL1,EGFR,BTRC,FBXO6,SHMT2,...,Menopausal State,Size,Radio Therapy,Chemotherapy,Hormone Therapy,Neoplasm Histologic Grade,Cellularity,Surgery-breast conserving,Surgery-mastectomy,auto_id
0,6.407398,5.332991,11.028405,5.570715,7.884060,6.170632,6.524852,6.087229,7.607153,9.709837,...,1,35,0,0,1,2,0.5,0,1,1112
1,5.359227,5.233657,11.295086,5.467575,7.806072,7.934447,6.342535,6.249445,6.337625,9.996191,...,1,45,1,0,0,2,0.5,1,0,1091
2,5.874314,5.226781,10.052359,6.345439,8.733782,7.140753,7.271026,6.408314,7.436915,11.848892,...,1,45,0,1,0,3,1.0,0,1,1215
3,11.050258,5.361671,9.424651,6.489388,8.299958,6.422778,5.706520,6.598701,7.400819,10.873850,...,1,21,1,1,1,2,1.0,1,0,1438
4,11.424278,5.372667,10.864729,6.410957,6.787171,6.926255,5.685226,7.046461,6.946620,9.987552,...,1,30,1,1,1,2,0.5,1,0,1455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,7.031565,5.473497,13.475461,5.726077,9.473292,6.121501,5.446935,6.072814,7.663872,10.869404,...,0,25,1,1,0,2,1.0,1,0,1355
89,5.743862,5.279866,10.985029,6.166024,9.326726,7.260007,7.229502,6.019048,8.364236,10.849398,...,0,10,1,0,0,3,0.5,0,1,1196
90,6.204958,5.172111,8.881671,5.861609,8.530361,6.671294,11.724683,6.046692,7.401715,10.481299,...,1,39,1,1,0,3,0.0,0,1,1004
91,11.916582,7.464985,11.086953,5.908589,9.729562,6.260820,5.740201,6.800034,7.160187,10.163748,...,1,28,1,0,0,2,1.0,1,0,1230


In [None]:
print('x_train: ', x_train.shape)
print('x_test: ', x_test.shape)
print('x_ul: ', x_ul.shape)

x_train:  (372, 31)
x_test:  (93, 31)
x_ul:  (1168, 31)


## Semi-Supervised Learning (SSL)

###Basic SSL with 10 Loops Limitation

**Model**
```
base_model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(n_estimators=100, random_state=42))
])
```

**Explination**
- `("scaler", StandardScaler())`
    
    Standardizes the features

- `("clf", RandomForestClassifier(n_estimators=100, random_state=42))`

  - Random Forest Classifier
  - `n_estimators=100`: Specifies the number of decision trees in the forest.
  - `random_state=42`: Ensures reproducibility of results by fixing the random seed.


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Combine gene expression and clinical data
def combine_features(gene_data, clinical_data):
    return np.hstack((gene_data, clinical_data))

# Convert labels from True/False to 1/0
y_train = np.array([1 if label else 0 for label in y_train])

# Combine labeled data
X = combine_features(x_train, c_train)
y = y_train

# Split labeled data into a training set and a test set
X_train, X_test, y_train_split, y_test_split = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create a pipeline with scaling and a classifier
base_model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(n_estimators=100, random_state=42))
])

# Self-training
max_iterations = 10
confidence_threshold = 0.9
X_labeled = X_train.copy()
y_labeled = y_train_split.copy()
X_unlabeled = combine_features(x_ul, c_ul)
pseudo_labels = []

for iteration in range(max_iterations):
    print(f"Iteration {iteration + 1}...")

    # Train the model on the labeled dataset
    base_model.fit(X_labeled, y_labeled)

    # Predict probabilities on the unlabeled dataset
    probs = base_model.predict_proba(X_unlabeled)
    pseudo_labels = np.argmax(probs, axis=1)  # Predicted labels
    confidence_scores = np.max(probs, axis=1)  # Max probabilities

    # Select confident predictions
    confident_indices = np.where(confidence_scores >= confidence_threshold)[0]
    if len(confident_indices) == 0:
        print("No confident predictions in this iteration. Stopping...")
        break

    # Add confident predictions to the labeled dataset
    X_labeled = np.vstack((X_labeled, X_unlabeled[confident_indices]))
    y_labeled = np.hstack((y_labeled, pseudo_labels[confident_indices]))
    X_unlabeled = np.delete(X_unlabeled, confident_indices, axis=0)

    print(f"Added {len(confident_indices)} pseudo-labeled samples.")

# Evaluate on split test set
y_test_pred = base_model.predict(X_test)
test_accuracy = accuracy_score(y_test_split, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Combine original and pseudo-labeled data
labeled_data = pd.DataFrame(np.hstack((X_train, y_train_split.reshape(-1, 1))),
                            columns=[*gene_name, *clinical_feature, 'Label'])

pseudo_labeled_data = pd.DataFrame(
    np.hstack((X_labeled[len(X_train):], y_labeled[len(X_train):].reshape(-1, 1))),
    columns=[*gene_name, *clinical_feature, 'Label']
)

# Ensure labels are stored as 0/1
labeled_data['Label'] = labeled_data['Label'].astype(int)
pseudo_labeled_data['Label'] = pseudo_labeled_data['Label'].astype(int)

combined_data = pd.concat([labeled_data, pseudo_labeled_data], ignore_index=True)

# Save to CSV
output_file = "/content/drive/My Drive/AIIM/Final/combined_labeled_data.csv"
combined_data.to_csv(output_file, index=False)
print(f"Combined labeled data saved to {output_file}")


Iteration 1...
Added 18 pseudo-labeled samples.
Iteration 2...
Added 15 pseudo-labeled samples.
Iteration 3...
Added 13 pseudo-labeled samples.
Iteration 4...
Added 14 pseudo-labeled samples.
Iteration 5...
Added 21 pseudo-labeled samples.
Iteration 6...
Added 25 pseudo-labeled samples.
Iteration 7...
Added 29 pseudo-labeled samples.
Iteration 8...
Added 33 pseudo-labeled samples.
Iteration 9...
Added 25 pseudo-labeled samples.
Iteration 10...
Added 16 pseudo-labeled samples.
Test Accuracy: 0.6989
Combined labeled data saved to /content/drive/My Drive/AIIM/Final/combined_labeled_data.csv


###Random Forest with No Loop Limitation (USE THIS ONE)

****
**Model**
```
base_model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(n_estimators=100, random_state=42))
])
```

**Explination**
- `("scaler", StandardScaler())`
    
    - Standardizes the features

- `("clf", RandomForestClassifier(n_estimators=100, random_state=42))`

  - Random Forest Classifier
  - `n_estimators=100`: Specifies the number of decision trees in the forest.
  - `random_state=42`: Ensures reproducibility of results by fixing the random seed.

****

**Method**

Train an initail model using labeled data to predict the label for unlabeled data. Add those with high confidence to labeled data and re-train the model. Iterate until no more data meet the confidence threshold.

The remaining unlabeled data would be added to the final dataset too, with confidence set to 0.5 (or the real confidence).

****

In [92]:
# Method for semi-supervised learning
def semi_supervised_learning(x_train, y_train, x_ul, base_model, confidence_threshold=0.9):
    x_train = x_train.copy()
    x_train["Confidence"] = 1.0

    y_train = y_train.copy()
    x_ul = x_ul.copy()

    iteration = 0
    while not x_ul.empty:
        # Train the model on labeled data
        base_model.fit(x_train.drop(columns=["Confidence", "auto_id"]), y_train)

        # Predict probabilities for unlabeled data
        probas = base_model.predict_proba(x_ul.drop(columns=["auto_id"]))
        predictions = base_model.predict(x_ul.drop(columns=["auto_id"]))

        # Select high-confidence predictions
        max_probas = np.max(probas, axis=1)
        high_conf_indices = np.where(max_probas >= confidence_threshold)[0]

        if len(high_conf_indices) == 0:
            print(f"No more high-confidence predictions at iteration {iteration}.")
            break

        # Add high-confidence predictions to the labeled dataset
        x_high_conf = x_ul.iloc[high_conf_indices]
        y_high_conf = predictions[high_conf_indices]
        high_conf_values = max_probas[high_conf_indices]

        # x_high_conf.loc["Confidence"] = high_conf_values
        x_high_conf = pd.concat([x_high_conf, pd.Series(high_conf_values, index=x_high_conf.index, name="Confidence")], axis=1)

        x_train = pd.concat([x_train, x_high_conf], axis=0)
        y_train = pd.concat([y_train, pd.Series(y_high_conf, index=x_high_conf.index)], axis=0)

        # Remove high-confidence data from the unlabeled dataset
        x_ul = x_ul.drop(index=x_high_conf.index)

        iteration += 1
        print(f"Iteration {iteration}: Added {len(high_conf_indices)} samples to labeled data.")

    # Add remaining unlabeled data with their final predicted probabilities
    if not x_ul.empty:
        # final_probas = base_model.predict_proba(x_ul.drop(columns=["auto_id"]))
        final_predictions = base_model.predict(x_ul.drop(columns=["auto_id"]))
        # max_confidences = np.max(final_probas, axis=1)

        # Combine the data, labels, and confidence into a single DataFrame
        remaining_data = x_ul.copy()
        remaining_data["Label"] = final_predictions
        remaining_data["Confidence"] = 0.5
        # remaining_data["Confidence"] = max_confidences

    else:
        remaining_data = pd.DataFrame(columns=list(x_ul.columns) + ["Label", "Confidence"])

    # Combine the (psuedo) labeled data with remaining data

    x_train["Label"] = y_train
    pseudo_labeled_data = pd.concat([x_train, remaining_data], axis=0)

    return base_model, pseudo_labeled_data

In [93]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Split labeled data into a training set and a test set
# x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(
#     x_train, y_train, test_size=0.2, random_state=42, stratify=y
# )

# Create a pipeline with scaling and a classifier
base_model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(n_estimators=100, random_state=42))
])

# Run semi-supervised learning
model, pseudo_labeled_data = semi_supervised_learning(
    x_train, y_train, x_ul, base_model, confidence_threshold=0.83
)

Iteration 1: Added 93 samples to labeled data.
Iteration 2: Added 140 samples to labeled data.
Iteration 3: Added 134 samples to labeled data.
Iteration 4: Added 106 samples to labeled data.
Iteration 5: Added 82 samples to labeled data.
Iteration 6: Added 64 samples to labeled data.
Iteration 7: Added 31 samples to labeled data.
Iteration 8: Added 29 samples to labeled data.
Iteration 9: Added 18 samples to labeled data.
Iteration 10: Added 25 samples to labeled data.
Iteration 11: Added 15 samples to labeled data.
Iteration 12: Added 13 samples to labeled data.
Iteration 13: Added 19 samples to labeled data.
Iteration 14: Added 17 samples to labeled data.
Iteration 15: Added 8 samples to labeled data.
Iteration 16: Added 6 samples to labeled data.
Iteration 17: Added 4 samples to labeled data.
Iteration 18: Added 1 samples to labeled data.
Iteration 19: Added 2 samples to labeled data.
No more high-confidence predictions at iteration 19.


In [94]:
# Predict for the testing dataset
y_test_predictions = model.predict(x_test.drop(columns=["auto_id"]))
y_test_probas = model.predict_proba(x_test.drop(columns=["auto_id"]))
max_confidences = np.max(y_test_probas, axis=1)

test_data = x_test.copy()
test_data["Label"] = y_test_predictions
# test_data["Confidence"] = max_confidences
test_data["Confidence"] = np.where(max_confidences < 0.83, 0.5, max_confidences)

# Concat it with 'pseudo_labeled_data'
pseudo_labeled_data = pd.concat([pseudo_labeled_data, test_data], axis=0)

In [98]:
# Concat x_train and y_train column-wise
# pseudo_labeled_data = pd.concat([x_final_train, y_final_train], axis=1)

# Sort data by 'ID' column
pseudo_labeled_data = pseudo_labeled_data.sort_values(by='auto_id')

# Reset the index of the DataFrame
pseudo_labeled_data = pseudo_labeled_data.reset_index(drop=True)

# Delete the 'ID' column
# pseudo_labeled_data = pseudo_labeled_data.drop(columns=['auto_id'])



In [99]:
pseudo_labeled_data

Unnamed: 0,ESR1,PGR,ERBB2,MKI67,PLAU,ELAVL1,EGFR,BTRC,FBXO6,SHMT2,...,Radio Therapy,Chemotherapy,Hormone Therapy,Neoplasm Histologic Grade,Cellularity,Surgery-breast conserving,Surgery-mastectomy,auto_id,Confidence,Label
0,10.041281,7.376123,9.725825,5.427919,9.300307,6.219375,6.125355,5.888779,7.893369,9.007343,...,0,1,1,2.0,1.0,0.0,1.0,1000,1.00,0
1,11.276581,7.331223,9.956267,5.629876,8.119906,5.665620,5.775809,6.251167,8.242063,10.871432,...,1,1,1,3.0,1.0,0.0,1.0,1001,1.00,1
2,7.536847,5.587666,11.514514,5.722951,6.741081,6.321480,5.466188,6.956486,7.673015,9.837096,...,0,0,0,2.0,1.0,0.0,1.0,1002,0.50,1
3,10.395644,6.531288,9.075396,5.440774,7.861422,5.973844,5.757120,6.026611,7.666777,9.455256,...,1,0,1,2.0,0.5,1.0,0.0,1003,0.96,0
4,6.204958,5.172111,8.881671,5.861609,8.530361,6.671294,11.724683,6.046692,7.401715,10.481299,...,1,1,0,3.0,0.0,0.0,1.0,1004,0.97,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1628,11.628490,5.570690,10.475695,6.032211,9.944405,5.865408,5.703147,6.649948,7.272166,9.750208,...,0,0,1,3.0,1.0,0.0,1.0,3163,0.83,0
1629,10.879891,6.431113,10.219154,5.435795,9.224122,5.699195,5.825643,6.404899,7.385644,9.271953,...,0,0,1,3.0,1.0,0.0,1.0,3164,0.89,0
1630,9.591235,7.984515,9.935179,5.605596,9.799519,5.808704,5.905282,6.491419,7.865526,9.741103,...,1,0,1,3.0,1.0,1.0,0.0,3165,0.83,0
1631,11.055114,8.282737,9.892589,5.753274,8.687667,5.475813,5.587906,6.830579,8.468221,9.482622,...,0,0,1,2.0,0.5,0.0,1.0,3166,0.85,0


In [100]:
count_equal_05 = (pseudo_labeled_data['Confidence'] == 0.5).sum()
count_equal_05

411

In [101]:
# Save to CSV
output_file = "/content/drive/My Drive/AIIM/Final/SSL data/pseudo_labeled_rf_full_fixed_conf_new.csv"
pseudo_labeled_data.to_csv(output_file, index=False)

### Gradient Boosting - XGBoost

- with Hyperparameter Tuning
- the result is terrible haha
- dump!

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from scipy.stats import uniform, randint

# Combine gene expression and clinical data
def combine_features(gene_data, clinical_data):
    return np.hstack((gene_data, clinical_data))

# Convert labels from True/False to 1/0
y_train = np.array([1 if label else 0 for label in y_train])

# Combine labeled data
X = combine_features(x_train, c_train)
y = y_train

# Split labeled data into a training set and a test set
X_train, X_test, y_train_split, y_test_split = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Prepare the unlabeled data
X_unlabeled = combine_features(x_ul, c_ul)
X_unlabeled = scaler.transform(X_unlabeled)

# Define the hyperparameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(100, 500),               # Number of boosting rounds
    'max_depth': randint(3, 10),                      # Depth of trees
    'learning_rate': uniform(1e-3, 1e-1),              # Learning rate
    'subsample': uniform(0.6, 0.2),                   # Subsample ratio
    'colsample_bytree': uniform(0.6, 0.2),            # Column sampling
    'scale_pos_weight': uniform(1, 10),               # Class imbalance weight
}

# Initialize the XGBoost model
model = XGBClassifier(
    # use_label_encoder=False,
    eval_metric="logloss"
)

# Set up RandomizedSearchCV with cross-validation
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=50,  # Number of parameter settings to try
    scoring='accuracy',
    cv=3,        # 3-fold cross-validation
    random_state=42,
    verbose=2
)

# Fit the RandomizedSearchCV to the training data
random_search.fit(X_train, y_train_split)

# Get the best parameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Output the best parameters
print("Best parameters found: ", best_params)

# Self-training
confidence_threshold = 0.9
X_labeled = X_train.copy()
y_labeled = y_train_split.copy()
confidence_labeled = np.ones(len(y_labeled))  # Confidence for labeled data is 1
pseudo_labels = []
confidence_scores_pseudo = []

total_added = 0  # Counter to track total number of pseudo-labeled data added

while True:
    print("Starting new iteration...")

    # Train the best model on the labeled dataset
    best_model.fit(X_labeled, y_labeled)

    # Predict probabilities on the unlabeled dataset
    probs = best_model.predict_proba(X_unlabeled)
    pseudo_labels = np.argmax(probs, axis=1)  # Predicted labels
    confidence_scores = np.max(probs, axis=1)  # Max probabilities

    # Select confident predictions
    confident_indices = np.where(confidence_scores >= confidence_threshold)[0]
    if len(confident_indices) == 0:
        print("No confident predictions left. Stopping...")
        break

    # Add confident predictions to the labeled dataset
    X_labeled = np.vstack((X_labeled, X_unlabeled[confident_indices]))
    y_labeled = np.hstack((y_labeled, pseudo_labels[confident_indices]))
    confidence_labeled = np.hstack((confidence_labeled, confidence_scores[confident_indices]))
    X_unlabeled = np.delete(X_unlabeled, confident_indices, axis=0)

    total_added += len(confident_indices)  # Update the total added counter

    print(f"Added {len(confident_indices)} pseudo-labeled samples.")

# Output the total number of pseudo-labeled data added
print(f"Total pseudo-labeled data added: {total_added}")

# Combine original and pseudo-labeled data
labeled_data = pd.DataFrame(np.hstack((X_train, y_train_split.reshape(-1, 1), np.ones((len(y_train_split), 1)))),
                            columns=[*gene_name, *clinical_feature, 'Label', 'Confidence'])

pseudo_labeled_data = pd.DataFrame(
    np.hstack((X_labeled[len(X_train):], y_labeled[len(X_train):].reshape(-1, 1), confidence_labeled[len(X_train):].reshape(-1, 1))),
    columns=[*gene_name, *clinical_feature, 'Label', 'Confidence']
)

# Ensure labels are stored as 0/1
labeled_data['Label'] = labeled_data['Label'].astype(int)
pseudo_labeled_data['Label'] = pseudo_labeled_data['Label'].astype(int)

combined_data = pd.concat([labeled_data, pseudo_labeled_data], ignore_index=True)

# Save to CSV
output_file = "/content/drive/My Drive/AIIM/Final/combined_labeled_data_xgboost_tuned.csv"
combined_data.to_csv(output_file, index=False)
print(f"Combined labeled data saved to {output_file}")


Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END colsample_bytree=0.6749080237694725, learning_rate=0.09607143064099162, max_depth=5, n_estimators=171, scale_pos_weight=6.986584841970366, subsample=0.6312037280884872; total time=   0.5s
[CV] END colsample_bytree=0.6749080237694725, learning_rate=0.09607143064099162, max_depth=5, n_estimators=171, scale_pos_weight=6.986584841970366, subsample=0.6312037280884872; total time=   1.5s
[CV] END colsample_bytree=0.6749080237694725, learning_rate=0.09607143064099162, max_depth=5, n_estimators=171, scale_pos_weight=6.986584841970366, subsample=0.6312037280884872; total time=   3.2s
[CV] END colsample_bytree=0.6311989040672406, learning_rate=0.006808361216819946, max_depth=7, n_estimators=199, scale_pos_weight=2.428668179219408, subsample=0.7301776945897706; total time=   0.8s
[CV] END colsample_bytree=0.6311989040672406, learning_rate=0.006808361216819946, max_depth=7, n_estimators=199, scale_pos_weight=2.428668179219408, 

### other stuff

In [None]:

# # Self-training
# confidence_threshold = 0.9

# X_labeled = X_train.copy()
# X_labeled = np.hstack((np.arange(1, X_labeled.shape[0] + 1).reshape(-1, 1), X_labeled)) # add the row number as first col (start from 1)
# y_labeled = y_train_split.copy()

# X_unlabeled = combine_features(x_ul, c_ul)
# X_unlabeled = np.hstack((np.arange(X_labeled.shape[0] + 1, X_unlabeled.shape[0] + X_labeled.shape[0] + 1).reshape(-1, 1), X_unlabeled)) # add the row number as first col (following labeled data)

# confidence_scores_labeled = np.ones(len(y_labeled))  # Confidence for labeled data is 1
# pseudo_labels = []
# confidence_scores_pseudo = []

# total_added = 0  # Counter to track total number of pseudo-labeled data added

# while True:
#     print("Starting new iteration...")

#     # Train the model on the labeled dataset
#     base_model.fit(X_labeled, y_labeled)

#     # Predict probabilities on the unlabeled dataset
#     probs = base_model.predict_proba(X_unlabeled)
#     pseudo_labels = np.argmax(probs, axis=1)  # Predicted labels
#     confidence_scores = np.max(probs, axis=1)  # Max probabilities

#     # Select confident predictions
#     confident_indices = np.where(confidence_scores >= confidence_threshold)[0]
#     if len(confident_indices) == 0:
#         print("No confident predictions left.")

#         #TODO: deal with these data somehow
#         X_labeled = np.vstack((X_labeled, X_unlabeled))
#         y_labeled = np.hstack((y_labeled, pseudo_labels))
#         confidence_scores_labeled = np.hstack((confidence_scores_labeled, confidence_scores))

#         break

#     # Add confident predictions to the labeled dataset
#     X_labeled = np.vstack((X_labeled, X_unlabeled[confident_indices]))
#     y_labeled = np.hstack((y_labeled, pseudo_labels[confident_indices]))
#     confidence_scores_labeled = np.hstack((confidence_scores_labeled, confidence_scores[confident_indices]))
#     X_unlabeled = np.delete(X_unlabeled, confident_indices, axis=0)

#     total_added += len(confident_indices)  # Update the total added counter

#     print(f"Added {len(confident_indices)} pseudo-labeled samples.")

# # Output the total number of pseudo-labeled data added
# print(f"Data: {X_labeled.shape}")

# # Combine original and pseudo-labeled data
# # labeled_data = pd.DataFrame(np.hstack((X_train, y_train_split.reshape(-1, 1), np.ones((len(y_train_split), 1)))),
# #                             columns=[*gene_name, *clinical_feature, 'Label', 'Confidence'])

# # pseudo_labeled_data = pd.DataFrame(
# #     np.hstack((X_labeled[len(X_train):], y_labeled[len(X_train):].reshape(-1, 1), confidence_scores_labeled[len(X_train):].reshape(-1, 1))),
# #     columns=['ID', *gene_name, *clinical_feature, 'Label', 'Confidence']
# # )

# combined_data = pd.DataFrame(
#     np.hstack((X_labeled, y_labeled.reshape(-1, 1), confidence_scores_labeled.reshape(-1, 1))),
#     columns=['ID', *gene_name, *clinical_feature, 'Label', 'Confidence']
# )
# print('Combined Data:', combined_data.shape)

# # Sort data by 'ID' column
# combined_data = combined_data.sort_values(by='ID')
# # Delete the 'ID' column
# combined_data = combined_data.drop(columns=['ID'])

# # Ensure labels are stored as 0/1
# labeled_data['Label'] = labeled_data['Label'].astype(int)
# pseudo_labeled_data['Label'] = pseudo_labeled_data['Label'].astype(int)

# # Save to CSV
# output_file = "/content/drive/My Drive/AIIM/Final/pseudo_labeled_data_rf_full.csv"
# combined_data.to_csv(output_file, index=False)
# print(f"Combined labeled data saved to {output_file}")