# TCGA Model training
---

Experiments in training models on the TCGA dataset from the Pan-Cancer paper (https://www.ncbi.nlm.nih.gov/pubmed/29625048), which has been preprocessed into a single, clean dataset.

The Cancer Genome Atlas (TCGA), a landmark cancer genomics program, molecularly characterized over 20,000 primary cancer and matched normal samples spanning 33 cancer types. This joint effort between the National Cancer Institute and the National Human Genome Research Institute began in 2006, bringing together researchers from diverse disciplines and multiple institutions.

## Importing the necessary packages

In [None]:
import os                                  # os handles directory/workspace changes
import torch                               # PyTorch to create and apply deep learning models
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, log_loss, roc_auc_score
import sys

In [None]:
# Path to the dataset files. The trailing slash matters: the CSV path used
# when loading the data is built by plain f-string concatenation.
data_path = 'cleaned/'

In [None]:
import modin.pandas as pd                  # Optimized distributed version of Pandas

Allow pandas to show more columns and rows:

In [None]:
# Raise the display limits to 1000 columns / 1000 rows so wide or long frames
# are not truncated in cell output.
# NOTE(review): `pd` is the modin.pandas alias, while the data is loaded with
# plain `pandas` — confirm these options reach the frames that get displayed.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

Set the random seed for reproducibility:

In [None]:
import numpy as np
import pandas
# Seed NumPy's global RNG. The scikit-learn calls later in the notebook pass
# random_state=42 themselves.
# NOTE(review): PyTorch is imported but not seeded here — add a torch seed if
# torch randomness is ever used.
np.random.seed(42)

## Loading the data

In [None]:
# Read the normalized TCGA table with plain pandas (not the modin `pd` alias)
tcga_df = pandas.read_csv(f'{data_path}normalized/tcga.csv')
# Preview the first rows
tcga_df.head()

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts, memory use)
tcga_df.info()

In [None]:
# Count rows per participant to see whether any participant appears more than once
tcga_df.participant_id.value_counts()

In [None]:
# List the dtype of every column as loaded
tcga_df.dtypes

Remove the original string ID column and use the numeric one instead:

In [None]:
# Drop the string participant identifier. `columns=` already selects the axis,
# so no separate `axis` argument is passed.
tcga_df = tcga_df.drop(columns=['participant_id'])
# 'Unnamed: 0' (the name pandas gives a header-less CSV column) becomes the
# numeric sample identifier
tcga_df = tcga_df.rename(columns={'Unnamed: 0': 'sample_id'})
tcga_df.head()

Convert the label to a numeric format:

In [None]:
# Samples per tumor type, before the labels are encoded
tcga_df.tumor_type_label.value_counts()

In [None]:
# Swap the tumor-type strings for integer codes; label_array keeps the
# original labels, indexed by their code
tcga_df['tumor_type_label'], label_array = pandas.factorize(tcga_df['tumor_type_label'])

# Reverse lookup: integer code -> original string label
label_dict = dict(enumerate(label_array))

# Per-class sample counts after encoding
tcga_df['tumor_type_label'].value_counts()


In [None]:
# Display the integer code -> original label mapping
label_dict

In [None]:
# Re-check the column dtypes now that the label column holds integer codes
tcga_df.dtypes

Convert to a PyTorch tensor (first encoding any remaining string columns as integers):

In [None]:
# Integer-encode every remaining string (object-dtype) column so the whole
# frame can be cast to a single float dtype afterwards. pandas.factorize
# numbers the values in order of first appearance and, by default, encodes
# missing values as -1.
for col in tcga_df.select_dtypes(include='object').columns:
    tcga_df[col] = pandas.factorize(tcga_df[col])[0]

In [None]:
# Cast every column of the frame (the encoded label included) to one float
# dtype so it converts to a single homogeneous array/tensor
tcga_df = tcga_df.astype(np.float32)  # or np.float64 if preferred


In [None]:
# Wrap the frame's NumPy representation in a tensor; torch.from_numpy shares
# memory with the array it is given instead of copying it
tcga_tsr = torch.from_numpy(tcga_df.to_numpy())
# Display the resulting tensor
tcga_tsr

Create a dataset:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD

# Step 1: Filter out low-frequency classes (fewer than `min_class_samples`
# samples) so every class has members to spread across the stratified splits
min_class_samples = 10
class_counts = tcga_df['tumor_type_label'].value_counts()
valid_classes = class_counts[class_counts >= min_class_samples].index.tolist()
filtered_df = tcga_df[tcga_df['tumor_type_label'].isin(valid_classes)].copy()

# Step 2: Re-encode the surviving labels as contiguous codes 0..K-1
# NOTE(review): this rebinds `label_dict`. From here on it maps a new code to
# the previous numeric code, no longer to the tumor-type string produced by
# the first label encoding — confirm nothing downstream needs the names.
filtered_df['tumor_type_label'], label_array = pandas.factorize(filtered_df['tumor_type_label'])
label_dict = dict(enumerate(label_array))

# Step 3: Convert object columns to numeric
for col in filtered_df.select_dtypes(include='object').columns:
    filtered_df[col] = pandas.factorize(filtered_df[col])[0]

# Step 4: Convert to float32
filtered_df = filtered_df.astype(np.float32)

# Step 5: Extract features and labels
# NOTE(review): only the label column is removed, so any identifier column
# still in the frame (e.g. `sample_id`) is used as a feature — confirm intended.
X_all = filtered_df.drop(columns=['tumor_type_label']).to_numpy()
y_all = filtered_df['tumor_type_label'].to_numpy().astype(int)

# Step 6: Stratified split into train/cal/test.
# 20% is held out for testing; 30% of the remaining 80% (24% overall) goes to
# calibration, leaving 56% for training.
# NOTE(review): a 70/10/20 split would need test_size=0.125 in the second
# call — confirm which proportions are intended.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, stratify=y_all)

X_train, X_cal, y_temp_train, y_temp_cal = train_test_split(
    X_temp, y_temp, test_size=0.3, random_state=42, stratify=y_temp)

# Step 7: Re-encode training labels and align others to the same mapping
y_train, label_array = pandas.factorize(y_temp_train)
label_dict = dict(enumerate(label_array))
label_map = {v: i for i, v in enumerate(label_array)}

# Calibration labels are looked up strictly: a class missing from the
# training split raises KeyError here
y_cal = np.array([label_map[y] for y in y_temp_cal])

# Test rows whose class never occurs in training are dropped. The same boolean
# mask filters X_test and y_test, so each feature row keeps its own label.
test_seen = np.array([y in label_map for y in y_test], dtype=bool)
X_test = X_test[test_seen]
y_test = np.array([label_map[y] for y in y_test[test_seen]])

# Step 8: Apply Truncated SVD to reduce dimensionality.
# The projection is fitted on the training split only and then applied
# unchanged to the calibration and test splits.
svd = TruncatedSVD(n_components=10, random_state=42)
X_train_reduced = svd.fit_transform(X_train)
X_cal_reduced = svd.transform(X_cal)
X_test_reduced = svd.transform(X_test)


In [None]:
def get_batches(X, y, batch_size):
    """Yield consecutive mini-batches of features and labels as tensors.

    X and y are sliced in lockstep into windows of ``batch_size`` items, so
    the final window may be shorter than the rest. Each feature batch is
    cast to float32 and each label batch to int64.

    Args:
        X: Sliceable sequence of feature rows.
        y: Sliceable sequence of labels, indexed the same way as X.
        batch_size: Number of items per batch (step between windows).

    Yields:
        Tuples ``(X_batch, y_batch)`` of torch tensors.
    """
    for start in range(0, len(X), batch_size):
        window = slice(start, start + batch_size)
        features = torch.tensor(X[window]).float()
        labels = torch.tensor(y[window]).long()
        yield features, labels


## Training models

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis


In [None]:
# Baseline: LDA fitted on the full, unreduced feature matrix
lda_model = LinearDiscriminantAnalysis().fit(X_train, y_train)

# Evaluate hard predictions on the held-out test set; the class
# probabilities are computed too and kept in y_proba
y_proba = lda_model.predict_proba(X_test)
y_pred = lda_model.predict(X_test)
lda_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"LDA → Accuracy: {lda_acc:.4f}")

# Refit the same estimator object on the SVD-reduced features; lda_model is
# left in this reduced-feature state once the cell finishes
y_pred = lda_model.fit(X_train_reduced, y_train).predict(X_test_reduced)
lda_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"LDA with Dimensionality Reduction → Accuracy: {lda_acc:.4f}")

In [None]:
# Baseline: QDA fitted on the full, unreduced feature matrix
qda_model = QuadraticDiscriminantAnalysis().fit(X_train, y_train)

# Evaluate hard predictions on the held-out test set; the class
# probabilities are computed too and kept in y_proba
y_proba = qda_model.predict_proba(X_test)
y_pred = qda_model.predict(X_test)
qda_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"QDA → Accuracy: {qda_acc:.4f}")

# Refit the same estimator object on the SVD-reduced features; qda_model is
# left in this reduced-feature state once the cell finishes
y_pred = qda_model.fit(X_train_reduced, y_train).predict(X_test_reduced)
qda_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"QDA with Dimensionality Reduction → Accuracy: {qda_acc:.4f}")

## Conformal prediction

In [None]:
from mapie.classification import SplitConformalClassifier
from mapie.metrics.classification import classification_coverage_score, classification_mean_width_score


In [None]:
# Tolerated miscoverage; the wrapper below is asked for confidence level
# 1 - alpha. This value is reused by the QDA conformal cell.
alpha = 0.01

# Create SplitConformalClassifier wrapper
# prefit=True hands over lda_model as already trained. Its most recent fit
# (in the LDA training cell) used the SVD-reduced features, so only the
# *_reduced matrices are passed to it from here on.
mapie_lda = SplitConformalClassifier(
    estimator=lda_model,
    conformity_score="aps",
    confidence_level=1 - alpha, 
    prefit=True,
    random_state=42
)

# Calibrate using reduced calibration set
mapie_lda.conformalize(X_cal_reduced, y_cal)

# Predict conformal prediction sets on test set
# The first returned value is discarded (presumably the point predictions —
# confirm against the MAPIE version in use); only the sets are kept.
_, y_pset_lda = mapie_lda.predict_set(X_test_reduced)

# Evaluate coverage and prediction set width
coverage_lda = classification_coverage_score(y_test, y_pset_lda)
width_lda = classification_mean_width_score(y_pset_lda)

# .item() unwraps each metric, which requires it to hold exactly one value
print(f"LDA (APS, alpha={alpha:.2f}) → Coverage: {coverage_lda.item():.4f}, Avg Set Size: {width_lda.item():.2f}")



In [None]:
# Create SplitConformalClassifier wrapper
# prefit=True hands over qda_model as already trained. Its most recent fit
# (in the QDA training cell) used the SVD-reduced features, so only the
# *_reduced matrices are passed to it from here on. `alpha` comes from the
# LDA conformal cell.
mapie_qda = SplitConformalClassifier(
    estimator=qda_model,
    conformity_score="aps",
    confidence_level=1 - alpha, 
    prefit=True,
    random_state=42
)

# Calibrate using reduced calibration set
mapie_qda.conformalize(X_cal_reduced, y_cal)

# Predict conformal prediction sets on test set
# The first returned value is discarded (presumably the point predictions —
# confirm against the MAPIE version in use); only the sets are kept.
_, y_pset_qda = mapie_qda.predict_set(X_test_reduced)

# Evaluate coverage and prediction set width
coverage_qda = classification_coverage_score(y_test, y_pset_qda)
width_qda = classification_mean_width_score(y_pset_qda)

# .item() unwraps each metric, which requires it to hold exactly one value
print(f"QDA (APS, alpha={alpha:.2f}) → Coverage: {coverage_qda.item():.4f}, Avg Set Size: {width_qda.item():.2f}")



In [None]:
# Sanity check: which encoded classes actually occur in the test split
test_labels = np.unique(y_test)
print("Number of unique labels in y_test:", len(test_labels))
print("Labels in y_test:", test_labels)


In [None]:
def brier_score(p, y_true):
    """Return the mean multiclass Brier score (lower is better).

    Args:
        p: Array of shape (n_samples, n_classes) with predicted class
            probabilities.
        y_true: Integer class indices, one per sample, indexing the
            columns of p.

    Returns:
        The squared distance between each probability row and the one-hot
        encoding of its true class, summed over classes and averaged over
        samples.
    """
    sample_idx = np.arange(len(y_true))
    # One-hot target with the same shape and dtype as the predictions
    target = np.zeros_like(p)
    target[sample_idx, y_true] = 1
    per_sample = np.square(target - p).sum(axis=1)
    return per_sample.mean()

def spherical_score(p, y_true):
    """Return the mean spherical score of predicted probabilities (higher is better).

    For each sample the score is the probability assigned to the true class
    divided by the Euclidean norm of that sample's probability vector.

    Args:
        p: Array-like of shape (n_samples, n_classes) with predicted class
            probabilities.
        y_true: Array-like of integer class indices, one per sample,
            indexing the columns of p.

    Returns:
        The per-sample scores averaged over all samples.
    """
    # Coerce once so plain lists work with the fancy indexing below
    p = np.asarray(p)
    y_true = np.asarray(y_true, dtype=np.intp)
    norms = np.linalg.norm(p, axis=1)
    # Pick each row's true-class probability in a single vectorized lookup
    true_class_prob = p[np.arange(len(y_true)), y_true]
    return np.mean(true_class_prob / norms)

def log_score(p, y_true):
    """Return the mean negative log-probability of the true class (lower is better).

    Args:
        p: Array of shape (n_samples, n_classes) with predicted class
            probabilities.
        y_true: Integer class indices, one per sample, indexing the
            columns of p.

    Returns:
        The average of ``-log(p_true + 1e-15)`` over all samples.
    """
    sample_idx = np.arange(len(y_true))
    true_class_prob = p[sample_idx, y_true]
    # The small offset keeps the logarithm finite when a probability is 0
    log_probs = np.log(true_class_prob + 1e-15)
    return -np.mean(log_probs)

### LDA: scoring rules on the SVD-reduced test set
probs_lda = lda_model.predict_proba(X_test_reduced)
logloss_lda, brier_lda, spherical_lda = (
    score(probs_lda, y_test) for score in (log_score, brier_score, spherical_score)
)

print(
    "\nLDA Proper Scoring Rules:",
    f"Log Loss:        {logloss_lda:.4f}",
    f"Brier Score:     {brier_lda:.4f}",
    f"Spherical Score: {spherical_lda:.4f}",
    sep="\n",
)

### QDA: scoring rules on the SVD-reduced test set
probs_qda = qda_model.predict_proba(X_test_reduced)
logloss_qda, brier_qda, spherical_qda = (
    score(probs_qda, y_test) for score in (log_score, brier_score, spherical_score)
)

print(
    "\nQDA Proper Scoring Rules:",
    f"Log Loss:        {logloss_qda:.4f}",
    f"Brier Score:     {brier_qda:.4f}",
    f"Spherical Score: {spherical_qda:.4f}",
    sep="\n",
)
