In [1]:
# Then import the required libraries
import importlib
import numpy as np
import os

import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
seed = 0
np.random.seed(seed)

df = pd.read_csv('gene_level_methylation.csv')

print("Original columns:", df.columns.tolist()[:10], "...")

# Remove the "SingleValueType" column if it exists
if "SingleValueType" in df.columns:
    df = df.drop(columns=["SingleValueType"])
    print("Column 'SingleValueType' has been removed successfully.")
else:
    print("Column 'SingleValueType' does not exist in the DataFrame.")

# Sort columns and transpose
df = df.reindex(sorted(df.columns), axis=1)
df = df.T

# Set first row (originally "GeneSymbol") as new column names
df.columns = df.iloc[0]  # first row becomes column headers
df = df.drop(df.index[0])  # drop the first row

print("CNV Matrix After Transpose and Header Fix:", df.shape)
print("\nDataFrame preview:")
df.head()

Original columns: ['GeneSymbol', 'SingleValueType', 'TCGA-OL-A66H-01', 'TCGA-3C-AALK-01', 'TCGA-AC-A5EH-01', 'TCGA-EW-A2FW-01', 'TCGA-E9-A1R0-01', 'TCGA-AR-A1AJ-01', 'TCGA-AC-A62Y-01', 'TCGA-E9-A1QZ-01'] ...
Column 'SingleValueType' has been removed successfully.
CNV Matrix After Transpose and Header Fix: (785, 19277)

DataFrame preview:


GeneSymbol,RPS4Y2,RPS4Y1,TBL1Y,NLGN4Y,PRKY,PCDH11Y,TTTY20,GABRE,FTHL17,PORCN,...,EP300,DERL3,PLA2G3,TMPRSS6,MIR1281,GGT3P,UBE2L3,APOL5,SLC5A1,CDC42EP1
TCGA-3C-AAAU-01,0.528746,0.573773,0.440706,0.053398,0.117736,0.050879,0.51045,0.371732,0.877801,0.360481,...,0.0974,0.056637,0.806611,0.868405,0.0974,0.874078,0.31182,0.931256,0.186356,0.017439
TCGA-3C-AALI-01,0.651071,0.509548,0.41667,0.072804,0.806081,0.07175,0.485439,0.378231,0.757559,0.249911,...,0.060027,0.406407,0.78659,0.82704,0.060027,0.84627,0.78181,0.879644,0.899155,0.022864
TCGA-3C-AALJ-01,0.697286,0.56616,0.453496,0.068645,0.782943,0.051312,0.469256,0.324052,0.627746,0.466564,...,0.072913,0.508306,0.787246,0.848229,0.072913,0.809162,0.409583,0.894666,0.738759,0.030299
TCGA-3C-AALK-01,0.580953,0.575181,0.496185,0.327338,0.670718,0.242422,0.593454,0.527467,0.923862,0.408977,...,0.082873,0.569962,0.777228,0.841816,0.082873,0.836196,0.26714,0.937831,0.762713,0.026589
TCGA-4H-AAAK-01,0.474765,0.510691,0.440558,0.060931,0.642323,0.320909,0.454611,0.469725,0.916842,0.461385,...,0.078164,0.202899,0.696572,0.776501,0.078164,0.826303,0.242883,0.942489,0.7204,0.026004


In [3]:
df2=pd.read_csv('BRCA_clinicalMatrix.gz',compression='gzip',sep='\t',index_col=0)
df2.index = pd.Series({x: x.replace('-', '-') for x in df2.index})
print("Clinical Data Before:", df2.shape)
k=df2.columns.get_loc('PAM50Call_RNAseq')
df2=df2[df2.iloc[:,k].isna()==False]
df2.head()

Clinical Data Before: (1247, 202)


Unnamed: 0,AJCC_Stage_nature2012,Age_at_Initial_Pathologic_Diagnosis_nature2012,CN_Clusters_nature2012,Converted_Stage_nature2012,Days_to_Date_of_Last_Contact_nature2012,Days_to_date_of_Death_nature2012,ER_Status_nature2012,Gender_nature2012,HER2_Final_Status_nature2012,Integrated_Clusters_no_exp__nature2012,...,_GENOMIC_ID_TCGA_BRCA_mutation_wustl_gene,_GENOMIC_ID_TCGA_BRCA_miRNA_GA,_GENOMIC_ID_TCGA_BRCA_exp_HiSeqV2_percentile,_GENOMIC_ID_data/public/TCGA/BRCA/miRNA_GA_gene,_GENOMIC_ID_TCGA_BRCA_gistic2thd,_GENOMIC_ID_data/public/TCGA/BRCA/miRNA_HiSeq_gene,_GENOMIC_ID_TCGA_BRCA_G4502A_07_3,_GENOMIC_ID_TCGA_BRCA_exp_HiSeqV2,_GENOMIC_ID_TCGA_BRCA_gistic2,_GENOMIC_ID_TCGA_BRCA_PDMarray
TCGA-A1-A0SB-01,Stage I,70.0,1.0,Stage I,259.0,,Positive,FEMALE,Negative,,...,TCGA-A1-A0SB-01A-11D-A142-09,,a2405d64-34eb-4915-abf7-8530151d5cb0,,TCGA-A1-A0SB-01A-11D-A141-01,TCGA-A1-A0SB-01,,a2405d64-34eb-4915-abf7-8530151d5cb0,TCGA-A1-A0SB-01A-11D-A141-01,
TCGA-A1-A0SD-01,Stage IIA,59.0,2.0,Stage IIA,437.0,,Positive,FEMALE,Negative,,...,TCGA-A1-A0SD-01A-11D-A10Y-09,,15bad71d-3031-413b-9e8d-6426ae5dfbea,,TCGA-A1-A0SD-01A-11D-A111-01,TCGA-A1-A0SD-01,TCGA-A1-A0SD-01A-11R-A115-07,15bad71d-3031-413b-9e8d-6426ae5dfbea,TCGA-A1-A0SD-01A-11D-A111-01,TCGA-A1-A0SD-01
TCGA-A1-A0SE-01,Stage I,56.0,2.0,Stage I,1320.0,,Positive,FEMALE,Negative,,...,TCGA-A1-A0SE-01A-11D-A099-09,TCGA-A1-A0SE-01,a998e0ce-9248-460f-aabc-2dad452a1ff9,TCGA-A1-A0SE-01,TCGA-A1-A0SE-01A-11D-A087-01,,TCGA-A1-A0SE-01A-11R-A084-07,a998e0ce-9248-460f-aabc-2dad452a1ff9,TCGA-A1-A0SE-01A-11D-A087-01,TCGA-A1-A0SE-01
TCGA-A1-A0SF-01,Stage IIA,54.0,3.0,Stage IIA,1463.0,,Positive,FEMALE,Negative,,...,TCGA-A1-A0SF-01A-11D-A142-09,,28089e15-5e2c-4e83-ba6c-62b3cb40e431,,TCGA-A1-A0SF-01A-11D-A141-01,TCGA-A1-A0SF-01,,28089e15-5e2c-4e83-ba6c-62b3cb40e431,TCGA-A1-A0SF-01A-11D-A141-01,
TCGA-A1-A0SG-01,Stage IIB,61.0,4.0,Stage IIB,433.0,,Positive,FEMALE,Negative,,...,TCGA-A1-A0SG-01A-11D-A142-09,,0df6b948-367d-4951-9d98-d3bebabff63e,,TCGA-A1-A0SG-01A-11D-A141-01,TCGA-A1-A0SG-01,,0df6b948-367d-4951-9d98-d3bebabff63e,TCGA-A1-A0SG-01A-11D-A141-01,


In [4]:
commonIndexes=np.sort(list(set(df2.index).intersection(set(df.index)))) 
df=df.loc[commonIndexes] 
df2=df2.loc[commonIndexes] 
print("Data After:", df.shape) 
print("Clinical Data After:", df2.shape) 
print("CNV and Clinical Data corresponds to same Patients: ", False if False in (df.index==df2.index) else True) 
unique_elements, counts_elements = np.unique(df2.iloc[:,k], return_counts=True) 
print("Labels", unique_elements, counts_elements)

Data After: (539, 19277)
Clinical Data After: (539, 202)
CNV and Clinical Data corresponds to same Patients:  True
Labels ['Basal' 'Her2' 'LumA' 'LumB' 'Normal'] [ 86  31 279 126  17]


In [5]:
original_feature_names = df.columns.tolist()
X=np.array(df.values, dtype=float)
X.shape
Y=np.array(df2.values[:,k])
Y.shape

(539,)

In [6]:
from sklearn.model_selection import StratifiedKFold

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

In [7]:
folds = list(skf.split(X, Y))

# Select fold 4
train_idx, test_idx = folds[4]
X_train, X_test = X[train_idx], X[test_idx]
Y_train, Y_test = Y[train_idx], Y[test_idx]

X_train.shape, X_test.shape

((432, 19277), (107, 19277))

In [8]:
# Convert back to DataFrames for clarity and labeled saving
X_train_df = pd.DataFrame(X_train, columns=original_feature_names, index=np.arange(len(X_train)))
X_test_df = pd.DataFrame(X_test, columns=original_feature_names, index=np.arange(len(X_test)))
Y_train_df = pd.DataFrame(Y_train, columns=["Label"], index=np.arange(len(Y_train)))
Y_test_df = pd.DataFrame(Y_test, columns=["Label"], index=np.arange(len(Y_test)))

print("Train label distribution:\n", Y_train_df["Label"].value_counts())
print("Test label distribution:\n", Y_test_df["Label"].value_counts())

os.makedirs("splits", exist_ok=True)

# Save to CSV files
X_train_df.to_csv("Datasets/TCGA Dataset Splits/X_train.csv", index=False)
X_test_df.to_csv("Datasets/TCGA Dataset Splits/X_test.csv", index=False)
Y_train_df.to_csv("Datasets/TCGA Dataset Splits/Y_train.csv", index=False)
Y_test_df.to_csv("Datasets/TCGA Dataset Splits/Y_test.csv", index=False)

print("Train/test CSVs saved successfully under 'TCGA Dataset Splits/' folder.")

Train label distribution:
 Label
LumA      223
LumB      101
Basal      69
Her2       25
Normal     14
Name: count, dtype: int64
Test label distribution:
 Label
LumA      56
LumB      25
Basal     17
Her2       6
Normal     3
Name: count, dtype: int64
Train/test CSVs saved successfully under 'TCGA Dataset Splits/' folder.
