In [1]:
# Import the required libraries
import importlib
import numpy as np
import os

import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# One fixed seed for the whole notebook: it seeds NumPy's global RNG here
# and is also handed to the cross-validation splitter further down.
seed = 0
np.random.seed(seed)

# Gene-level methylation table; the first CSV column becomes the row index,
# giving genes as rows and GSM sample IDs as columns.
df = pd.read_csv(
    "gse84207_gene_methylation_PURIFIED_DefaultPurity.csv",
    index_col=0,
)
df.head()

Unnamed: 0,GSM2229021,GSM2229022,GSM2229023,GSM2229024,GSM2229026,GSM2229027,GSM2229028,GSM2229029,GSM2229030,GSM2229032,...,GSM2229341,GSM2229342,GSM2229343,GSM2229344,GSM2229345,GSM2229346,GSM2229347,GSM2229348,GSM2229349,GSM2229350
RBL2,0.112403,0.10106,0.095218,0.09086,0.087599,0.110299,0.117564,0.125427,0.086942,0.115448,...,0.111565,0.135917,0.10556,0.106532,0.112789,0.107964,0.104233,0.100568,0.082234,0.109834
FNDC3B,0.735007,0.790631,0.753667,0.740216,0.691011,0.692239,0.650947,0.743113,0.771329,0.702595,...,0.706686,0.649273,0.707946,0.666453,0.614198,0.705967,0.764238,0.71568,0.604819,0.708434
VDAC3,0.166826,0.168968,0.197198,0.134869,0.17741,0.156894,0.164079,0.228601,0.143468,0.195091,...,0.177132,0.189562,0.17204,0.172447,0.165003,0.189963,0.144377,0.193545,0.164592,0.219432
ACTN1,0.51444,0.479087,0.645187,0.517086,0.553116,0.440605,0.498123,0.481203,0.523531,0.486245,...,0.540677,0.424241,0.550939,0.390804,0.611046,0.555121,0.635255,0.470427,0.435847,0.536374
SFRP1,0.544643,0.366042,0.34726,0.517022,0.527043,0.503843,0.416592,0.479446,0.227365,0.430428,...,0.440539,0.352329,0.419153,0.399265,0.386846,0.376672,0.353858,0.333272,0.437444,0.357898


In [3]:
# Sample metadata, re-indexed by GSM accession so rows can be looked up
# by the same IDs that label the columns of the methylation table.
df2 = pd.read_csv("GSE84207_metadata_full_filtered.csv").set_index("GSM_ID")
df2.head()

Unnamed: 0_level_0,title,geo_accession,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,molecule_ch1,...,data_row_count,gender,er status,pam50,cohort,PAM50,Gender,ER_Status,Cohort,Tumor_Purity
GSM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM2229021,Breast_tumor_1302,GSM2229021,Public on Nov 15 2017,Jul 08 2016,Nov 15 2017,genomic,1,Fresh_frozen_breast_tumor_1302,Homo sapiens,genomic DNA,...,436506,Female,pos,LumA,OSL2,LumA,Female,pos,OSL2,0.620834
GSM2229022,Breast_tumor_1309,GSM2229022,Public on Nov 15 2017,Jul 08 2016,Nov 15 2017,genomic,1,Fresh_frozen_breast_tumor_1309,Homo sapiens,genomic DNA,...,436506,Female,pos,LumB,OSL2,LumB,Female,pos,OSL2,0.572767
GSM2229023,Breast_tumor_1094,GSM2229023,Public on Nov 15 2017,Jul 08 2016,Nov 15 2017,genomic,1,Fresh_frozen_breast_tumor_1094,Homo sapiens,genomic DNA,...,436506,Female,pos,LumA,OSL2,LumA,Female,pos,OSL2,0.549891
GSM2229024,Breast_tumor_1272,GSM2229024,Public on Nov 15 2017,Jul 08 2016,Nov 15 2017,genomic,1,Fresh_frozen_breast_tumor_1272,Homo sapiens,genomic DNA,...,436506,Female,pos,Normal,OSL2,Normal,Female,pos,OSL2,0.75844
GSM2229026,Breast_tumor_1169,GSM2229026,Public on Nov 15 2017,Jul 08 2016,Nov 15 2017,genomic,1,Fresh_frozen_breast_tumor_1169,Homo sapiens,genomic DNA,...,436506,Female,pos,LumB,OSL2,LumB,Female,pos,OSL2,0.617934


In [4]:
# Keep only the samples present in both tables, and put the methylation
# columns and the metadata rows in the same sample order.
common_samples = df.columns.intersection(df2.index)
df, df2 = df[common_samples], df2.loc[common_samples]

# PAM50 subtype label per sample, ordered like the columns of df.
pam50 = df2.loc[df.columns, "PAM50"]

In [5]:
# Feature matrix: transpose so each row is a sample and each column a gene.
X = df.transpose()
# Target vector: the PAM50 subtype of each sample.
y = pam50

In [6]:
# Dimensions of the feature matrix and of the label vector, one per line.
print(X.shape, y.shape, sep="\n")

# Number of samples in each PAM50 subtype.
print(y.value_counts())

# Sanity check: features and labels must list the samples in the same order.
print(X.index.equals(pam50.index))

(272, 21101)
(272,)
PAM50
LumA      120
LumB       63
Her2       37
Basal      34
Normal     18
Name: count, dtype: int64
True


In [7]:
from sklearn.model_selection import StratifiedKFold

# Shuffled, stratified K-fold splitter; random_state reuses the notebook
# seed so the fold assignment is repeatable.
n_splits = 5
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=seed,
)

In [8]:
# Materialise every (train indices, test indices) pair produced by the splitter.
folds = list(skf.split(X, y))

# The fold at position 4 (zero-based) is the one used as the train/test split.
train_idx, test_idx = folds[4]

# Positional row selection of features and labels for each side of the split.
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
Y_train, Y_test = y.iloc[train_idx], y.iloc[test_idx]

In [9]:
# Per-subtype sample counts on each side of the split, to confirm that the
# class proportions are similar in train and test.
train_counts = Y_train.value_counts()
test_counts = Y_test.value_counts()

print("Train label distribution:\n", train_counts)
print("Test label distribution:\n", test_counts)

Train label distribution:
 PAM50
LumA      96
LumB      51
Her2      30
Basal     27
Normal    14
Name: count, dtype: int64
Test label distribution:
 PAM50
LumA      24
LumB      12
Her2       7
Basal      7
Normal     4
Name: count, dtype: int64


In [10]:
# Independent copies of the features; labels become one-column "Label" frames.
X_train_df, X_test_df = X_train.copy(), X_test.copy()
Y_train_df = pd.DataFrame({"Label": Y_train})
Y_test_df = pd.DataFrame({"Label": Y_test})

# Directory that receives the four split files (created if missing).
split_dir = "Datasets/Norway InfiniumPurify Default Dataset Splits"
os.makedirs(split_dir, exist_ok=True)

# index=False leaves the sample IDs out of the files, so the rows of each
# X file and its matching Y file correspond by position only.
for file_name, frame in (
    ("X_train.csv", X_train_df),
    ("X_test.csv", X_test_df),
    ("Y_train.csv", Y_train_df),
    ("Y_test.csv", Y_test_df),
):
    frame.to_csv(f"{split_dir}/{file_name}", index=False)

print("Train/test CSVs saved successfully under 'Datasets/Norway InfiniumPurify Default Dataset Splits/' folder.")

Train/test CSVs saved successfully under 'Datasets/Norway InfiniumPurify Default Dataset Splits/' folder.
