In [29]:
# Then import the required libraries
import importlib
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [30]:
# Single seed for the notebook: it seeds NumPy's global RNG here and is reused
# below as the random_state of the cross-validation splitter.
seed = 0
np.random.seed(seed)

# Gene-level methylation matrix; the first CSV column becomes the row index.
# In the preview below, rows are gene symbols and columns are GSM sample IDs.
df = pd.read_csv("gse84207_gene_methylation_PURIFIED.csv", index_col=0)
df.head()

Unnamed: 0,GSM2229021,GSM2229022,GSM2229023,GSM2229024,GSM2229026,GSM2229027,GSM2229028,GSM2229029,GSM2229030,GSM2229032,...,GSM2229341,GSM2229342,GSM2229343,GSM2229344,GSM2229345,GSM2229346,GSM2229347,GSM2229348,GSM2229349,GSM2229350
RBL2,0.112958,0.09895,0.090991,0.105501,0.089627,0.115214,0.127572,0.110668,0.108231,0.105266,...,0.108596,0.125871,0.101247,0.105406,0.121625,0.099054,0.096605,0.098737,0.085956,0.092465
FNDC3B,0.738122,0.813455,0.786635,0.754429,0.68916,0.70328,0.636826,0.758028,0.79718,0.707669,...,0.721834,0.678143,0.737784,0.667528,0.609387,0.706294,0.766494,0.742786,0.653784,0.733895
VDAC3,0.181599,0.178742,0.203146,0.154954,0.194182,0.170661,0.177092,0.188822,0.171823,0.185784,...,0.185757,0.184165,0.165175,0.184551,0.192282,0.171458,0.133,0.168043,0.175144,0.198729
ACTN1,0.529435,0.496529,0.634629,0.537686,0.555479,0.458291,0.511229,0.4717,0.54808,0.505746,...,0.533624,0.432833,0.517355,0.415232,0.616253,0.563908,0.63806,0.487601,0.454509,0.528607
SFRP1,0.489057,0.31909,0.341291,0.431903,0.47436,0.418367,0.406106,0.383555,0.233501,0.357794,...,0.370972,0.325062,0.344594,0.334357,0.36442,0.320306,0.349684,0.244205,0.379292,0.286073


In [31]:
# Load the per-sample metadata and key it by GSM accession so rows can be
# looked up with the sample IDs used as column names in the methylation matrix.
df2 = pd.read_csv("GSE84207_metadata_full_filtered.csv").set_index("GSM_ID")
df2.head()

Unnamed: 0_level_0,title,geo_accession,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,molecule_ch1,...,data_row_count,gender,er status,pam50,cohort,PAM50,Gender,ER_Status,Cohort,Tumor_Purity
GSM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM2229021,Breast_tumor_1302,GSM2229021,Public on Nov 15 2017,Jul 08 2016,Nov 15 2017,genomic,1,Fresh_frozen_breast_tumor_1302,Homo sapiens,genomic DNA,...,436506,Female,pos,LumA,OSL2,LumA,Female,pos,OSL2,0.620834
GSM2229022,Breast_tumor_1309,GSM2229022,Public on Nov 15 2017,Jul 08 2016,Nov 15 2017,genomic,1,Fresh_frozen_breast_tumor_1309,Homo sapiens,genomic DNA,...,436506,Female,pos,LumB,OSL2,LumB,Female,pos,OSL2,0.572767
GSM2229023,Breast_tumor_1094,GSM2229023,Public on Nov 15 2017,Jul 08 2016,Nov 15 2017,genomic,1,Fresh_frozen_breast_tumor_1094,Homo sapiens,genomic DNA,...,436506,Female,pos,LumA,OSL2,LumA,Female,pos,OSL2,0.549891
GSM2229024,Breast_tumor_1272,GSM2229024,Public on Nov 15 2017,Jul 08 2016,Nov 15 2017,genomic,1,Fresh_frozen_breast_tumor_1272,Homo sapiens,genomic DNA,...,436506,Female,pos,Normal,OSL2,Normal,Female,pos,OSL2,0.75844
GSM2229026,Breast_tumor_1169,GSM2229026,Public on Nov 15 2017,Jul 08 2016,Nov 15 2017,genomic,1,Fresh_frozen_breast_tumor_1169,Homo sapiens,genomic DNA,...,436506,Female,pos,LumB,OSL2,LumB,Female,pos,OSL2,0.617934


In [32]:
# Sample IDs present both as methylation-matrix columns and as metadata rows.
common_samples = df.columns.intersection(df2.index)

# Restrict both tables to those shared samples (metadata rows follow the
# order of common_samples).
df, df2 = df.loc[:, common_samples], df2.loc[common_samples]

# PAM50 subtype label for each retained sample, looked up by the matrix's column IDs.
pam50 = df2["PAM50"].loc[df.columns]

In [33]:
# Features: transpose so rows are samples and columns are genes.
# Labels: the PAM50 subtype of each sample.
X, y = df.transpose(), pam50

In [34]:
# Sanity-check the feature matrix and label vector before any splitting.
print(X.shape)
print(y.shape)

# Class balance of the labels (y is already a Series, so it needs no re-wrapping).
print(y.value_counts())

# Confirm that features and labels are aligned sample-for-sample. The result is
# still printed, but it is also enforced: a mismatch now stops the run instead
# of letting misaligned X / y flow into the train/test split.
samples_aligned = X.index.equals(pam50.index)
print(samples_aligned)
assert samples_aligned, "X rows and PAM50 labels are not indexed by the same samples in the same order"

(272, 21101)
(272,)
PAM50
LumA      120
LumB       63
Her2       37
Basal      34
Normal     18
Name: count, dtype: int64
True


In [35]:
# Stratified k-fold splitter: shuffling with a fixed random_state makes the
# fold assignment reproducible across runs.
# StratifiedKFold is imported with the other dependencies in the first cell.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

In [36]:
# Materialise every (train indices, test indices) pair produced by the splitter.
folds = list(skf.split(X, y))

# Use the fold at position 4 as the single train/test split.
train_idx, test_idx = folds[4]

# Positional row selection for features and labels.
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
Y_train, Y_test = y.iloc[train_idx], y.iloc[test_idx]

In [37]:
# Show the per-class counts in each split to verify the stratification.
for split_name, split_labels in (("Train", Y_train), ("Test", Y_test)):
    print(f"{split_name} label distribution:\n", split_labels.value_counts())

Train label distribution:
 PAM50
LumA      96
LumB      51
Her2      30
Basal     27
Normal    14
Name: count, dtype: int64
Test label distribution:
 PAM50
LumA      24
LumB      12
Her2       7
Basal      7
Normal     4
Name: count, dtype: int64


In [38]:
# Persist the selected train/test split as four CSV files.
# NOTE(review): index=False drops the sample IDs from every file, so rows of
# X_*.csv and Y_*.csv can only be matched by position. Confirm that downstream
# loaders rely on row order before changing this.
split_dir = "Datasets/Norway InfiniumPurify Dataset Splits"
os.makedirs(split_dir, exist_ok=True)

# Feature frames are copied; label Series become single-column "Label" frames.
X_train_df, X_test_df = X_train.copy(), X_test.copy()
Y_train_df = Y_train.to_frame(name="Label")
Y_test_df = Y_test.to_frame(name="Label")

# Save to CSV files
split_frames = {
    "X_train": X_train_df,
    "X_test": X_test_df,
    "Y_train": Y_train_df,
    "Y_test": Y_test_df,
}
for file_stem, frame in split_frames.items():
    frame.to_csv(f"{split_dir}/{file_stem}.csv", index=False)


print(f"Train/test CSVs saved successfully under '{split_dir}/' folder.")

Train/test CSVs saved successfully under 'Datasets/Norway InfiniumPurify Dataset Splits/' folder.
