<!-- @format -->

### Implementation

1. split champ data into train and test

### Notes

1. Be aware of the random seed and split ratio settings
2. Make sure the configured number and position (tumor first or normal first) of the tumor and normal samples match the input data
3. It takes around 5 min to finish breast cancer data
4. (PLEASE change the paths yourself)

### Input Columns

1. `Unnamed: 0` - first column of the input CSV; the code reads it as the feature names (e.g. `cg00000029`)
   > every remaining column is treated as one sample

### Output File

1. all_beta_normalized_train.csv (training set)
2. all_beta_normalized_test.csv (testing set)

### Parameters

1. `seed` - random seed for the split; change it if you want multiple different results
2. `test_ratio` - the fraction of the champ data held out as the test set (the rest becomes the training set)
3. `isTumorFirst` - Ensure that tumor samples or normal samples are in the first n rows of the data set
4. `tumorNumber` - the total number of tumor data
5. `champDataPath` - path of input data file (PLEASE change the path yourself)
6. `outputTrainDataPath` - path of output train data file
7. `outputTestDataPath` - path of output test data file


In [1]:
import os
from collections import Counter

import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# --- Configuration: every value a reader may need to tune lives in this cell ---
test_ratio = 0.2  # fraction of the samples held out for the test set
seed = 42  # random_state of the split; change it to draw a different split
isTumorFirst = False  # True when the tumor samples are the leading samples of the input
tumorNumber = 50  # number of tumor samples in the input
normalNumber = 50  # number of normal samples in the input
cancer_type = "rectal"
data_source = "GSE240324_nc"

# The folder names below encode the split as whole percentages, so the ratio
# has to be a proper fraction.
if not 0 < test_ratio < 1:
    raise ValueError(f"test_ratio must be strictly between 0 and 1, got {test_ratio!r}")

# round() instead of int(): int() truncates, and float noise such as
# 0.29 * 100 == 28.999999999999996 would otherwise name the folder "test28".
# Deriving the train percentage from the test percentage keeps the two
# folder names summing to exactly 100.
test_percent = round(test_ratio * 100)
train_percent = 100 - test_percent

champDataPath = f"../../{cancer_type}/champ_result/{data_source}/all_beta_normalized.csv"
result_dir = f"../../{cancer_type}/result/{data_source}"
outputTrainDataPath = f"{result_dir}/train{train_percent}/all_beta_normalized_train.csv"
outputTestDataPath = f"{result_dir}/test{test_percent}/all_beta_normalized_test.csv"

# Create the output folders from the paths themselves so the directory names
# can never drift away from the file paths.
for output_path in (outputTestDataPath, outputTrainDataPath):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

In [3]:
# Load the input matrix: the first column supplies the feature names and every
# remaining column is one sample.
df = pd.read_csv(champDataPath)
feature_name = df.iloc[:, 0].tolist()
X = df.iloc[:, 1:].T  # transpose so that each row is one sample

# Labels are assigned purely by column position (1 = tumor, 0 = normal), so a
# wrong count silently mislabels samples. Check the counts before using them.
sample_count = df.shape[1] - 1
leading_count = tumorNumber if isTumorFirst else normalNumber
if not 0 < leading_count < sample_count:
    raise ValueError(
        f"The leading group size ({leading_count}) must be between 1 and "
        f"{sample_count - 1} for an input with {sample_count} samples; "
        "check tumorNumber / normalNumber / isTumorFirst."
    )
if tumorNumber + normalNumber != sample_count:
    # Only the leading group's count drives the labels, so this is a warning
    # rather than an error.
    print(
        f"WARNING: tumorNumber + normalNumber = {tumorNumber + normalNumber}, "
        f"but the input has {sample_count} samples."
    )

if isTumorFirst:
    y = [1 if i < tumorNumber else 0 for i in range(sample_count)]
else:
    y = [0 if i < normalNumber else 1 for i in range(sample_count)]

In [4]:
# Sanity check: number of labels built above (one per sample column)
len(y)

100

In [5]:
# Hold out `test_ratio` of the samples; `seed` makes the shuffle reproducible.
# NOTE(review): the split is not stratified (no `stratify=y`), so the class
# balance of the two sets can drift apart, as the counts printed below show.
# Stratifying would change which samples land in each set - confirm before enabling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_ratio, random_state=seed
)

# Report the size of each split ...
print(f"訓練集樣本數量： {len(X_train)}")
print(f"測試集樣本數量： {len(X_test)}")

# ... and how many samples of each class (0 / 1) ended up in it.
train_class_distribution = Counter(y_train)
val_class_distribution = Counter(y_test)
print("訓練集中各類別樣本數量：")
print(train_class_distribution)
print("測試集中各類別樣本數量：")
print(val_class_distribution)

訓練集樣本數量： 80
測試集樣本數量： 20
訓練集中各類別樣本數量：
Counter({1: 42, 0: 38})
測試集中各類別樣本數量：
Counter({0: 12, 1: 8})


In [6]:
def _to_labelled_frame(samples, labels):
    """Return a features-by-samples frame whose column headers are the class labels.

    The work is done on a shallow copy so the split frame passed in keeps its
    own axes. Relabelling and sorting the split frame in place made this cell
    non-idempotent: a second run attached the unsorted `labels` to rows that
    the first run had already sorted, mislabelling the samples.
    """
    labelled = samples.copy(deep=False)  # new axes, data shared with `samples`
    labelled.columns = feature_name
    labelled.index = labels
    # Group the samples by class (label 0 first, then label 1), then transpose
    # back to the features-by-samples layout.
    return labelled.sort_index().T


train_df = _to_labelled_frame(X_train, y_train)
test_df = _to_labelled_frame(X_test, y_test)

train_df

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9
cg00000029,0.685549,0.791333,0.696890,0.623362,0.698149,0.710291,0.665143,0.696180,0.709970,0.630399,...,0.743897,0.761919,0.627261,0.652275,0.597384,0.555608,0.591851,0.600982,0.692012,0.726591
cg00000109,0.964831,0.975244,0.961848,0.962704,0.975189,0.963382,0.944664,0.965548,0.960843,0.964780,...,0.972724,0.948947,0.960394,0.970841,0.963756,0.952863,0.951014,0.975419,0.977375,0.971696
cg00000155,0.986728,0.987037,0.983501,0.981999,0.982945,0.983214,0.992226,0.984224,0.985806,0.986536,...,0.982742,0.989455,0.981693,0.986307,0.987674,0.981094,0.985012,0.985369,0.987768,0.978974
cg00000158,0.977880,0.986635,0.988113,0.992131,0.988678,0.984668,0.985334,0.989487,0.980792,0.995013,...,0.982665,0.982576,0.980850,0.989491,0.984987,0.981756,0.975262,0.992268,0.978997,0.993653
cg00000165,0.184111,0.155440,0.177690,0.170676,0.308850,0.227477,0.132275,0.127454,0.116430,0.145020,...,0.122764,0.163392,0.111906,0.168365,0.131222,0.130385,0.106051,0.155438,0.170329,0.171089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cg27666046,0.378769,0.360621,0.400223,0.496096,0.388571,0.421055,0.403705,0.419459,0.334062,0.444562,...,0.389395,0.441076,0.504492,0.346711,0.487945,0.499030,0.513211,0.445986,0.434811,0.381477
cg27666049,0.729204,0.795024,0.516987,0.669781,0.695430,0.558212,0.781682,0.653849,0.881698,0.771889,...,0.826163,0.740019,0.748155,0.614330,0.753090,0.728242,0.803610,0.742983,0.875664,0.814265
cg27666060,0.836714,0.870768,0.740984,0.869529,0.827582,0.701222,0.791696,0.834377,0.837241,0.884537,...,0.913279,0.835574,0.825491,0.762190,0.859499,0.860822,0.903881,0.844045,0.922338,0.786324
cg27666108,0.303649,0.362043,0.154818,0.222826,0.197095,0.166896,0.295587,0.283316,0.303922,0.234218,...,0.335193,0.275901,0.243597,0.239433,0.150168,0.214529,0.205401,0.289420,0.294389,0.214488


In [7]:
# Persist both splits as CSV at the output locations chosen in the configuration cell
for split_frame, destination in (
    (train_df, outputTrainDataPath),
    (test_df, outputTestDataPath),
):
    split_frame.to_csv(destination)