<!-- @format -->

### Implementation

1. split champ data into train and test

### Notes

1. Be aware of the random seed and split ratio settings
2. Make sure the number and position of the tumor and normal samples in the input data match `tumorNumber`, `normalNumber` and `isTumorFirst`
3. It takes around 5 min to finish breast cancer data
4. (PLEASE change the paths yourself to match your environment)

### Input Columns

1. `Unnamed: 0` - id of the sample
   > list of serial number for each sample

### Output File

1. training_data.csv
2. testing_data.csv

### Parameters

1. `seed` - random seed of the split; change it if you want to obtain multiple different results
2. `test_ratio` - the fraction of the champ data held out as the test set (the remainder is used for training)
3. `isTumorFirst` - `True` if the tumor samples come first in the data set (followed by the normal samples); `False` if the normal samples come first
4. `tumorNumber` / `normalNumber` - the total number of tumor / normal samples
5. `champDataPath` - path of input data file (PLEASE change the path yourself)
6. `outputTrainDataPath` - path of output train data file
7. `outputTestDataPath` - path of output test data file


In [1]:
import os
import warnings
from collections import Counter

import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# --- Split configuration: everything a user is expected to tune ---
test_ratio = 0.2  # fraction of the samples held out as the test set
seed = 42  # random_state handed to train_test_split
isTumorFirst = False  # True: tumor samples come first; False: normal samples come first
tumorNumber = 77  # number of tumor samples
normalNumber = 152  # number of normal samples
cancer_type = "rectal"
data_source = "GSE199057"


def _split_percentages(ratio):
    """Return ``(train_percent, test_percent)`` as whole numbers that sum to 100.

    ``round`` is used rather than ``int`` because float products such as
    ``0.29 * 100`` evaluate to ``28.999999999999996``; truncating that would
    name the test folder ``test28`` while the train folder becomes ``train71``.
    """
    test_percent = round(ratio * 100)
    return 100 - test_percent, test_percent


train_percent, test_percent = _split_percentages(test_ratio)
result_root = f"../../{cancer_type}/result/{data_source}"

champDataPath = f"../../{cancer_type}/champ_result/{data_source}/all_beta_normalized.csv"
outputTrainDataPath = f"{result_root}/train{train_percent}/all_beta_normalized_train.csv"
outputTestDataPath = f"{result_root}/test{test_percent}/all_beta_normalized_test.csv"

# Create the output folders up front; they are derived from the output paths
# so the directory names cannot drift away from the file paths.
for output_path in (outputTestDataPath, outputTrainDataPath):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

In [3]:
# Load the beta matrix and keep every second column, starting with column 0.
# NOTE(review): presumably the skipped columns are not needed for the split —
# confirm against the layout of the ChAMP export.
df = pd.read_csv(champDataPath).iloc[:, ::2]

# Column 0 holds the feature names; every remaining column is one sample.
feature_name = df.iloc[:, 0].tolist()
X = df.iloc[:, 1:].T  # transpose so that rows are samples and columns are features

n_samples = X.shape[0]

# Labels are assigned purely by position: the first `leading_count` samples get
# one class and all remaining samples the other (1 = tumor, 0 = normal).
leading_count = tumorNumber if isTumorFirst else normalNumber
leading_label, trailing_label = (1, 0) if isTumorFirst else (0, 1)

if not 0 <= leading_count <= n_samples:
    raise ValueError(
        f"Configured leading class size ({leading_count}) is outside the valid "
        f"range 0..{n_samples} for the loaded data; check tumorNumber/normalNumber."
    )
if tumorNumber + normalNumber != n_samples:
    # Not fatal (only one of the two counts is used for labelling), but a
    # mismatch usually means the configuration does not describe this file.
    warnings.warn(
        f"tumorNumber + normalNumber = {tumorNumber + normalNumber}, but the data "
        f"contains {n_samples} samples; labels may be assigned incorrectly."
    )

y = [leading_label if i < leading_count else trailing_label for i in range(n_samples)]

In [4]:
# Number of generated labels (one per column that follows the ID column).
len(y)

229

In [5]:
# Randomly split samples and labels; `seed` fixes the shuffle so the split is
# reproducible, and `test_ratio` is the share of samples held out for testing.
# NOTE(review): the split is not stratified, so the class proportions of the
# two subsets may differ from the full data set — consider `stratify=y` if
# matching proportions matter.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_ratio, random_state=seed
)

# Report subset sizes ("number of training / test samples").
print(f"訓練集樣本數量： {len(X_train)}")
print(f"測試集樣本數量： {len(X_test)}")

# Per-class sample counts of each subset.
train_class_distribution = Counter(y_train)
val_class_distribution = Counter(y_test)  # counts for the *test* subset
print("訓練集中各類別樣本數量：")
print(train_class_distribution)
print("測試集中各類別樣本數量：")
print(val_class_distribution)

訓練集樣本數量： 183
測試集樣本數量： 46
訓練集中各類別樣本數量：
Counter({0: 122, 1: 61})
測試集中各類別樣本數量：
Counter({0: 30, 1: 16})


In [6]:
def _labelled_feature_table(samples, labels, feature_names):
    """Return a features-by-samples table whose columns are class labels, sorted by label.

    Parameters:
        samples: DataFrame with one row per sample and one column per feature.
        labels: class label of each row of ``samples``, in the same order.
        feature_names: column names to assign, one per feature.

    The work is done on a copy, so ``samples`` itself is never relabelled or
    reordered. This keeps the cell safe to re-run: assigning the unsorted
    ``labels`` to an already-sorted frame would silently scramble the labels.
    """
    labelled = samples.copy()
    labelled.columns = feature_names
    labelled.index = labels
    # Sort rows by label so each class forms one contiguous group, then
    # transpose back to the features-by-samples layout.
    return labelled.sort_index().T


train_df = _labelled_feature_table(X_train, y_train, feature_name)
test_df = _labelled_feature_table(X_test, y_test, feature_name)

train_df

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9
cg07881041,0.949957,0.952146,0.904001,0.954637,0.928165,0.932608,0.924301,0.923104,0.927477,0.935669,...,0.749822,0.940172,0.805832,0.756693,0.945328,0.923933,0.860106,0.928845,0.960072,0.069006
cg03513874,0.937071,0.889363,0.843358,0.945049,0.913755,0.857649,0.919933,0.909622,0.838915,0.935467,...,0.773122,0.759051,0.809406,0.947666,0.897360,0.591877,0.960830,0.820967,0.902224,0.062067
cg05451842,0.016502,0.006419,0.016312,0.022076,0.011215,0.015107,0.049540,0.006533,0.016118,0.038684,...,0.015439,0.012183,0.028709,0.011260,0.029149,0.013135,0.006605,0.018643,0.013003,0.006081
cg14797042,0.962613,0.985027,0.964741,0.958186,0.967807,0.982290,0.965621,0.974083,0.958924,0.977678,...,0.793072,0.965707,0.777952,0.882774,0.934462,0.324543,0.977903,0.966190,0.971721,0.956601
cg09838562,0.006557,0.007492,0.009818,0.002810,0.011285,0.003184,0.014299,0.005508,0.009941,0.011887,...,0.005610,0.034703,0.028405,0.012297,0.011081,0.005421,0.008825,0.004700,0.011022,0.003067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cg19812938,0.894460,0.878976,0.881958,0.896522,0.890799,0.872061,0.901984,0.908596,0.889975,0.878928,...,0.910865,0.877215,0.886959,0.901002,0.894252,0.885177,0.909333,0.902665,0.908721,0.837224
cg06272054,0.008978,0.004395,0.004080,0.004744,0.005768,0.008339,0.006556,0.005311,0.012396,0.003214,...,0.007236,0.008636,0.005029,0.005122,0.009000,0.004814,0.004500,0.008904,0.003809,0.013886
cg07255356,0.011516,0.009376,0.014099,0.016191,0.008926,0.013146,0.014820,0.003522,0.004258,0.021594,...,0.009114,0.012956,0.016731,0.012227,0.010212,0.015228,0.003466,0.005697,0.001893,0.023454
cg24220897,0.936086,0.921546,0.929182,0.897121,0.935147,0.924128,0.920541,0.937140,0.926174,0.920902,...,0.942277,0.940429,0.927275,0.957908,0.930148,0.878596,0.825669,0.917967,0.944465,0.953320


In [7]:
# Write each split to its configured CSV destination (training first, then testing).
for split_frame, destination in (
    (train_df, outputTrainDataPath),
    (test_df, outputTestDataPath),
):
    split_frame.to_csv(destination)