<!-- @format -->

### Implementation

1. split champ data into train and test

### Notes

1. Be aware of the random seed and split ratio settings
2. Make sure the number and position of the tumor and normal samples in the input data are set correctly
3. It takes around 5 min to finish breast cancer data
4. (PLEASE change the paths yourself)

### Input Columns

1. `Unnamed: 0` - id of the sample
   > list of serial number for each sample

### Output File

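1. `all_beta_normalized_train.csv` - training data (written to `outputTrainDataPath`)
2. `all_beta_normalized_test.csv` - testing data (written to `outputTestDataPath`)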

### Parameters

1. `seed` - random seed for the split; change it if you want multiple different results
2. `test_ratio` - the fraction of the champ data held out as the test set (the rest is used for training)
3. `isTumorFirst` - Ensure that tumor samples or normal samples are in the first n rows of the data set
4. `tumorNumber` - the total number of tumor data
5. `champDataPath` - path of input data file (PLEASE change the path yourself)
6. `outputTrainDataPath` - path of output train data file
7. `outputTestDataPath` - path of output test data file


In [1]:
import os
from collections import Counter

import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# ---- Configuration: everything that needs tuning for a new dataset ----
test_ratio = 0.2  # fraction of samples held out as the test set
seed = 42  # random_state passed to the train/test split
isTumorFirst = False  # True: tumor samples come first in the input; False: normal samples first
tumorNumber = 77
normalNumber = 72
cancer_type = "rectal"
data_source = "GSE199057"

# Use round() rather than int(): binary floats make e.g. 0.29 * 100 evaluate to
# 28.999999999999996, which int() would truncate to 28 while the train side
# became 71 -- two directory names that no longer add up to 100.
test_percent = round(test_ratio * 100)
train_percent = 100 - test_percent

champDataPath = f"../../{cancer_type}/champ_result/{data_source}/all_beta_normalized.csv"
outputTrainDataPath = f"../../{cancer_type}/result/{data_source}/train{train_percent}/all_beta_normalized_train.csv"
outputTestDataPath = f"../../{cancer_type}/result/{data_source}/test{test_percent}/all_beta_normalized_test.csv"

# Derive the output directories from the file paths themselves so the two
# can never drift apart when a path template is edited.
os.makedirs(os.path.dirname(outputTestDataPath), exist_ok=True)
os.makedirs(os.path.dirname(outputTrainDataPath), exist_ok=True)

In [3]:
# Load the beta-value matrix; the first column is used as the feature names.
df = pd.read_csv(champDataPath)
# Keep every second column, starting with the first (feature-name) column.
# NOTE(review): presumably the skipped columns are not sample beta values --
# confirm against the layout of the input CSV.
df = df.iloc[:, ::2]
feature_name = df.iloc[:, 0].tolist()
# Transpose so that each row is one sample and each column one feature.
X = df.iloc[:, 1:].T

n_samples = X.shape[0]

# Labels are assigned purely by position, so counts that do not match the data
# would silently mislabel samples. Fail loudly instead.
if tumorNumber + normalNumber != n_samples:
    raise ValueError(
        f"tumorNumber ({tumorNumber}) + normalNumber ({normalNumber}) does not "
        f"match the {n_samples} sample columns kept from {champDataPath}"
    )

# Label encoding: 1 = tumor, 0 = normal.
if isTumorFirst:
    y = [(1 if i < tumorNumber else 0) for i in range(n_samples)]
else:
    y = [(0 if i < normalNumber else 1) for i in range(n_samples)]

In [4]:
# Quick sanity check: number of labels built above (one per sample).
len(y)

149

In [5]:
# Hold out `test_ratio` of the samples; `random_state=seed` makes the split reproducible.
# NOTE(review): the split is not stratified, so the class balance of the two sets
# can differ -- check the per-class counts printed below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_ratio, random_state=seed
)

# Report the size of each set.
print(f"訓練集樣本數量： {len(X_train)}")
print(f"測試集樣本數量： {len(X_test)}")

# Report how many samples of each class ended up in each set.
train_class_distribution = Counter(y_train)
val_class_distribution = Counter(y_test)
print("訓練集中各類別樣本數量：")
print(train_class_distribution)
print("測試集中各類別樣本數量：")
print(val_class_distribution)

訓練集樣本數量： 119
測試集樣本數量： 30
訓練集中各類別樣本數量：
Counter({1: 62, 0: 57})
測試集中各類別樣本數量：
Counter({1: 15, 0: 15})


In [6]:
def _label_and_transpose(
    samples: pd.DataFrame, labels: list, features: list
) -> pd.DataFrame:
    """Return a features-by-samples frame whose columns carry the class labels.

    Parameters
    ----------
    samples : one row per sample, one column per feature.
    labels : class label of each row of ``samples``, in row order.
    features : feature name of each column of ``samples``, in column order.

    Returns
    -------
    A new frame with features as rows and samples as columns; the columns are
    named by class label and sorted by it.

    ``samples`` is left untouched. The previous version assigned the labels onto
    ``X_train`` / ``X_test`` in place and then rebound them to the sorted frame,
    so re-running the cell attached the unsorted labels to already-sorted rows.
    """
    return (
        samples.set_axis(features, axis=1)
        .set_axis(labels, axis=0)
        .sort_index()
        .T
    )


train_df = _label_and_transpose(X_train, y_train, feature_name)
test_df = _label_and_transpose(X_test, y_test, feature_name)

train_df

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9
cg07881041,0.947221,0.955134,0.937806,0.927657,0.943390,0.941948,0.932209,0.943063,0.928462,0.901856,...,0.951451,0.067415,0.541346,0.861589,0.960066,0.946641,0.638304,0.463376,0.949181,0.757944
cg03513874,0.909695,0.920491,0.925137,0.901962,0.895070,0.902068,0.921672,0.945417,0.928434,0.897243,...,0.746939,0.060735,0.722361,0.961512,0.915357,0.957298,0.552934,0.749811,0.919204,0.948191
cg05451842,0.005403,0.005489,0.013647,0.016891,0.015645,0.012364,0.017188,0.016248,0.027730,0.029583,...,0.014126,0.006233,0.012377,0.006100,0.028174,0.013747,0.021015,0.007338,0.017772,0.010355
cg14797042,0.960886,0.962603,0.975298,0.966694,0.955637,0.979772,0.966109,0.977920,0.976508,0.971961,...,0.724348,0.957414,0.766883,0.978316,0.955733,0.988167,0.961027,0.994213,0.656115,0.885364
cg09838562,0.003848,0.002696,0.007891,0.002015,0.006671,0.006101,0.008617,0.011893,0.009229,0.010821,...,0.015461,0.003194,0.011377,0.008246,0.019245,0.006755,0.014846,0.008213,0.010256,0.011320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cg05166473,0.670941,0.618970,0.725240,0.720403,0.719170,0.628306,0.634759,0.622905,0.632116,0.575766,...,0.885471,0.929439,0.891500,0.949099,0.897633,0.842593,0.797810,0.850859,0.902517,0.847621
cg19812938,0.890241,0.878222,0.908148,0.918039,0.890004,0.887585,0.875836,0.891920,0.880273,0.887084,...,0.882515,0.837224,0.900504,0.909333,0.931023,0.895504,0.896576,0.902736,0.898196,0.901002
cg06272054,0.000027,0.004196,0.012988,0.008658,0.004820,0.003910,0.006899,0.001434,0.010015,0.006153,...,0.011883,0.013886,0.009737,0.004500,0.018384,0.005390,0.005127,0.005440,0.009391,0.005122
cg07255356,0.008043,0.002443,0.018837,0.008698,0.021748,0.010034,0.019939,0.013485,0.010751,0.017805,...,0.016718,0.023454,0.009452,0.003466,0.019117,0.006790,0.021291,0.017636,0.009163,0.012227


In [7]:
# Write both splits to disk as CSV, training set first.
for split_frame, destination in (
    (train_df, outputTrainDataPath),
    (test_df, outputTestDataPath),
):
    split_frame.to_csv(destination)