<!-- @format -->

### Implementation

1. split champ data into train and test

### Notes

1. Be aware of the random seed and split ratio settings
2. Make sure `tumorNumber` and `isTumorFirst` match the number and position of the tumor and normal samples in the input data
3. It takes around 5 min to finish breast cancer data
4. Please change the paths to match your own environment

### Input Columns

1. `Unnamed: 0` - id of the sample
   > list of serial number for each sample

### Output File

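1. `all_beta_normalized_train.csv` - training data (written to `outputTrainDataPath`)
2. `all_beta_normalized_test.csv` - testing data (written to `outputTestDataPath`)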

### Parameters

1. `seed` - random seed for the split; change it if you want multiple different results
2. `test_ratio` - the fraction of the champ data held out as the test set (the remainder is used for training)
3. `isTumorFirst` - Ensure that tumor samples or normal samples are in the first n rows of the data set
4. `tumorNumber` - the total number of tumor data
5. `champDataPath` - path of the input data file (please change the path yourself)
6. `outputTrainDataPath` - path of output train data file
7. `outputTestDataPath` - path of output test data file


In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
# Tunable parameters (described in the "Parameters" section above).
test_ratio = 0.2  # fraction of samples held out as the test set
seed = 42  # random_state handed to train_test_split
isTumorFirst = False  # True when the tumor samples come first in the input
tumorNumber = 46  # total number of tumor samples
cancer_type = "breast"
data_source = "GSE89093_nc"


def ratio_to_percent(ratio):
    """Return ``ratio`` (e.g. 0.2) as a whole-number percentage (e.g. 20).

    ``round`` is used rather than ``int`` because binary floating point makes
    products such as ``0.29 * 100`` evaluate to ``28.999999999999996``;
    truncating that would name the output folder ``test28`` instead of
    ``test29`` (and ``0.3`` would give ``train69`` instead of ``train70``).
    """
    return int(round(ratio * 100))


test_percent = ratio_to_percent(test_ratio)
train_percent = 100 - test_percent

# Build each output directory once so the file paths and makedirs calls
# cannot drift apart.
result_root = f"../{cancer_type}/result/{data_source}"
train_dir = f"{result_root}/train{train_percent}"
test_dir = f"{result_root}/test{test_percent}"

champDataPath = f"../{cancer_type}/champ_result/{data_source}/all_beta_normalized.csv"
outputTrainDataPath = f"{train_dir}/all_beta_normalized_train.csv"
outputTestDataPath = f"{test_dir}/all_beta_normalized_test.csv"

# exist_ok=True keeps this cell safe to re-run.
os.makedirs(test_dir, exist_ok=True)
os.makedirs(train_dir, exist_ok=True)

In [3]:
# Read the input table. Its first column is used as the feature names and
# every remaining column is treated as one sample.
df = pd.read_csv(champDataPath)
feature_name = df.iloc[:, 0].tolist()
X = df.iloc[:, 1:].T  # transpose so that each row is one sample

sample_count = X.shape[0]
if not 0 <= tumorNumber <= sample_count:
    raise ValueError(
        f"tumorNumber={tumorNumber} is outside the valid range 0..{sample_count}"
    )
normal_count = sample_count - tumorNumber

# Labels: 1 = tumor, 0 = normal. `tumorNumber` is the total number of tumor
# samples, so when the normal samples come first the class boundary sits
# after `normal_count` samples, not after `tumorNumber`. The two positions
# only coincide when both classes have the same size.
if isTumorFirst:
    y = [1] * tumorNumber + [0] * normal_count
else:
    y = [0] * normal_count + [1] * tumorNumber

In [4]:
from collections import Counter

# Shuffle and split the samples; `seed` makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=seed,
    test_size=test_ratio,
)

# Per-class sample counts for each partition.
train_class_distribution = Counter(y_train)
val_class_distribution = Counter(y_test)

# Report the partition sizes, then the class balance of each partition.
print(f"訓練集樣本數量： {len(X_train)}")
print(f"測試集樣本數量： {len(X_test)}")
print("訓練集中各類別樣本數量：", train_class_distribution, sep="\n")
print("測試集中各類別樣本數量：", val_class_distribution, sep="\n")

訓練集樣本數量： 73
測試集樣本數量： 19
訓練集中各類別樣本數量：
Counter({1: 39, 0: 34})
測試集中各類別樣本數量：
Counter({0: 12, 1: 7})


In [5]:
def attach_labels(samples, labels, names):
    """Return a features-by-samples frame with a trailing ``label`` row.

    Parameters
    ----------
    samples : pandas.DataFrame
        One row per sample, one column per feature.
    labels : sequence
        Class label for each row of ``samples``, in the same order.
    names : sequence
        Feature names to use as the column labels of ``samples``.

    Returns
    -------
    pandas.DataFrame
        Transposed copy of ``samples``: rows are the features followed by a
        final ``label`` row, columns are the samples.

    Notes
    -----
    The work is done on a copy so ``samples`` is never modified. Assigning
    the columns and the ``label`` column directly on the split output made
    the cell fail on a second run (the frame then has one more column than
    there are names).
    """
    labelled = samples.copy()
    labelled.columns = names
    labelled["label"] = labels
    return labelled.T


train_df = attach_labels(X_train, y_train, feature_name)
test_df = attach_labels(X_test, y_test, feature_name)

train_df

Unnamed: 0,65,15,68,78,30,33,11,66,69,31,...,2,23,85,74,82,20,60,71,14,51
cg00000029,0.586510,0.525121,0.554625,0.554975,0.557304,0.497707,0.365151,0.558329,0.610559,0.320195,...,0.440157,0.549408,0.570047,0.546201,0.586559,0.488822,0.601729,0.513869,0.492984,0.651465
cg00000108,0.992230,0.991957,0.975934,0.989343,0.993361,0.994774,0.993796,0.993860,0.973230,0.989572,...,0.991012,0.990799,0.994711,0.992986,0.980657,0.989034,0.991628,0.984695,0.994670,0.981124
cg00000109,0.969259,0.936234,0.950658,0.927532,0.958945,0.946654,0.962139,0.950760,0.943144,0.951834,...,0.948615,0.967846,0.955616,0.940197,0.966235,0.946032,0.956851,0.956056,0.944027,0.917831
cg00000165,0.259568,0.205393,0.176361,0.315281,0.214120,0.164383,0.154257,0.204182,0.133943,0.157087,...,0.158073,0.199542,0.170724,0.244649,0.164425,0.161977,0.255724,0.160116,0.177354,0.230079
cg00000236,0.838320,0.770946,0.874965,0.857617,0.768862,0.820783,0.833375,0.686259,0.797777,0.773858,...,0.745585,0.791442,0.831788,0.837832,0.795153,0.720465,0.836784,0.798206,0.718977,0.845227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ch.9.98937537R,0.015070,0.006520,0.017886,0.019029,0.024125,0.019926,0.006886,0.016649,0.021746,0.010235,...,0.022522,0.007851,0.008783,0.012621,0.010352,0.012220,0.008065,0.004420,0.010604,0.011318
ch.9.98957343R,0.026990,0.001945,0.030100,0.022159,0.016604,0.007559,0.020598,0.037215,0.032673,0.010187,...,0.030792,0.006268,0.023771,0.020220,0.014115,0.016410,0.031972,0.014719,0.012147,0.031671
ch.9.98959675F,0.146594,0.055965,0.161179,0.209367,0.062689,0.041468,0.051328,0.136338,0.195965,0.075712,...,0.086426,0.035983,0.072900,0.238724,0.135202,0.106237,0.035258,0.191182,0.027082,0.167061
ch.9.991104F,0.077866,0.017367,0.051693,0.079770,0.070599,0.036526,0.036531,0.054047,0.059321,0.035140,...,0.040278,0.088289,0.036085,0.088602,0.023780,0.069880,0.068798,0.084636,0.026892,0.126820


In [6]:
# Write the training and testing frames to their configured CSV paths.
for frame, destination in (
    (train_df, outputTrainDataPath),
    (test_df, outputTestDataPath),
):
    frame.to_csv(destination)