<!-- @format -->

### Implementation

1. split champ data into train and test

### Notes

1. Be aware of random seed raio setting
2. Ensure the number and position of tumor samples or normal samples in inputData
3. It takes around 5 min to finish breast cancer data
4. (PLEASE change the path by youself)

### Input Columns

1. `Unnamed: 0` - id of the sample
   > list of serial number for each sample

### Output File

1. training_data.csv
2. testing_data.csv

### Parameters

1. `seed` - make sure to change it if you want to have muiltple diffrent result
2. `train_ratio` - the ratio to split train and test from champ data
3. `isTumorFirst` - Ensure that tumor samples or normal samples are in the first n rows of the data set
4. `tumorNumber` - the total number of tumor data
5. `champDataPath` - path of input data file (PLEASE change the path by youself)
6. `outputTrainDataPath` - path of output train data file
7. `outputTestDataPath` - path of output test data file


In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [49]:
test_ratio = 0.2
seed = 42

isTumorFirst = False

tumorNumber = 46


champDataPath = "../champ_result/breast/GSE89093/all_beta_normalized.csv"

outputTrainDataPath = "../champ_result/breast/GSE89093/train80/all_beta_normalized_train.csv"

outputTestDataPath = "../champ_result/breast/GSE89093/test20/all_beta_normalized_test.csv"

In [45]:
df = pd.read_csv(champDataPath)
X = df.iloc[:, 1::]

X = [X.iloc[i, :].values.flatten().tolist() for i in range((df.shape[1] - 1))]

if isTumorFirst:

    y = [(1 if i < tumorNumber else 0) for i in range((df.shape[1] - 1))]
else:

    y = [(0 if i < tumorNumber else 1) for i in range((df.shape[1] - 1))]

In [46]:
from collections import Counter

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_ratio, random_state=seed
)

print(f"訓練集樣本數量： {len(X_train)}")
print(f"測試集樣本數量： {len(X_test)}")
train_class_distribution = Counter(y_train)
val_class_distribution = Counter(y_test)
print("訓練集中各類別樣本數量：")
print(train_class_distribution)
print("測試集中各類別樣本數量：")
print(val_class_distribution)

訓練集樣本數量： 73
測試集樣本數量： 19
訓練集中各類別樣本數量：
Counter({1: 39, 0: 34})
測試集中各類別樣本數量：
Counter({0: 12, 1: 7})


In [47]:
train_df = pd.concat(
    [pd.DataFrame(X_train), pd.DataFrame(y_train)], ignore_index=True, axis=1
).T
test_df = pd.concat(
    [pd.DataFrame(X_test), pd.DataFrame(y_test)], ignore_index=True, axis=1
).T

train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,63,64,65,66,67,68,69,70,71,72
0,0.966480,0.886019,0.341486,0.121291,0.023645,0.016780,0.161193,0.927775,0.003568,0.070611,...,0.984875,0.864973,0.010533,0.987604,0.709534,0.893151,0.965101,0.016654,0.020300,0.954867
1,0.840825,0.870335,0.479495,0.100361,0.034151,0.045528,0.160503,0.885394,0.055274,0.105381,...,0.951471,0.832626,0.013813,0.935903,0.570494,0.903168,0.965087,0.027192,0.081737,0.930983
2,0.902271,0.899128,0.374862,0.133068,0.045744,0.044286,0.153349,0.914751,0.059824,0.114785,...,0.948615,0.846534,0.014927,0.949595,0.609926,0.897227,0.969545,0.029134,0.097894,0.926837
3,0.899593,0.924413,0.395068,0.102765,0.034550,0.022523,0.124047,0.911877,0.028765,0.070404,...,0.949571,0.909452,0.014806,0.964968,0.533486,0.917045,0.969183,0.023916,0.080940,0.943548
4,0.873340,0.905965,0.390162,0.140182,0.026735,0.033754,0.146212,0.914892,0.031482,0.114935,...,0.963231,0.893994,0.008934,0.978914,0.640235,0.907118,0.964920,0.015861,0.108327,0.961504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,0.892033,0.907596,0.336657,0.136509,0.022937,0.040230,0.087413,0.901168,0.060014,0.105203,...,0.937965,0.859628,0.025051,0.923507,0.690872,0.900703,0.948999,0.022791,0.067639,0.931012
89,0.925233,0.869121,0.361048,0.125111,0.037475,0.022264,0.173131,0.882924,0.064350,0.073902,...,0.965013,0.862081,0.015167,0.966741,0.612777,0.885803,0.951490,0.025854,0.062234,0.932376
90,0.922270,0.812604,0.405159,0.168911,0.032738,0.016767,0.164239,0.882213,0.030037,0.101702,...,0.942853,0.794530,0.016730,0.986657,0.683085,0.885435,0.968562,0.021559,0.068904,0.928814
91,0.913482,0.899764,0.361510,0.219494,0.036627,0.033445,0.184807,0.892888,0.059478,0.096568,...,0.945216,0.921912,0.024452,0.977902,0.773676,0.895496,0.958943,0.027506,0.059903,0.942456


In [48]:
# export the training and testing sets to CSV
train_df.to_csv(outputTrainDataPath, index=False)

test_df.to_csv(outputTestDataPath, index=False)