<!-- @format -->

### Implementation

1. split champ data into train and test

### Notes

1. Be aware of random seed raio setting
2. Ensure the number and position of tumor samples or normal samples in inputData
3. It takes around 5 min to finish breast cancer data
4. (PLEASE change the path by youself)

### Input Columns

1. `Unnamed: 0` - id of the sample
   > list of serial number for each sample

### Output File

1. training_data.csv
2. testing_data.csv

### Parameters

1. `seed` - make sure to change it if you want to have muiltple diffrent result
2. `train_ratio` - the ratio to split train and test from champ data
3. `isTumorFirst` - Ensure that tumor samples or normal samples are in the first n rows of the data set
4. `tumorNumber` - the total number of tumor data
5. `champDataPath` - path of input data file (PLEASE change the path by youself)
6. `outputTrainDataPath` - path of output train data file
7. `outputTestDataPath` - path of output test data file


In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
test_ratio = 0.2
seed = 42
normalNumber = 37
cancer_type = "lung"
data_source = "GDC_lung_tissue"
champDataPath0 = f"../../{cancer_type}/champ_result/{data_source}/all_beta_normalized_0.csv"
champDataPath1 = f"../../{cancer_type}/champ_result/{data_source}/all_beta_normalized_1.csv"
outputTrainDataPath = f"../../{cancer_type}/result/{data_source}/train{int(100-test_ratio*100)}/all_beta_normalized_0.csv"
outputTestDataPath = f"../../{cancer_type}/result/{data_source}/test{int(test_ratio*100)}/all_beta_normalized_1.csv"

os.makedirs(
    f"../../{cancer_type}/result/{data_source}/test{int(test_ratio*100)}", exist_ok=True
)
os.makedirs(
    f"../../{cancer_type}/result/{data_source}/train{int(100-test_ratio*100)}",
    exist_ok=True,
)

In [3]:
df0 = pd.read_csv(champDataPath0)

In [4]:
df1 = pd.read_csv(champDataPath1)

### merge two datasets as one with first 74 columns being normal samples and the rest being tumor samples

In [5]:
feature_name0 = df0.iloc[:, 0].tolist()
feature_name1 = df1.iloc[:, 0].tolist()

# intersection of two feature names
feature_name = list(set(feature_name0).intersection(feature_name1))

print(f"feature_name0: {len(feature_name0)}")
print(f"feature_name1: {len(feature_name1)}")
print(f"feature_name: {len(feature_name)}")


feature_name0: 120677
feature_name1: 360242
feature_name: 115119


In [6]:
df0t = df0[df0.iloc[:, 0].isin(feature_name)]
df1t = df1[df1.iloc[:, 0].isin(feature_name)]

In [7]:
df0t = df0t.iloc[:, 1::2]
df1t = df1t.iloc[:, 1::2]
df0t.reset_index(drop=True, inplace=True)
df1t.reset_index(drop=True, inplace=True)

In [8]:
df0n = df0t.iloc[:, :normalNumber]
df0c = df0t.iloc[:, normalNumber:]
df1n = df1t.iloc[:, :normalNumber]
df1c = df1t.iloc[:, normalNumber:]

In [9]:
dfn = pd.concat([df0n, df1n], axis = 1)
dfc = pd.concat([df0c, df1c], axis = 1)

In [10]:
print(dfn.shape)
dfn.dropna(inplace = True, axis = 0)
print(dfn.shape)

(115119, 74)
(115119, 74)


In [11]:
print(dfc.shape)
dfc.dropna(inplace = True, axis = 1)
print(dfc.shape)

(115119, 844)
(115119, 677)


normal smaples: 74 columns<br>
tumor samples: 677 columns

In [12]:
df = pd.concat([dfn, dfc], axis = 1)

In [13]:
df.columns = range(df.shape[1])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,741,742,743,744,745,746,747,748,749,750
0,0.870131,0.855272,0.837964,0.881787,0.840490,0.880335,0.854351,0.866782,0.848926,0.868890,...,0.896841,0.856653,0.841191,0.849257,0.852827,0.882146,0.799261,0.860865,0.869139,0.865573
1,0.805265,0.786301,0.812133,0.702542,0.794556,0.874750,0.753124,0.925544,0.897843,0.750444,...,0.787945,0.750121,0.810318,0.841074,0.784213,0.798595,0.835029,0.716940,0.752071,0.859589
2,0.130379,0.105195,0.069474,0.065219,0.083583,0.098281,0.087544,0.070445,0.066433,0.080765,...,0.492008,0.452658,0.242886,0.612418,0.075720,0.562529,0.547632,0.196573,0.238353,0.687156
3,0.027859,0.028367,0.035275,0.009818,0.040193,0.036342,0.012196,0.034231,0.010206,0.028019,...,0.020259,0.047373,0.050682,0.063232,0.046314,0.044381,0.031653,0.044863,0.038549,0.023179
4,0.024905,0.028116,0.026357,0.031588,0.049386,0.057241,0.034706,0.049881,0.032275,0.046329,...,0.312339,0.221882,0.490271,0.554614,0.370159,0.709972,0.657312,0.063309,0.076774,0.673089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115114,0.989299,0.984082,0.975333,0.992034,0.959184,0.984114,0.985027,0.983505,0.987806,0.967371,...,0.965985,0.989422,0.977129,0.985238,0.977063,0.988363,0.962708,0.966283,0.981251,0.979623
115115,0.455539,0.461929,0.397068,0.394965,0.405328,0.432191,0.449824,0.421014,0.397822,0.481154,...,0.564481,0.354852,0.413000,0.562204,0.370212,0.152000,0.261364,0.431523,0.435254,0.193859
115116,0.057365,0.049908,0.061390,0.045488,0.048277,0.058233,0.043221,0.075484,0.057218,0.058064,...,0.056329,0.056685,0.072321,0.070273,0.084977,0.062259,0.055549,0.061345,0.080999,0.060776
115117,0.958313,0.954312,0.953241,0.962748,0.928654,0.948306,0.955545,0.969262,0.958781,0.951267,...,0.973862,0.970034,0.946837,0.948447,0.950156,0.931824,0.725174,0.960862,0.943178,0.980355


In [14]:
X = df.T
y = [(0 if i < normalNumber else 1) for i in range((df.shape[1]))]

In [15]:
from collections import Counter

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_ratio, random_state=seed
)

print(f"訓練集樣本數量： {len(X_train)}")
print(f"測試集樣本數量： {len(X_test)}")
train_class_distribution = Counter(y_train)
val_class_distribution = Counter(y_test)
print("訓練集中各類別樣本數量：")
print(train_class_distribution)
print("測試集中各類別樣本數量：")
print(val_class_distribution)

訓練集樣本數量： 600
測試集樣本數量： 151
訓練集中各類別樣本數量：
Counter({1: 570, 0: 30})
測試集中各類別樣本數量：
Counter({1: 144, 0: 7})


In [16]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,115109,115110,115111,115112,115113,115114,115115,115116,115117,115118
131,0.841756,0.703298,0.431084,0.027505,0.397063,0.465172,0.249965,0.949241,0.012780,0.262894,...,0.027010,0.920087,0.850160,0.930655,0.791388,0.989132,0.384792,0.038485,0.973749,0.679401
44,0.878719,0.857571,0.079447,0.032354,0.046182,0.241271,0.315196,0.968882,0.026012,0.132082,...,0.022753,0.970901,0.661749,0.928384,0.682931,0.984485,0.495349,0.067969,0.942289,0.958157
70,0.802070,0.797716,0.096851,0.027988,0.017034,0.310766,0.233277,0.955857,0.015123,0.117587,...,0.040264,0.977996,0.726119,0.917069,0.759037,0.984149,0.420042,0.049243,0.947436,0.960082
673,0.862710,0.833682,0.316148,0.049158,0.329400,0.414738,0.204136,0.967899,0.014163,0.181442,...,0.024757,0.966543,0.646962,0.801688,0.803312,0.990239,0.416372,0.066303,0.980574,0.963004
208,0.892635,0.871973,0.510363,0.017838,0.430559,0.181806,0.155549,0.973625,0.022387,0.045736,...,0.029056,0.739023,0.643690,0.951082,0.654773,0.977833,0.497420,0.067781,0.979291,0.956772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,0.903994,0.701379,0.092457,0.050638,0.013913,0.236373,0.254050,0.954160,0.014197,0.082760,...,0.085104,0.970894,0.679033,0.891433,0.687496,0.989801,0.278334,0.027561,0.908630,0.883741
106,0.848019,0.714955,0.345282,0.044060,0.217289,0.286300,0.213746,0.933303,0.019338,0.163362,...,0.040914,0.936746,0.703963,0.932591,0.794732,0.973849,0.443481,0.042136,0.945318,0.949573
270,0.842057,0.844408,0.167503,0.030814,0.045414,0.271550,0.299056,0.955565,0.011885,0.289069,...,0.015548,0.949698,0.737791,0.930943,0.696165,0.968152,0.431557,0.055152,0.935072,0.805257
435,0.875423,0.607010,0.052774,0.022742,0.015006,0.178868,0.350928,0.967761,0.011369,0.070731,...,0.038441,0.555965,0.206392,0.409093,0.777240,0.976052,0.203283,0.056329,0.972555,0.937175


In [17]:
X_train.columns = feature_name
X_train["label"] = y_train
X_train = X_train.sort_values(by=["label"])
train_df = X_train.T
train_df.columns = range(train_df.shape[1])


X_test.columns = feature_name
X_test["label"] = y_test
X_test = X_test.sort_values(by=["label"])
test_df = X_test.T
test_df.columns = range(test_df.shape[1])

In [20]:
# export the training and testing sets to CSV
train_df.to_csv(outputTrainDataPath)
test_df.to_csv(outputTestDataPath)