# Data preparation

Notebook with basic data preparation and train/test split for future experiments

In [93]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

## Data loading and preparation

Load raw data (has to be placed in "raw_data_path" directory)

In [94]:
raw_data_path = '~/Documents/STUDIA/Projekt_badawczy/moje/TEPS_Data_preparation_data_sample_info'

patients = pd.read_csv(os.path.join(raw_data_path, 'SampleInfo_short_multiclass_2022-10-14.tsv'), sep='\t')
patients = patients.reset_index(drop=True)

patients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1916 entries, 0 to 1915
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Sample.ID        1916 non-null   object 
 1   Group            1916 non-null   object 
 2   Stage            1648 non-null   object 
 3   Sex              1898 non-null   object 
 4   Age              1898 non-null   object 
 5   Lib.size         1916 non-null   int64  
 6   Description      107 non-null    object 
 7   Comments         0 non-null      float64
 8   IsNew            1916 non-null   object 
 9   PotentialIssues  36 non-null     object 
 10  TR               1916 non-null   object 
 11  RealLocation     1916 non-null   object 
 12  MultiGroup       1916 non-null   object 
 13  MultiGroup2      1916 non-null   object 
 14  MultiGroup3      1916 non-null   object 
 15  TrainTest        1916 non-null   object 
dtypes: float64(1), int64(1), object(14)
memory usage: 239.6+ KB


Check for different Stage and Group values

In [95]:
patients['Stage'].value_counts(dropna=False)

n.a.    794
IV      497
NaN     268
III     152
II      125
I        80
Name: Stage, dtype: int64

In [96]:
patients['Group'].value_counts(dropna=False)

NSCLC                             567
Asymptomatic controls             405
Pulmonary Hypertension            175
Ovarian cancer                    133
Glioma                            128
Pancreatic cancer                 123
Cholangiocarcinoma                 83
Multiple sclerosis                 83
Colorectal cancer                  80
Medically-intractable epilepsy     43
Endometrial cancer                 38
Angina pectoris                    26
Hepatocellular carcinoma           22
Esophageal carcinoma               10
Name: Group, dtype: int64

Filter out the "Asymptomatic control" values of Group and cases with undefined Stage

In [97]:
valid_stages = ('I', 'II', 'III', 'IV')
patients = patients.loc[patients['Stage'].isin(valid_stages)]

patients['Stage'].value_counts(dropna=False)

IV     497
III    152
II     125
I       80
Name: Stage, dtype: int64

In [98]:
patients['Group'].value_counts(dropna=False)

NSCLC                       404
Ovarian cancer              126
Pancreatic cancer           122
Cholangiocarcinoma           80
Colorectal cancer            63
Endometrial cancer           36
Hepatocellular carcinoma     14
Esophageal carcinoma          9
Name: Group, dtype: int64

Change data type of Age column to numeric

Change missing value markers to None

Rename column with patient IDs

Select only subset of available columns

In [99]:
patients['Age'] = pd.to_numeric(patients['Age'], errors='coerce')
patients.loc[patients['Sex'] == 'n.a.', 'Sex'] = None
patients = patients.rename(columns={'Sample.ID': 'ID'})

cols = [
    'ID',
    'Group',
    'Sex',
    'Age',
    'Stage'
]
patients = patients.loc[:, cols]
patients.info()

patients

<class 'pandas.core.frame.DataFrame'>
Int64Index: 854 entries, 328 to 1755
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      854 non-null    object 
 1   Group   854 non-null    object 
 2   Sex     854 non-null    object 
 3   Age     853 non-null    float64
 4   Stage   854 non-null    object 
dtypes: float64(1), object(4)
memory usage: 40.0+ KB


Unnamed: 0,ID,Group,Sex,Age,Stage
328,MGH-NSCLC-L-74,NSCLC,M,36.0,IV
329,Vumc-NSCLC-092,NSCLC,F,81.0,IV
331,MGH-NSCLC-L01-TR458,NSCLC,F,37.0,IV
332,MGH-NSCLC-L11-TR477,NSCLC,M,54.0,IV
334,MGH-NSCLC-L65-TR523,NSCLC,M,78.0,IV
...,...,...,...,...,...
1508,VUMC-141-2017432-PANC-TR3421,Pancreatic cancer,M,74.0,III
1636,MGH-NSCLC-L121-TR907,NSCLC,F,71.0,IV
1637,Vumc-NSCLC-216-TR874,NSCLC,M,68.0,II
1753,TR4277,Ovarian cancer,F,67.0,III


Load and transpose a dataframe with marker values

In [100]:
markers = pd.read_csv(os.path.join(raw_data_path, 'Counts_prefiltered_multiclass_2022-10-14.tsv'), sep='\t')
markers = markers.T.reset_index(names='ID')
markers

Unnamed: 0,ID,ENSG00000000419,ENSG00000000938,ENSG00000001036,ENSG00000001461,ENSG00000001629,ENSG00000001631,ENSG00000002330,ENSG00000002549,ENSG00000002586,...,ENSG00000257267,ENSG00000257923,ENSG00000258890,ENSG00000263563,ENSG00000264538,ENSG00000266356,ENSG00000266714,ENSG00000269028,ENSG00000271043,ENSG00000272168
0,Vumc-HD-101-TR922,3.064289,3.834176,4.171537,4.737304,4.272177,3.940969,4.418057,4.668012,10.759357,...,6.788573,8.077838,3.546347,4.328872,4.328872,4.803181,3.128490,7.739557,6.582879,4.418057
1,Vumc-HD-103-TR923,5.194380,6.964049,4.644469,3.838500,3.951551,5.386353,4.537357,5.478881,10.215786,...,6.073116,6.388674,5.337690,4.444854,3.877458,5.152584,5.686621,7.055870,5.815763,3.951551
2,Vumc-HD-108-TR924,5.387337,7.608523,4.097419,3.871438,5.966998,4.877867,4.097419,5.992483,9.772417,...,5.789179,7.257840,4.932819,4.490325,3.807177,4.932819,6.549959,7.091888,6.042124,3.871438
3,Vumc-HD-127-TR925,6.584300,5.626849,5.076153,3.865364,4.355678,5.188931,4.745318,5.215744,9.867106,...,6.150602,5.586682,5.390227,4.627846,4.707302,5.342562,6.746681,7.691876,6.080439,4.920776
4,Vumc-HD-130-TR926,5.684044,5.990387,4.338011,4.072761,4.029651,4.994614,4.693579,5.862317,9.949440,...,6.760555,5.605931,5.292285,4.527899,3.839155,5.188022,3.786024,8.199582,6.456418,5.275442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1911,TR3298-HD-NKI,5.015461,9.866231,6.145266,5.179213,5.179213,5.890240,4.829431,6.662208,9.877926,...,5.015461,8.129104,6.221056,3.575926,5.179213,3.575926,7.724969,3.575926,2.377126,3.575926
1912,TR3481-HD-NKI,6.331082,9.694728,5.875447,5.577790,5.199813,5.401527,5.086781,7.410313,9.361917,...,4.312592,7.583496,6.889398,4.507972,4.507972,3.397669,6.855973,3.397669,3.397669,3.397669
1913,TR3300-HD-NKI,5.133399,10.264108,6.606876,5.278483,4.573871,5.743199,4.787213,8.089368,9.484785,...,5.640600,8.164524,6.167757,4.573871,5.278483,2.377126,7.178693,2.377126,3.551991,2.377126
1914,TR3296-HD-NKI,4.776702,9.413021,5.143608,5.517676,5.517676,6.056900,4.041595,6.753366,9.797785,...,5.940070,7.886525,5.940070,4.461142,5.246952,4.041595,7.406943,3.372442,2.377126,3.372442


In [101]:
col1_values = patients['ID'].tolist()
# usunięcie wierszy, w których wartość w kolumnie ID nie jest zawarta w kolumnie ID obiektu patients
markers = markers[markers['ID'].isin(col1_values)]
markers.info()
markers.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 854 entries, 328 to 1755
Columns: 4394 entries, ID to ENSG00000272168
dtypes: float64(4393), object(1)
memory usage: 28.6+ MB


Unnamed: 0,ID,ENSG00000000419,ENSG00000000938,ENSG00000001036,ENSG00000001461,ENSG00000001629,ENSG00000001631,ENSG00000002330,ENSG00000002549,ENSG00000002586,...,ENSG00000257267,ENSG00000257923,ENSG00000258890,ENSG00000263563,ENSG00000264538,ENSG00000266356,ENSG00000266714,ENSG00000269028,ENSG00000271043,ENSG00000272168
328,MGH-NSCLC-L-74,2.969287,4.409404,4.430591,5.180834,3.917734,4.250287,4.225813,4.274288,10.590616,...,6.208095,8.587142,2.377126,3.980061,4.122732,4.531599,3.441556,6.351508,5.381818,4.122732
329,Vumc-NSCLC-092,3.571121,5.485827,4.319007,3.512896,4.989149,4.163728,4.508574,4.971457,10.709285,...,6.217765,7.786546,4.130173,4.347874,4.130173,4.228193,4.196383,7.217239,5.925721,4.800937
331,MGH-NSCLC-L01-TR458,2.377126,3.401266,4.086679,2.724877,4.057926,3.219006,4.057926,4.220985,10.755117,...,6.393598,7.620465,2.377126,4.409539,3.067701,5.577047,3.76077,7.859787,6.340376,5.334665
332,MGH-NSCLC-L11-TR477,4.349115,4.926478,4.169191,4.330355,3.521482,4.125374,4.403788,6.093601,10.5578,...,6.761947,7.449481,4.056508,4.21149,3.481807,5.759331,3.351229,7.911409,6.342396,5.256122
334,MGH-NSCLC-L65-TR523,3.267676,3.424495,3.309719,5.34257,3.309719,3.875896,4.246368,4.37395,10.94365,...,6.444714,8.177893,3.618036,4.327609,3.267676,4.670803,2.662768,7.236297,5.856442,5.400439


Sprawdzanie które geny mają najwięcej powtarzających się wartości

In [102]:
# list_most_frequent_counts = []

# # iteracja po kolumnach
# for column in markers.columns:
#     # zliczenie wystąpień każdej unikalnej wartości w kolumnie i wybór najczęściej występującej
#     value_counts = markers[column].value_counts()
#     most_frequent_value = value_counts.idxmax()
#     most_frequent_count = value_counts.max()
#     # wyświetlenie informacji o liczbie wystąpień najczęściej powtarzającej się wartości w kolumnie
#     print(f"Najczęściej występująca wartość w kolumnie {column}: {most_frequent_value}")
#     # dodanie słownika z nazwą kolumny i liczbą wystąpień najczęściej powtarzającej się wartości do listy
#     list_most_frequent_counts.append({'Column': column, 'Most Frequent Value': most_frequent_value, 'Count': most_frequent_count})

# # utworzenie obiektu DataFrame z listy słowników
# most_frequent_counts = pd.DataFrame(list_most_frequent_counts)

# # posortowanie DataFrame malejąco względem liczby wystąpień najczęściej powtarzającej się wartości
# most_frequent_counts = most_frequent_counts.sort_values(by='Count', ascending=True)

# most_freq = most_frequent_counts.head(200)

# most_freq

Ograniczenie liczby genów do 200, w których wartości najczęściej się powtarzały

In [103]:
# columns_to_keep = most_freq[most_freq['Column'] != 'ID']['Column'].tolist()
# markers = markers[['ID'] + columns_to_keep]
# markers.head(5)

Merge patients with their markers values

In [104]:
# połączenie obiektów patients i markers po kolumnie ID
merged = patients.merge(markers, on='ID', how='left')
merged.info()
merged.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 854 entries, 0 to 853
Columns: 4398 entries, ID to ENSG00000272168
dtypes: float64(4394), object(4)
memory usage: 28.7+ MB


Unnamed: 0,ID,Group,Sex,Age,Stage,ENSG00000000419,ENSG00000000938,ENSG00000001036,ENSG00000001461,ENSG00000001629,...,ENSG00000257267,ENSG00000257923,ENSG00000258890,ENSG00000263563,ENSG00000264538,ENSG00000266356,ENSG00000266714,ENSG00000269028,ENSG00000271043,ENSG00000272168
0,MGH-NSCLC-L-74,NSCLC,M,36.0,IV,2.969287,4.409404,4.430591,5.180834,3.917734,...,6.208095,8.587142,2.377126,3.980061,4.122732,4.531599,3.441556,6.351508,5.381818,4.122732
1,Vumc-NSCLC-092,NSCLC,F,81.0,IV,3.571121,5.485827,4.319007,3.512896,4.989149,...,6.217765,7.786546,4.130173,4.347874,4.130173,4.228193,4.196383,7.217239,5.925721,4.800937
2,MGH-NSCLC-L01-TR458,NSCLC,F,37.0,IV,2.377126,3.401266,4.086679,2.724877,4.057926,...,6.393598,7.620465,2.377126,4.409539,3.067701,5.577047,3.76077,7.859787,6.340376,5.334665
3,MGH-NSCLC-L11-TR477,NSCLC,M,54.0,IV,4.349115,4.926478,4.169191,4.330355,3.521482,...,6.761947,7.449481,4.056508,4.21149,3.481807,5.759331,3.351229,7.911409,6.342396,5.256122
4,MGH-NSCLC-L65-TR523,NSCLC,M,78.0,IV,3.267676,3.424495,3.309719,5.34257,3.309719,...,6.444714,8.177893,3.618036,4.327609,3.267676,4.670803,2.662768,7.236297,5.856442,5.400439


In [105]:
merged2 = merged
merged2['Stage1'] = merged['Stage'].apply(lambda x: 1 if x == 'I' else 0)
merged2['Stage2'] = merged['Stage'].apply(lambda x: 1 if x == 'II' else 0)
merged2['Stage3'] = merged['Stage'].apply(lambda x: 1 if x == 'III' else 0)
merged2['Stage4'] = merged['Stage'].apply(lambda x: 1 if x == 'IV' else 0)
merged2

Unnamed: 0,ID,Group,Sex,Age,Stage,ENSG00000000419,ENSG00000000938,ENSG00000001036,ENSG00000001461,ENSG00000001629,...,ENSG00000264538,ENSG00000266356,ENSG00000266714,ENSG00000269028,ENSG00000271043,ENSG00000272168,Stage1,Stage2,Stage3,Stage4
0,MGH-NSCLC-L-74,NSCLC,M,36.0,IV,2.969287,4.409404,4.430591,5.180834,3.917734,...,4.122732,4.531599,3.441556,6.351508,5.381818,4.122732,0,0,0,1
1,Vumc-NSCLC-092,NSCLC,F,81.0,IV,3.571121,5.485827,4.319007,3.512896,4.989149,...,4.130173,4.228193,4.196383,7.217239,5.925721,4.800937,0,0,0,1
2,MGH-NSCLC-L01-TR458,NSCLC,F,37.0,IV,2.377126,3.401266,4.086679,2.724877,4.057926,...,3.067701,5.577047,3.760770,7.859787,6.340376,5.334665,0,0,0,1
3,MGH-NSCLC-L11-TR477,NSCLC,M,54.0,IV,4.349115,4.926478,4.169191,4.330355,3.521482,...,3.481807,5.759331,3.351229,7.911409,6.342396,5.256122,0,0,0,1
4,MGH-NSCLC-L65-TR523,NSCLC,M,78.0,IV,3.267676,3.424495,3.309719,5.342570,3.309719,...,3.267676,4.670803,2.662768,7.236297,5.856442,5.400439,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
849,VUMC-141-2017432-PANC-TR3421,Pancreatic cancer,M,74.0,III,3.864656,4.667183,4.542353,4.781267,3.260633,...,3.260633,4.667183,3.608492,3.864656,3.608492,5.316957,0,0,1,0
850,MGH-NSCLC-L121-TR907,NSCLC,F,71.0,IV,3.339275,4.590402,4.366927,4.644051,3.822677,...,3.625855,4.282838,3.897177,5.990556,4.793104,4.173833,0,0,0,1
851,Vumc-NSCLC-216-TR874,NSCLC,M,68.0,II,4.129881,5.154147,4.408833,5.582722,4.129881,...,6.105969,6.031075,3.524623,7.868335,6.789048,5.002750,0,1,0,0
852,TR4277,Ovarian cancer,F,67.0,III,2.377126,2.377126,2.377126,2.377126,2.377126,...,2.377126,5.709359,3.307039,6.500167,5.188212,2.377126,0,0,1,0


In [106]:
#macierz korelacji
merged2 = merged2.drop(["Group", "Stage", "Sex", "Age"], axis=1)
correlation_matrix = merged2.corr()

  correlation_matrix = merged2.corr()


In [107]:
# Korelacje dla kazdego stage-a
stage_correlation1 = correlation_matrix['Stage1']
stage_correlation1 = stage_correlation1.sort_values(ascending=False)
stage_correlation1 = stage_correlation1.abs()
stage_correlation1 = stage_correlation1.head(100)

In [108]:
# Korelacje dla kazdego stage-a
stage_correlation2 = correlation_matrix['Stage2']
stage_correlation2 = stage_correlation2.sort_values(ascending=False)
stage_correlation2 = stage_correlation2.abs()
stage_correlation2 = stage_correlation2.head(100)

In [109]:
# Korelacje dla kazdego stage-a
stage_correlation3 = correlation_matrix['Stage3']
stage_correlation3 = stage_correlation3.sort_values(ascending=False)
stage_correlation3 = stage_correlation3.abs()
stage_correlation3 = stage_correlation3.head(100)

In [110]:
# Korelacje dla kazdego stage-a
stage_correlation4 = correlation_matrix['Stage4']
stage_correlation4 = stage_correlation4.sort_values(ascending=False)
stage_correlation4 = stage_correlation4.abs()
stage_correlation4 = stage_correlation4.head(100)

In [111]:
df1 = stage_correlation1.to_frame(name='kor')
df2 = stage_correlation2.to_frame(name='kor')
df3 = stage_correlation3.to_frame(name='kor')
df4 = stage_correlation4.to_frame(name='kor')

# common_rows = df2.merge(df3, left_index=True, right_index=True).index
# sorted_df = common_rows.to_frame(name='kor')

sum_df = df1.add(df2, fill_value=0)
sum_df = sum_df.add(df3, fill_value=0)
sum_df = sum_df.add(df4, fill_value=0)

# Kontynuuj dodawanie pozostałych DataFrame'ów, jeśli istnieją
sorted_df = sum_df.sort_values(by="kor", ascending=False)
sorted_df = sorted_df.drop(['Stage1', 'Stage2', 'Stage3', 'Stage4'])

sorted_df

Unnamed: 0,kor
ENSG00000103512,0.432537
ENSG00000105507,0.389887
ENSG00000141068,0.367998
ENSG00000151789,0.366568
ENSG00000169756,0.363762
...,...
ENSG00000181061,0.086544
ENSG00000204136,0.086236
ENSG00000188177,0.086077
ENSG00000204424,0.086061


In [112]:
sorted_df = sorted_df.head(100)
sorted_df

Unnamed: 0,kor
ENSG00000103512,0.432537
ENSG00000105507,0.389887
ENSG00000141068,0.367998
ENSG00000151789,0.366568
ENSG00000169756,0.363762
...,...
ENSG00000164919,0.179182
ENSG00000132432,0.179066
ENSG00000114742,0.178787
ENSG00000196954,0.178217


In [113]:
columns_to_drop = merged.columns.difference(sorted_df.index)
merged = merged.drop(columns_to_drop, axis=1)
merged

Unnamed: 0,ENSG00000010404,ENSG00000042753,ENSG00000065518,ENSG00000067365,ENSG00000070190,ENSG00000072135,ENSG00000075945,ENSG00000090266,ENSG00000092841,ENSG00000100307,...,ENSG00000233614,ENSG00000237805,ENSG00000237973,ENSG00000241468,ENSG00000243449,ENSG00000247627,ENSG00000255633,ENSG00000255823,ENSG00000269028,ENSG00000271043
0,5.239637,6.775473,5.106805,4.225813,9.834976,7.025708,7.299722,4.175359,11.317468,3.138127,...,4.810831,4.200844,4.451433,4.274288,3.208891,4.225813,3.980061,4.607119,6.351508,5.381818
1,5.023860,6.601530,5.215529,5.245042,9.701298,6.996397,7.187690,4.971457,11.682834,3.727804,...,6.760702,3.144919,4.800937,4.533515,3.678191,5.006614,3.775195,5.200534,7.217239,5.925721
2,5.799711,6.702580,5.207772,5.490984,10.269704,7.389245,6.444913,4.387452,11.472231,3.640669,...,5.266889,4.387452,4.592966,4.142195,3.967207,5.158595,4.666904,5.577047,7.859787,6.340376
3,5.106419,6.077835,5.804875,5.514482,10.219183,6.722072,7.365816,5.284218,11.139296,3.665726,...,4.889931,4.647027,5.320818,4.800614,3.521482,5.356482,4.617092,5.765927,7.911409,6.342396
4,5.442322,6.263969,5.535445,6.048077,9.628356,6.710531,8.034563,3.674895,11.852632,3.557761,...,6.403371,4.718245,4.389029,3.780065,3.728786,4.570495,4.544175,5.194664,7.236297,5.856442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
849,5.959196,6.399837,5.075059,4.886412,10.562064,7.434970,6.433901,4.542353,10.917304,2.377126,...,7.878916,3.864656,2.377126,4.072612,3.608492,2.377126,2.377126,3.260633,3.864656,3.608492
850,5.590662,6.661471,5.487981,4.695609,9.781714,7.279602,8.077307,3.742758,11.565793,3.339275,...,6.983730,4.793104,3.920951,4.300111,3.493546,4.095371,3.625855,4.414827,5.990556,4.793104
851,4.920283,5.868321,5.224042,4.637587,10.511605,7.878992,7.768693,4.637587,10.533684,5.154147,...,7.971523,4.277136,5.290584,3.524623,2.377126,5.779372,4.920283,5.868321,7.868335,6.789048
852,2.377126,7.080031,3.938488,2.377126,8.522682,5.579105,5.509168,4.338161,10.734571,6.929253,...,7.579033,2.377126,2.377126,3.307039,4.154718,3.307039,4.154718,4.769424,6.500167,5.188212


In [114]:
patients = patients.reset_index(drop=True)
# patients

In [115]:
combined_df = patients.join(merged)
combined_df.info()
combined_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 854 entries, 0 to 853
Columns: 105 entries, ID to ENSG00000271043
dtypes: float64(101), object(4)
memory usage: 700.7+ KB


Unnamed: 0,ID,Group,Sex,Age,Stage,ENSG00000010404,ENSG00000042753,ENSG00000065518,ENSG00000067365,ENSG00000070190,...,ENSG00000233614,ENSG00000237805,ENSG00000237973,ENSG00000241468,ENSG00000243449,ENSG00000247627,ENSG00000255633,ENSG00000255823,ENSG00000269028,ENSG00000271043
0,MGH-NSCLC-L-74,NSCLC,M,36.0,IV,5.239637,6.775473,5.106805,4.225813,9.834976,...,4.810831,4.200844,4.451433,4.274288,3.208891,4.225813,3.980061,4.607119,6.351508,5.381818
1,Vumc-NSCLC-092,NSCLC,F,81.0,IV,5.023860,6.601530,5.215529,5.245042,9.701298,...,6.760702,3.144919,4.800937,4.533515,3.678191,5.006614,3.775195,5.200534,7.217239,5.925721
2,MGH-NSCLC-L01-TR458,NSCLC,F,37.0,IV,5.799711,6.702580,5.207772,5.490984,10.269704,...,5.266889,4.387452,4.592966,4.142195,3.967207,5.158595,4.666904,5.577047,7.859787,6.340376
3,MGH-NSCLC-L11-TR477,NSCLC,M,54.0,IV,5.106419,6.077835,5.804875,5.514482,10.219183,...,4.889931,4.647027,5.320818,4.800614,3.521482,5.356482,4.617092,5.765927,7.911409,6.342396
4,MGH-NSCLC-L65-TR523,NSCLC,M,78.0,IV,5.442322,6.263969,5.535445,6.048077,9.628356,...,6.403371,4.718245,4.389029,3.780065,3.728786,4.570495,4.544175,5.194664,7.236297,5.856442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
849,VUMC-141-2017432-PANC-TR3421,Pancreatic cancer,M,74.0,III,5.959196,6.399837,5.075059,4.886412,10.562064,...,7.878916,3.864656,2.377126,4.072612,3.608492,2.377126,2.377126,3.260633,3.864656,3.608492
850,MGH-NSCLC-L121-TR907,NSCLC,F,71.0,IV,5.590662,6.661471,5.487981,4.695609,9.781714,...,6.983730,4.793104,3.920951,4.300111,3.493546,4.095371,3.625855,4.414827,5.990556,4.793104
851,Vumc-NSCLC-216-TR874,NSCLC,M,68.0,II,4.920283,5.868321,5.224042,4.637587,10.511605,...,7.971523,4.277136,5.290584,3.524623,2.377126,5.779372,4.920283,5.868321,7.868335,6.789048
852,TR4277,Ovarian cancer,F,67.0,III,2.377126,7.080031,3.938488,2.377126,8.522682,...,7.579033,2.377126,2.377126,3.307039,4.154718,3.307039,4.154718,4.769424,6.500167,5.188212


Save merged dataframe to file

In [116]:
merged = combined_df

In [117]:
cleaned_data_path = '~/Documents/STUDIA/Projekt_badawczy/moje/TEPS_Data_preparation_data_sample_info/cleaned'

merged.to_csv(os.path.join(cleaned_data_path, 'dataset.csv'), index=False, sep=';')

## Train test split

The dataset will be split to train/test sets in a stratified fashion based on two columns: Group and Stage.

Some types of cancer have only one sample with a specific stage, so it's impossible to split them in a stratified way - they will be randomly placed in a train or test set.

In [118]:
merged.insert(1, 'Groups_and_Stages', merged.apply(lambda row: f"{row['Group'].replace(' ', '_')}_{row['Stage']}", axis=1))
groups_and_stages_counts = merged['Groups_and_Stages'].value_counts()
rare_combinations = groups_and_stages_counts[groups_and_stages_counts < 10].index
merged = merged[~merged['Groups_and_Stages'].isin(rare_combinations)]

train, test = train_test_split(merged, test_size=0.3, stratify=merged['Groups_and_Stages'])

merged = merged.drop('Groups_and_Stages', axis=1)
train = train.drop('Groups_and_Stages', axis=1)
test = test.drop('Groups_and_Stages', axis=1)

Now both train and test sets are representative of (almost) every cancer/stage pairs available in the dataset.

In [119]:
train.apply(lambda row: f"{row['Group'].replace(' ', '_')}_{row['Stage']}", axis=1).value_counts()

NSCLC_IV                       231
Pancreatic_cancer_II            45
Colorectal_cancer_IV            38
NSCLC_III                       32
Cholangiocarcinoma_IV           31
Ovarian_cancer_III              30
Ovarian_cancer_IV               24
Ovarian_cancer_I                22
Pancreatic_cancer_III           22
Pancreatic_cancer_IV            17
Endometrial_cancer_I            17
Cholangiocarcinoma_II           15
Ovarian_cancer_II               12
NSCLC_I                         10
NSCLC_II                         9
Hepatocellular_carcinoma_IV      7
dtype: int64

In [120]:
test.apply(lambda row: f"{row['Group'].replace(' ', '_')}_{row['Stage']}", axis=1).value_counts()

NSCLC_IV                       99
Pancreatic_cancer_II           19
Colorectal_cancer_IV           16
NSCLC_III                      14
Cholangiocarcinoma_IV          13
Ovarian_cancer_III             13
Ovarian_cancer_IV              11
Pancreatic_cancer_III           9
Ovarian_cancer_I                9
Pancreatic_cancer_IV            7
Cholangiocarcinoma_II           7
Endometrial_cancer_I            7
NSCLC_I                         5
Ovarian_cancer_II               5
NSCLC_II                        4
Hepatocellular_carcinoma_IV     3
dtype: int64

In [121]:
print(f'Train set size: {len(train)}')
print(f'Test set size: {len(test)}')

Train set size: 562
Test set size: 241


Save train/test sets to files

In [122]:
train.to_csv(os.path.join(cleaned_data_path, 'train.csv'), index=False, sep=';')
test.to_csv(os.path.join(cleaned_data_path, 'test.csv'), index=False, sep=';')