# Data preparation

Notebook with basic data preparation and train/test split for future experiments

In [423]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from imblearn.over_sampling import RandomOverSampler

## Data loading and preparation

Load raw data (has to be placed in "raw_data_path" directory)

In [424]:
raw_data_path = '~/Documents/STUDIA/Projekt_badawczy/moje/TEPS_Data_preparation_data_sample_info'

patients = pd.read_csv(os.path.join(raw_data_path, 'SampleInfo_short_multiclass_2022-10-14.tsv'), sep='\t')
patients = patients.reset_index(drop=True)

patients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1916 entries, 0 to 1915
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Sample.ID        1916 non-null   object 
 1   Group            1916 non-null   object 
 2   Stage            1648 non-null   object 
 3   Sex              1898 non-null   object 
 4   Age              1898 non-null   object 
 5   Lib.size         1916 non-null   int64  
 6   Description      107 non-null    object 
 7   Comments         0 non-null      float64
 8   IsNew            1916 non-null   object 
 9   PotentialIssues  36 non-null     object 
 10  TR               1916 non-null   object 
 11  RealLocation     1916 non-null   object 
 12  MultiGroup       1916 non-null   object 
 13  MultiGroup2      1916 non-null   object 
 14  MultiGroup3      1916 non-null   object 
 15  TrainTest        1916 non-null   object 
dtypes: float64(1), int64(1), object(14)
memory usage: 239.6+ KB


Check for different Stage and Group values

In [425]:
patients['Stage'].value_counts(dropna=False)

n.a.    794
IV      497
NaN     268
III     152
II      125
I        80
Name: Stage, dtype: int64

In [426]:
patients['Group'].value_counts(dropna=False)

NSCLC                             567
Asymptomatic controls             405
Pulmonary Hypertension            175
Ovarian cancer                    133
Glioma                            128
Pancreatic cancer                 123
Cholangiocarcinoma                 83
Multiple sclerosis                 83
Colorectal cancer                  80
Medically-intractable epilepsy     43
Endometrial cancer                 38
Angina pectoris                    26
Hepatocellular carcinoma           22
Esophageal carcinoma               10
Name: Group, dtype: int64

Filter out the "Asymptomatic control" values of Group and cases with undefined Stage

In [427]:
valid_stages = ('I', 'II', 'III', 'IV')
patients = patients.loc[patients['Stage'].isin(valid_stages)]

patients = patients[~((patients['Stage'] == 'IV') & (patients['Group'] == 'NSCLC'))]

patients['Stage'].value_counts(dropna=False)

IV     167
III    152
II     125
I       80
Name: Stage, dtype: int64

In [428]:
patients['Group'].value_counts(dropna=False)

Ovarian cancer              126
Pancreatic cancer           122
Cholangiocarcinoma           80
NSCLC                        74
Colorectal cancer            63
Endometrial cancer           36
Hepatocellular carcinoma     14
Esophageal carcinoma          9
Name: Group, dtype: int64

Change data type of Age column to numeric

Change missing value markers to None

Rename column with patient IDs

Select only subset of available columns

In [429]:
patients['Age'] = pd.to_numeric(patients['Age'], errors='coerce')
patients.loc[patients['Sex'] == 'n.a.', 'Sex'] = None
patients = patients.rename(columns={'Sample.ID': 'ID'})

cols = [
    'ID',
    'Group',
    'Sex',
    'Age',
    'Stage'
]
patients = patients.loc[:, cols]
patients.info()
patients

<class 'pandas.core.frame.DataFrame'>
Int64Index: 524 entries, 353 to 1755
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      524 non-null    object 
 1   Group   524 non-null    object 
 2   Sex     524 non-null    object 
 3   Age     523 non-null    float64
 4   Stage   524 non-null    object 
dtypes: float64(1), object(4)
memory usage: 24.6+ KB


Unnamed: 0,ID,Group,Sex,Age,Stage
353,Vumc-NSCLC-859-TR2503,NSCLC,F,54.0,II
354,Vumc-NSCLC-577-TR2483,NSCLC,F,57.0,III
355,MGH-NSCLC-L217-TR2499,NSCLC,M,79.0,III
356,Vumc-exIPMN-3-TR1042,Pancreatic cancer,F,63.0,IV
357,Vumc-PDAC-18-TR986,Pancreatic cancer,M,65.0,I
...,...,...,...,...,...
1507,VU-136-201713693-PANC-TR3409,Pancreatic cancer,M,81.0,IV
1508,VUMC-141-2017432-PANC-TR3421,Pancreatic cancer,M,74.0,III
1637,Vumc-NSCLC-216-TR874,NSCLC,M,68.0,II
1753,TR4277,Ovarian cancer,F,67.0,III


Load and transpose a dataframe with marker values

In [430]:
markers = pd.read_csv(os.path.join(raw_data_path, 'Counts_prefiltered_multiclass_2022-10-14.tsv'), sep='\t')
markers = markers.T.reset_index(names='ID')
markers

Unnamed: 0,ID,ENSG00000000419,ENSG00000000938,ENSG00000001036,ENSG00000001461,ENSG00000001629,ENSG00000001631,ENSG00000002330,ENSG00000002549,ENSG00000002586,...,ENSG00000257267,ENSG00000257923,ENSG00000258890,ENSG00000263563,ENSG00000264538,ENSG00000266356,ENSG00000266714,ENSG00000269028,ENSG00000271043,ENSG00000272168
0,Vumc-HD-101-TR922,3.064289,3.834176,4.171537,4.737304,4.272177,3.940969,4.418057,4.668012,10.759357,...,6.788573,8.077838,3.546347,4.328872,4.328872,4.803181,3.128490,7.739557,6.582879,4.418057
1,Vumc-HD-103-TR923,5.194380,6.964049,4.644469,3.838500,3.951551,5.386353,4.537357,5.478881,10.215786,...,6.073116,6.388674,5.337690,4.444854,3.877458,5.152584,5.686621,7.055870,5.815763,3.951551
2,Vumc-HD-108-TR924,5.387337,7.608523,4.097419,3.871438,5.966998,4.877867,4.097419,5.992483,9.772417,...,5.789179,7.257840,4.932819,4.490325,3.807177,4.932819,6.549959,7.091888,6.042124,3.871438
3,Vumc-HD-127-TR925,6.584300,5.626849,5.076153,3.865364,4.355678,5.188931,4.745318,5.215744,9.867106,...,6.150602,5.586682,5.390227,4.627846,4.707302,5.342562,6.746681,7.691876,6.080439,4.920776
4,Vumc-HD-130-TR926,5.684044,5.990387,4.338011,4.072761,4.029651,4.994614,4.693579,5.862317,9.949440,...,6.760555,5.605931,5.292285,4.527899,3.839155,5.188022,3.786024,8.199582,6.456418,5.275442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1911,TR3298-HD-NKI,5.015461,9.866231,6.145266,5.179213,5.179213,5.890240,4.829431,6.662208,9.877926,...,5.015461,8.129104,6.221056,3.575926,5.179213,3.575926,7.724969,3.575926,2.377126,3.575926
1912,TR3481-HD-NKI,6.331082,9.694728,5.875447,5.577790,5.199813,5.401527,5.086781,7.410313,9.361917,...,4.312592,7.583496,6.889398,4.507972,4.507972,3.397669,6.855973,3.397669,3.397669,3.397669
1913,TR3300-HD-NKI,5.133399,10.264108,6.606876,5.278483,4.573871,5.743199,4.787213,8.089368,9.484785,...,5.640600,8.164524,6.167757,4.573871,5.278483,2.377126,7.178693,2.377126,3.551991,2.377126
1914,TR3296-HD-NKI,4.776702,9.413021,5.143608,5.517676,5.517676,6.056900,4.041595,6.753366,9.797785,...,5.940070,7.886525,5.940070,4.461142,5.246952,4.041595,7.406943,3.372442,2.377126,3.372442


In [431]:
col1_values = patients['ID'].tolist()
# usunięcie wierszy, w których wartość w kolumnie ID nie jest zawarta w kolumnie ID obiektu patients
markers = markers[markers['ID'].isin(col1_values)]
markers.info()
markers.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 524 entries, 353 to 1755
Columns: 4394 entries, ID to ENSG00000272168
dtypes: float64(4393), object(1)
memory usage: 17.6+ MB


Unnamed: 0,ID,ENSG00000000419,ENSG00000000938,ENSG00000001036,ENSG00000001461,ENSG00000001629,ENSG00000001631,ENSG00000002330,ENSG00000002549,ENSG00000002586,...,ENSG00000257267,ENSG00000257923,ENSG00000258890,ENSG00000263563,ENSG00000264538,ENSG00000266356,ENSG00000266714,ENSG00000269028,ENSG00000271043,ENSG00000272168
353,Vumc-NSCLC-859-TR2503,3.307837,4.57312,3.672354,2.377126,5.096781,4.57312,4.499892,4.57312,10.970645,...,6.205741,7.828871,3.040841,3.939752,3.307837,5.399266,3.040841,6.466265,4.83115,4.499892
354,Vumc-NSCLC-577-TR2483,2.377126,3.145751,4.16549,2.377126,3.679544,3.044703,4.320885,5.008924,10.636164,...,6.305996,8.179829,4.676069,4.19817,3.145751,4.459183,3.451994,6.66485,5.232786,3.776635
355,MGH-NSCLC-L217-TR2499,3.214897,5.574623,3.546366,3.214897,3.0643,3.214897,4.161032,4.525468,11.384643,...,6.203203,8.270375,3.214897,4.050328,3.861865,5.574623,5.220746,5.930566,4.777251,4.212811
356,Vumc-exIPMN-3-TR1042,4.404545,5.523321,5.703005,2.377126,4.781532,5.523321,2.377126,5.523321,9.893723,...,6.135891,6.135891,5.703005,2.377126,4.781532,4.404545,2.377126,8.796359,7.829125,5.075345
357,Vumc-PDAC-18-TR986,3.797501,4.348074,4.539679,4.564876,5.110564,3.46851,4.706699,4.875011,10.746826,...,5.500011,8.065401,3.843513,3.749458,3.699152,4.513986,5.159629,6.510152,5.237725,4.461025


Sprawdzanie które geny mają najwięcej powtarzających się wartości

In [432]:
# list_most_frequent_counts = []

# # iteracja po kolumnach
# for column in markers.columns:
#     # zliczenie wystąpień każdej unikalnej wartości w kolumnie i wybór najczęściej występującej
#     value_counts = markers[column].value_counts()
#     most_frequent_value = value_counts.idxmax()
#     most_frequent_count = value_counts.max()
#     # wyświetlenie informacji o liczbie wystąpień najczęściej powtarzającej się wartości w kolumnie
#     print(f"Najczęściej występująca wartość w kolumnie {column}: {most_frequent_value}")
#     # dodanie słownika z nazwą kolumny i liczbą wystąpień najczęściej powtarzającej się wartości do listy
#     list_most_frequent_counts.append({'Column': column, 'Most Frequent Value': most_frequent_value, 'Count': most_frequent_count})

# # utworzenie obiektu DataFrame z listy słowników
# most_frequent_counts = pd.DataFrame(list_most_frequent_counts)

# # posortowanie DataFrame malejąco względem liczby wystąpień najczęściej powtarzającej się wartości
# most_frequent_counts = most_frequent_counts.sort_values(by='Count', ascending=True)

# most_freq = most_frequent_counts.head(700)

# most_freq

Ograniczenie liczby genów do 200, w których wartości najczęściej się powtarzały

In [433]:
# columns_to_keep = most_freq[most_freq['Column'] != 'ID']['Column'].tolist()
# columns_to_keep = markers[['ID'] + columns_to_keep]
# columns_to_keep

Merge patients with their markers values

In [434]:
# połączenie obiektów patients i markers po kolumnie ID
merged = patients.merge(markers, on='ID', how='left')
merged.info()
merged.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 524 entries, 0 to 523
Columns: 4398 entries, ID to ENSG00000272168
dtypes: float64(4394), object(4)
memory usage: 17.6+ MB


Unnamed: 0,ID,Group,Sex,Age,Stage,ENSG00000000419,ENSG00000000938,ENSG00000001036,ENSG00000001461,ENSG00000001629,...,ENSG00000257267,ENSG00000257923,ENSG00000258890,ENSG00000263563,ENSG00000264538,ENSG00000266356,ENSG00000266714,ENSG00000269028,ENSG00000271043,ENSG00000272168
0,Vumc-NSCLC-859-TR2503,NSCLC,F,54.0,II,3.307837,4.57312,3.672354,2.377126,5.096781,...,6.205741,7.828871,3.040841,3.939752,3.307837,5.399266,3.040841,6.466265,4.83115,4.499892
1,Vumc-NSCLC-577-TR2483,NSCLC,F,57.0,III,2.377126,3.145751,4.16549,2.377126,3.679544,...,6.305996,8.179829,4.676069,4.19817,3.145751,4.459183,3.451994,6.66485,5.232786,3.776635
2,MGH-NSCLC-L217-TR2499,NSCLC,M,79.0,III,3.214897,5.574623,3.546366,3.214897,3.0643,...,6.203203,8.270375,3.214897,4.050328,3.861865,5.574623,5.220746,5.930566,4.777251,4.212811
3,Vumc-exIPMN-3-TR1042,Pancreatic cancer,F,63.0,IV,4.404545,5.523321,5.703005,2.377126,4.781532,...,6.135891,6.135891,5.703005,2.377126,4.781532,4.404545,2.377126,8.796359,7.829125,5.075345
4,Vumc-PDAC-18-TR986,Pancreatic cancer,M,65.0,I,3.797501,4.348074,4.539679,4.564876,5.110564,...,5.500011,8.065401,3.843513,3.749458,3.699152,4.513986,5.159629,6.510152,5.237725,4.461025


+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

In [435]:
merged.insert(1, 'Groups_and_Stages', merged.apply(lambda row: f"{row['Group'].replace(' ', '_')}_{row['Stage']}", axis=1))
groups_and_stages_counts = merged['Groups_and_Stages'].value_counts()
rare_combinations = groups_and_stages_counts[groups_and_stages_counts < 10].index
merged = merged[~merged['Groups_and_Stages'].isin(rare_combinations)]

train, test_valid = train_test_split(merged, test_size=0.3, stratify=merged['Groups_and_Stages'])
test, valid = train_test_split(test_valid, test_size=0.5, stratify=test_valid['Groups_and_Stages'])

valid = test.drop('Groups_and_Stages', axis=1)
test = test.drop('Groups_and_Stages', axis=1)

In [436]:
train

Unnamed: 0,ID,Groups_and_Stages,Group,Sex,Age,Stage,ENSG00000000419,ENSG00000000938,ENSG00000001036,ENSG00000001461,...,ENSG00000257267,ENSG00000257923,ENSG00000258890,ENSG00000263563,ENSG00000264538,ENSG00000266356,ENSG00000266714,ENSG00000269028,ENSG00000271043,ENSG00000272168
402,MGH-OVARY-O14-TR657,Ovarian_cancer_IV,Ovarian cancer,F,60.0,IV,4.123579,4.730750,4.598584,4.961433,...,6.599291,8.198407,4.064350,4.180037,3.669296,4.598584,2.879468,7.002271,5.484797,4.730750
468,Vumc-PDAC94-2015610-TR2061,Pancreatic_cancer_II,Pancreatic cancer,F,74.0,II,3.568195,4.244688,4.343439,4.060197,...,6.691788,7.830201,4.244688,4.174209,4.174209,5.456520,2.377126,6.662367,5.601376,4.311378
294,Vumc-NSCLC-222-TR2534,NSCLC_III,NSCLC,M,51.0,III,3.815980,5.237467,3.414736,3.815980,...,6.106538,7.710269,2.377126,4.107672,4.862854,4.341953,4.710946,6.999917,5.342728,4.710946
25,VUMC-47-201712743-PANC-TR3369,Pancreatic_cancer_II,Pancreatic cancer,F,56.0,II,2.377126,2.377126,5.121642,5.298350,...,6.289725,8.211384,4.494010,4.276018,3.668403,5.183082,4.150971,3.442615,4.011545,5.057338
398,MGH-OVARY-602-TR562,Ovarian_cancer_IV,Ovarian cancer,F,65.0,IV,2.377126,4.029537,3.998020,4.651691,...,6.534158,8.398588,3.163482,4.845566,4.090154,4.955841,2.989202,7.490475,5.751358,3.998020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,Cath-Ova-CZE-007-TR2275,Ovarian_cancer_I,Ovarian cancer,F,49.0,I,3.809381,5.971066,5.031634,5.112959,...,6.783052,8.722117,4.010985,4.530373,3.914653,4.701367,4.899811,6.910894,5.660805,3.809381
278,NKI-NSCLC-4202-TR2520,NSCLC_III,NSCLC,F,76.0,III,3.791906,7.430569,3.791906,4.961684,...,6.119035,6.748289,4.675944,3.791906,5.197763,3.791906,7.475150,7.129505,5.399398,2.377126
483,Vumc-PDAC-72-20161131-TR2673,Pancreatic_cancer_II,Pancreatic cancer,M,54.0,II,2.377126,4.344198,4.713482,4.192925,...,6.604351,8.547136,2.377126,4.413523,4.109707,5.621058,3.568695,6.223333,4.658716,3.231315
210,TR3842-CRC-MGH,Colorectal_cancer_IV,Colorectal cancer,M,82.0,IV,2.915058,5.282558,4.000997,4.139707,...,6.794135,8.258427,3.755583,4.203498,5.197275,4.867574,4.867574,3.843513,2.377126,4.377263


In [437]:
merged.apply(lambda row: f"{row['Group'].replace(' ', '_')}_{row['Stage']}", axis=1).value_counts()

Pancreatic_cancer_II           64
Colorectal_cancer_IV           54
NSCLC_III                      46
Cholangiocarcinoma_IV          44
Ovarian_cancer_III             43
Ovarian_cancer_IV              35
Pancreatic_cancer_III          31
Ovarian_cancer_I               31
Pancreatic_cancer_IV           24
Endometrial_cancer_I           24
Cholangiocarcinoma_II          22
Ovarian_cancer_II              17
NSCLC_I                        15
NSCLC_II                       13
Hepatocellular_carcinoma_IV    10
dtype: int64

In [438]:
train.apply(lambda row: f"{row['Group'].replace(' ', '_')}_{row['Stage']}", axis=1).value_counts()

Pancreatic_cancer_II           45
Colorectal_cancer_IV           38
NSCLC_III                      32
Cholangiocarcinoma_IV          31
Ovarian_cancer_III             30
Ovarian_cancer_IV              24
Pancreatic_cancer_III          22
Ovarian_cancer_I               22
Endometrial_cancer_I           17
Pancreatic_cancer_IV           17
Cholangiocarcinoma_II          15
Ovarian_cancer_II              12
NSCLC_I                        10
NSCLC_II                        9
Hepatocellular_carcinoma_IV     7
dtype: int64

In [439]:
valid.apply(lambda row: f"{row['Group'].replace(' ', '_')}_{row['Stage']}", axis=1).value_counts()

Pancreatic_cancer_II           10
Colorectal_cancer_IV            8
Cholangiocarcinoma_IV           7
Ovarian_cancer_III              7
NSCLC_III                       7
Pancreatic_cancer_III           5
Ovarian_cancer_I                5
Ovarian_cancer_IV               5
Cholangiocarcinoma_II           3
Endometrial_cancer_I            3
Pancreatic_cancer_IV            3
Hepatocellular_carcinoma_IV     2
NSCLC_I                         2
NSCLC_II                        2
Ovarian_cancer_II               2
dtype: int64

In [440]:
test.apply(lambda row: f"{row['Group'].replace(' ', '_')}_{row['Stage']}", axis=1).value_counts()

Pancreatic_cancer_II           10
Colorectal_cancer_IV            8
Cholangiocarcinoma_IV           7
Ovarian_cancer_III              7
NSCLC_III                       7
Pancreatic_cancer_III           5
Ovarian_cancer_I                5
Ovarian_cancer_IV               5
Cholangiocarcinoma_II           3
Endometrial_cancer_I            3
Pancreatic_cancer_IV            3
Hepatocellular_carcinoma_IV     2
NSCLC_I                         2
NSCLC_II                        2
Ovarian_cancer_II               2
dtype: int64

In [441]:
print(f'Train set size: {len(train)}')
print(f'Validation set size: {len(test)}')
print(f'Test set size: {len(test)}')

Train set size: 331
Validation set size: 71
Test set size: 71


In [442]:
cleaned_data_path = '~/Documents/STUDIA/Projekt_badawczy/moje/TEPS_Data_preparation_data_sample_info/cleaned'

valid.to_csv(os.path.join(cleaned_data_path, 'valid.csv'), index=False, sep=';')
test.to_csv(os.path.join(cleaned_data_path, 'test.csv'), index=False, sep=';')

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

In [443]:
ros = RandomOverSampler(random_state=42)

# Przygotowanie danych do upsamplingu
X = train.drop('Groups_and_Stages', axis=1)  # Przykład: usuwanie kolumny 'Stage' (zależy od struktury danych)
y = train['Groups_and_Stages']

# Upsampling danych
X_upsampled, y_upsampled = ros.fit_resample(X, y)

# Tworzenie zbalansowanego DataFrame
balanced_train = X_upsampled.copy()
balanced_train['Groups_and_Stages'] = y_upsampled

train_up = balanced_train
train_up = train_up.drop('Groups_and_Stages', axis=1)
train_up.apply(lambda row: f"{row['Group'].replace(' ', '_')}_{row['Stage']}", axis=1).value_counts()


Ovarian_cancer_IV              45
Pancreatic_cancer_II           45
NSCLC_III                      45
Ovarian_cancer_II              45
Cholangiocarcinoma_II          45
Colorectal_cancer_IV           45
Ovarian_cancer_III             45
Endometrial_cancer_I           45
NSCLC_I                        45
Pancreatic_cancer_III          45
Ovarian_cancer_I               45
Cholangiocarcinoma_IV          45
Hepatocellular_carcinoma_IV    45
Pancreatic_cancer_IV           45
NSCLC_II                       45
dtype: int64

In [444]:
train_up2 = train_up
train_up2['Stage1'] = train_up['Stage'].apply(lambda x: 1 if x == 'I' else 0)
train_up2['Stage2'] = train_up['Stage'].apply(lambda x: 1 if x == 'II' else 0)
train_up2['Stage3'] = train_up['Stage'].apply(lambda x: 1 if x == 'III' else 0)
train_up2['Stage4'] = train_up['Stage'].apply(lambda x: 1 if x == 'IV' else 0)
train_up2

Unnamed: 0,ID,Group,Sex,Age,Stage,ENSG00000000419,ENSG00000000938,ENSG00000001036,ENSG00000001461,ENSG00000001629,...,ENSG00000264538,ENSG00000266356,ENSG00000266714,ENSG00000269028,ENSG00000271043,ENSG00000272168,Stage1,Stage2,Stage3,Stage4
0,MGH-OVARY-O14-TR657,Ovarian cancer,F,60.0,IV,4.123579,4.730750,4.598584,4.961433,4.817730,...,3.669296,4.598584,2.879468,7.002271,5.484797,4.730750,0,0,0,1
1,Vumc-PDAC94-2015610-TR2061,Pancreatic cancer,F,74.0,II,3.568195,4.244688,4.343439,4.060197,4.676375,...,4.174209,5.456520,2.377126,6.662367,5.601376,4.311378,0,1,0,0
2,Vumc-NSCLC-222-TR2534,NSCLC,M,51.0,III,3.815980,5.237467,3.414736,3.815980,4.107672,...,4.862854,4.341953,4.710946,6.999917,5.342728,4.710946,0,0,1,0
3,VUMC-47-201712743-PANC-TR3369,Pancreatic cancer,F,56.0,II,2.377126,2.377126,5.121642,5.298350,4.764777,...,3.668403,5.183082,4.150971,3.442615,4.011545,5.057338,0,1,0,0
4,MGH-OVARY-602-TR562,Ovarian cancer,F,65.0,IV,2.377126,4.029537,3.998020,4.651691,4.175696,...,4.090154,4.955841,2.989202,7.490475,5.751358,3.998020,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670,VUMC-76-2017057-PANC-TR3486,Pancreatic cancer,F,76.0,IV,4.799795,5.993860,3.152253,2.377126,5.280986,...,4.526273,6.158948,2.377126,4.305809,3.152253,4.305809,0,0,0,1
671,Vumc-PDAC-36-TR993,Pancreatic cancer,M,61.0,IV,4.493779,5.385855,4.036269,4.769727,4.373153,...,4.308275,4.394062,4.239865,6.041902,4.971842,4.971842,0,0,0,1
672,Vumc-PDAC-36-TR993,Pancreatic cancer,M,61.0,IV,4.493779,5.385855,4.036269,4.769727,4.373153,...,4.308275,4.394062,4.239865,6.041902,4.971842,4.971842,0,0,0,1
673,VUMC-76-2017057-PANC-TR3486,Pancreatic cancer,F,76.0,IV,4.799795,5.993860,3.152253,2.377126,5.280986,...,4.526273,6.158948,2.377126,4.305809,3.152253,4.305809,0,0,0,1


In [445]:
patients_up = train_up2.iloc[:, :5].copy()
patients_up

Unnamed: 0,ID,Group,Sex,Age,Stage
0,MGH-OVARY-O14-TR657,Ovarian cancer,F,60.0,IV
1,Vumc-PDAC94-2015610-TR2061,Pancreatic cancer,F,74.0,II
2,Vumc-NSCLC-222-TR2534,NSCLC,M,51.0,III
3,VUMC-47-201712743-PANC-TR3369,Pancreatic cancer,F,56.0,II
4,MGH-OVARY-602-TR562,Ovarian cancer,F,65.0,IV
...,...,...,...,...,...
670,VUMC-76-2017057-PANC-TR3486,Pancreatic cancer,F,76.0,IV
671,Vumc-PDAC-36-TR993,Pancreatic cancer,M,61.0,IV
672,Vumc-PDAC-36-TR993,Pancreatic cancer,M,61.0,IV
673,VUMC-76-2017057-PANC-TR3486,Pancreatic cancer,F,76.0,IV


In [446]:
#macierz korelacji
train_up2 = train_up2.drop(["Group", "Stage", "Sex", "Age"], axis=1)
correlation_matrix = train_up2.corr()

  correlation_matrix = train_up2.corr()


In [447]:
# Korelacje dla kazdego stage-a
stage_correlation1 = correlation_matrix['Stage1']
stage_correlation1 = stage_correlation1.sort_values(ascending=False)
stage_correlation1 = stage_correlation1.abs()
stage_correlation1 = stage_correlation1.head(200)

In [448]:
# Korelacje dla kazdego stage-a
stage_correlation2 = correlation_matrix['Stage2']
stage_correlation2 = stage_correlation2.sort_values(ascending=False)
stage_correlation2 = stage_correlation2.abs()
stage_correlation2 = stage_correlation2.head(200)

In [449]:
# Korelacje dla kazdego stage-a
stage_correlation3 = correlation_matrix['Stage3']
stage_correlation3 = stage_correlation3.sort_values(ascending=False)
stage_correlation3 = stage_correlation3.abs()
stage_correlation3 = stage_correlation3.head(200)

In [450]:
# Korelacje dla kazdego stage-a
stage_correlation4 = correlation_matrix['Stage4']
stage_correlation4 = stage_correlation4.sort_values(ascending=False)
stage_correlation4 = stage_correlation4.abs()
stage_correlation4 = stage_correlation4.head(200)

In [451]:
df1 = stage_correlation1.to_frame(name='kor')
df2 = stage_correlation2.to_frame(name='kor')
df3 = stage_correlation3.to_frame(name='kor')
df4 = stage_correlation4.to_frame(name='kor')

# common_rows = df2.merge(df3, left_index=True, right_index=True).index
# sorted_df = common_rows.to_frame(name='kor')

sum_df = df1.add(df2, fill_value=0)
sum_df = sum_df.add(df3, fill_value=0)
sum_df = sum_df.add(df4, fill_value=0)

# Kontynuuj dodawanie pozostałych DataFrame'ów, jeśli istnieją
sorted_df = sum_df.sort_values(by="kor", ascending=False)
sorted_df = sorted_df.drop(['Stage1', 'Stage2', 'Stage3', 'Stage4'])
sorted_df = sorted_df.head(200)
sorted_df

Unnamed: 0,kor
ENSG00000198265,0.442234
ENSG00000126247,0.414595
ENSG00000141068,0.401995
ENSG00000070756,0.396361
ENSG00000145675,0.386396
...,...
ENSG00000091140,0.242659
ENSG00000180182,0.242524
ENSG00000122417,0.242210
ENSG00000166337,0.242135


In [452]:
# common_columns = sorted_df.index.intersection(columns_to_keep.columns)
# new_dataframe = columns_to_keep.loc[:, common_columns]
# new_dataframe

In [453]:
train_up

Unnamed: 0,ID,Group,Sex,Age,Stage,ENSG00000000419,ENSG00000000938,ENSG00000001036,ENSG00000001461,ENSG00000001629,...,ENSG00000264538,ENSG00000266356,ENSG00000266714,ENSG00000269028,ENSG00000271043,ENSG00000272168,Stage1,Stage2,Stage3,Stage4
0,MGH-OVARY-O14-TR657,Ovarian cancer,F,60.0,IV,4.123579,4.730750,4.598584,4.961433,4.817730,...,3.669296,4.598584,2.879468,7.002271,5.484797,4.730750,0,0,0,1
1,Vumc-PDAC94-2015610-TR2061,Pancreatic cancer,F,74.0,II,3.568195,4.244688,4.343439,4.060197,4.676375,...,4.174209,5.456520,2.377126,6.662367,5.601376,4.311378,0,1,0,0
2,Vumc-NSCLC-222-TR2534,NSCLC,M,51.0,III,3.815980,5.237467,3.414736,3.815980,4.107672,...,4.862854,4.341953,4.710946,6.999917,5.342728,4.710946,0,0,1,0
3,VUMC-47-201712743-PANC-TR3369,Pancreatic cancer,F,56.0,II,2.377126,2.377126,5.121642,5.298350,4.764777,...,3.668403,5.183082,4.150971,3.442615,4.011545,5.057338,0,1,0,0
4,MGH-OVARY-602-TR562,Ovarian cancer,F,65.0,IV,2.377126,4.029537,3.998020,4.651691,4.175696,...,4.090154,4.955841,2.989202,7.490475,5.751358,3.998020,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670,VUMC-76-2017057-PANC-TR3486,Pancreatic cancer,F,76.0,IV,4.799795,5.993860,3.152253,2.377126,5.280986,...,4.526273,6.158948,2.377126,4.305809,3.152253,4.305809,0,0,0,1
671,Vumc-PDAC-36-TR993,Pancreatic cancer,M,61.0,IV,4.493779,5.385855,4.036269,4.769727,4.373153,...,4.308275,4.394062,4.239865,6.041902,4.971842,4.971842,0,0,0,1
672,Vumc-PDAC-36-TR993,Pancreatic cancer,M,61.0,IV,4.493779,5.385855,4.036269,4.769727,4.373153,...,4.308275,4.394062,4.239865,6.041902,4.971842,4.971842,0,0,0,1
673,VUMC-76-2017057-PANC-TR3486,Pancreatic cancer,F,76.0,IV,4.799795,5.993860,3.152253,2.377126,5.280986,...,4.526273,6.158948,2.377126,4.305809,3.152253,4.305809,0,0,0,1


In [454]:
columns_to_drop = train_up.columns.difference(sorted_df.index)
train_up_markers = train_up.drop(columns=columns_to_drop)

train_up_markers = train_up_markers.reset_index(drop=True)

train_up_markers


Unnamed: 0,ENSG00000005483,ENSG00000007264,ENSG00000011243,ENSG00000013441,ENSG00000015479,ENSG00000025796,ENSG00000027697,ENSG00000029363,ENSG00000033800,ENSG00000038358,...,ENSG00000196235,ENSG00000196352,ENSG00000197102,ENSG00000197111,ENSG00000198265,ENSG00000198730,ENSG00000204256,ENSG00000206503,ENSG00000215301,ENSG00000234745
0,6.760752,4.637710,4.335376,4.429212,7.721067,5.247634,5.331243,4.406392,5.145481,2.377126,...,4.180037,6.810775,3.424962,4.152136,4.817730,6.859115,4.730750,9.905855,5.942195,10.29367
1,7.086522,3.503420,4.747916,4.405227,7.041573,5.001220,5.732724,4.374708,5.810376,2.377126,...,4.770937,6.430593,3.275728,5.001220,4.626449,7.296173,5.198252,9.230371,5.732724,10.28781
2,6.514789,5.440606,2.377126,4.107672,7.121326,5.848196,4.539463,4.710946,2.377126,3.414736,...,4.862854,4.341953,5.123567,6.220037,5.342728,6.642159,4.999412,10.014330,4.539463,10.35747
3,8.114363,4.389668,2.377126,3.138900,6.793738,4.844042,3.668403,3.853205,5.298350,2.377126,...,4.989877,6.608413,3.853205,6.204885,5.803752,6.233728,4.918914,10.635340,2.377126,11.62183
4,6.494076,4.029537,4.401644,4.060232,6.684140,4.925236,5.178803,4.090154,4.877998,3.476244,...,4.490442,6.625087,3.788190,4.378450,4.829053,5.874179,4.955841,10.298640,5.414900,10.98856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670,7.815812,4.305809,3.690108,6.095188,7.328718,5.337842,5.159786,4.799795,6.362052,2.377126,...,4.879803,6.028443,4.179242,5.445050,5.591909,7.243444,5.445050,9.978416,5.993860,10.64125
671,7.683015,4.285885,5.159597,5.904136,7.296633,5.712684,5.631557,5.271934,6.653194,4.722174,...,6.085014,6.447420,5.239177,5.124039,5.518599,6.747888,5.811638,9.308583,6.564880,10.28469
672,7.683015,4.285885,5.159597,5.904136,7.296633,5.712684,5.631557,5.271934,6.653194,4.722174,...,6.085014,6.447420,5.239177,5.124039,5.518599,6.747888,5.811638,9.308583,6.564880,10.28469
673,7.815812,4.305809,3.690108,6.095188,7.328718,5.337842,5.159786,4.799795,6.362052,2.377126,...,4.879803,6.028443,4.179242,5.445050,5.591909,7.243444,5.445050,9.978416,5.993860,10.64125


In [455]:
patients_up = patients_up.reset_index(drop=True)
# patients

In [457]:
combined_df = patients_up.join(train_up_markers)
combined_df.info()
combined_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 675 entries, 0 to 674
Columns: 205 entries, ID to ENSG00000234745
dtypes: float64(201), object(4)
memory usage: 1.1+ MB


Unnamed: 0,ID,Group,Sex,Age,Stage,ENSG00000005483,ENSG00000007264,ENSG00000011243,ENSG00000013441,ENSG00000015479,...,ENSG00000196235,ENSG00000196352,ENSG00000197102,ENSG00000197111,ENSG00000198265,ENSG00000198730,ENSG00000204256,ENSG00000206503,ENSG00000215301,ENSG00000234745
0,MGH-OVARY-O14-TR657,Ovarian cancer,F,60.0,IV,6.760752,4.637710,4.335376,4.429212,7.721067,...,4.180037,6.810775,3.424962,4.152136,4.817730,6.859115,4.730750,9.905855,5.942195,10.29367
1,Vumc-PDAC94-2015610-TR2061,Pancreatic cancer,F,74.0,II,7.086522,3.503420,4.747916,4.405227,7.041573,...,4.770937,6.430593,3.275728,5.001220,4.626449,7.296173,5.198252,9.230371,5.732724,10.28781
2,Vumc-NSCLC-222-TR2534,NSCLC,M,51.0,III,6.514789,5.440606,2.377126,4.107672,7.121326,...,4.862854,4.341953,5.123567,6.220037,5.342728,6.642159,4.999412,10.014330,4.539463,10.35747
3,VUMC-47-201712743-PANC-TR3369,Pancreatic cancer,F,56.0,II,8.114363,4.389668,2.377126,3.138900,6.793738,...,4.989877,6.608413,3.853205,6.204885,5.803752,6.233728,4.918914,10.635340,2.377126,11.62183
4,MGH-OVARY-602-TR562,Ovarian cancer,F,65.0,IV,6.494076,4.029537,4.401644,4.060232,6.684140,...,4.490442,6.625087,3.788190,4.378450,4.829053,5.874179,4.955841,10.298640,5.414900,10.98856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670,VUMC-76-2017057-PANC-TR3486,Pancreatic cancer,F,76.0,IV,7.815812,4.305809,3.690108,6.095188,7.328718,...,4.879803,6.028443,4.179242,5.445050,5.591909,7.243444,5.445050,9.978416,5.993860,10.64125
671,Vumc-PDAC-36-TR993,Pancreatic cancer,M,61.0,IV,7.683015,4.285885,5.159597,5.904136,7.296633,...,6.085014,6.447420,5.239177,5.124039,5.518599,6.747888,5.811638,9.308583,6.564880,10.28469
672,Vumc-PDAC-36-TR993,Pancreatic cancer,M,61.0,IV,7.683015,4.285885,5.159597,5.904136,7.296633,...,6.085014,6.447420,5.239177,5.124039,5.518599,6.747888,5.811638,9.308583,6.564880,10.28469
673,VUMC-76-2017057-PANC-TR3486,Pancreatic cancer,F,76.0,IV,7.815812,4.305809,3.690108,6.095188,7.328718,...,4.879803,6.028443,4.179242,5.445050,5.591909,7.243444,5.445050,9.978416,5.993860,10.64125


In [None]:
cleaned_data_path = '~/Documents/STUDIA/Projekt_badawczy/moje/TEPS_Data_preparation_data_sample_info/cleaned'

combined_df.to_csv(os.path.join(cleaned_data_path, 'train.csv'), index=False, sep=';')