# Data preparation (updated data)

This notebook performs basic preparation of the updated data and creates the train/test split used in future experiments.

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Seed passed to train_test_split below so the train/test split is reproducible.
random_state = 42

In [3]:
# Directory the raw input files (samples, counts, stable feature list) are read from.
raw_data_path = '../data/raw/updated/'
# Output directory for datasets that keep every marker column.
cleaned_data_path = '../data/cleaned/updated/all_features'
# Output directory for datasets restricted to the stable feature list.
cleaned_filtered_data_path = '../data/cleaned/updated/stable_features'

## Data loading and preparation

File with information about patients

In [4]:
# Load the tab-separated sample sheet, discard the columns that are superseded
# by the renamed ones, and normalise the remaining column names.
patients = (
    pd.read_csv(os.path.join(raw_data_path, 'samplesStage.csv'), sep='\t')
    .drop(columns=['X', 'Group', 'stage'])
    .rename(columns={'id': 'ID', 'StageGroupped': 'Stage', 'ori_patientgroup': 'Group'})
)

patients.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1161 entries, 331 to 2313
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   1161 non-null   object 
 1   Group                1161 non-null   object 
 2   Age                  1161 non-null   object 
 3   Stage                1161 non-null   object 
 4   Sex                  1161 non-null   object 
 5   RealLocation         1161 non-null   object 
 6   isTraining           1161 non-null   int64  
 7   isValidation         1161 non-null   int64  
 8   isTest               1161 non-null   int64  
 9   lib.size             1161 non-null   int64  
 10  originalScoreBinary  1161 non-null   float64
 11  GroupAlternative     1161 non-null   object 
dtypes: float64(1), int64(4), object(7)
memory usage: 117.9+ KB


In [5]:
# Number of patients per stage; dropna=False makes missing stages show up as their own row.
patients['Stage'].value_counts(dropna=False)

IV     720
III    197
II     151
I       93
Name: Stage, dtype: int64

In [6]:
# Number of patients per group; dropna=False makes missing groups show up as their own row.
patients['Group'].value_counts(dropna=False)

NSCLC                  345
ovarianCancer          129
PDAC                   125
headAndNeck            101
breastCancer            91
Cholangiocarcinoma      82
CRC                     67
Melanoma                66
endometrialCancer       36
renalCellCarcinoma      28
urothelialCarcinoma     28
hodgkinLymphoma         18
prostateCancer          15
HCC                     15
esophagus               12
Sarcoma                  2
chronPancreatitis        1
Name: Group, dtype: int64

In [7]:
# Columns kept for the rest of the notebook, in their final order.
cols = ['ID', 'Group', 'Sex', 'Age', 'Stage']

# Convert Age to a numeric column; values that cannot be parsed become NaN
# because of errors='coerce'.
patients = patients.assign(Age=pd.to_numeric(patients['Age'], errors='coerce'))[cols]
patients.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1161 entries, 331 to 2313
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      1161 non-null   object 
 1   Group   1161 non-null   object 
 2   Sex     1161 non-null   object 
 3   Age     1160 non-null   float64
 4   Stage   1161 non-null   object 
dtypes: float64(1), object(4)
memory usage: 54.4+ KB


List of prefiltered (stable) feature names

In [8]:
# The feature list is read with an explicit column name (names=...), and the
# single resulting column is kept as a Series.
stable_features_file = os.path.join(raw_data_path, 'features_stable.csv')
stable_features = pd.read_csv(stable_features_file, sep='\t', names=['feature_names'])['feature_names']

stable_features.info()

<class 'pandas.core.series.Series'>
RangeIndex: 757 entries, 0 to 756
Series name: feature_names
Non-Null Count  Dtype 
--------------  ----- 
757 non-null    object
dtypes: object(1)
memory usage: 6.0+ KB


File with marker values

In [9]:
# Read the counts table and transpose it; the labels that were columns become
# the index, which reset_index exposes as the 'ID' column used to merge with
# `patients` later on.
markers_file = os.path.join(raw_data_path, 'countsStage.csv')
markers = pd.read_csv(markers_file, sep='\t').T.reset_index(names='ID')

markers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1161 entries, 0 to 1160
Columns: 5327 entries, ID to CASC15
dtypes: float64(5326), object(1)
memory usage: 47.2+ MB


In [10]:
# Restrict the marker table to the stable features, keeping 'ID' as the last
# column so the result can still be merged with `patients`.
features_to_keep = [*stable_features.tolist(), 'ID']
markers_filtered = markers[features_to_keep]

markers_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1161 entries, 0 to 1160
Columns: 758 entries, MEST to ID
dtypes: float64(757), object(1)
memory usage: 6.7+ MB


In [11]:
# Join patient metadata with the full marker table; the inner join keeps only
# IDs present in both.
df = patients.merge(markers, on='ID', how='inner')

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(cleaned_data_path, exist_ok=True)
df.to_csv(os.path.join(cleaned_data_path, 'dataset.csv'), index=False, sep=';')

In [12]:
# Join patient metadata with the stable-feature marker table; the inner join
# keeps only IDs present in both.
df_filtered = patients.merge(markers_filtered, on='ID', how='inner')

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(cleaned_filtered_data_path, exist_ok=True)
df_filtered.to_csv(os.path.join(cleaned_filtered_data_path, 'dataset.csv'), index=False, sep=';')

## Train/test split

In [13]:
# Stratification key: one "<Group>_<Stage>" label per patient, with spaces in
# the group name replaced by underscores.
groups_and_stages = pd.Series(
    [
        f"{group.replace(' ', '_')}_{stage}"
        for group, stage in zip(patients['Group'], patients['Stage'])
    ],
    index=patients.index,
)
groups_and_stages_counts = groups_and_stages.value_counts()

# Labels that occur exactly once are pooled under the shared label 'temp'.
# NOTE(review): if only a single label were a singleton, 'temp' would itself
# have one member and the stratified split would presumably fail - confirm
# this cannot happen with the expected input.
singleton_labels = groups_and_stages_counts[groups_and_stages_counts == 1].index
groups_and_stages = groups_and_stages.where(~groups_and_stages.isin(singleton_labels), 'temp')

train_patients, test_patients = train_test_split(
    patients,
    test_size=0.25,
    stratify=groups_and_stages,
    random_state=random_state,
)

In [14]:
# Relative frequency of each "<Group>_<Stage>" combination in the training split.
train_labels = [
    f"{group.replace(' ', '_')}_{stage}"
    for group, stage in zip(train_patients['Group'], train_patients['Stage'])
]
pd.Series(train_labels, index=train_patients.index).value_counts(normalize=True)

NSCLC_IV                   0.252874
PDAC_II                    0.056322
headAndNeck_IV             0.051724
CRC_IV                     0.049425
breastCancer_IV            0.049425
Melanoma_IV                0.049425
ovarianCancer_III          0.041379
Cholangiocarcinoma_IV      0.039080
NSCLC_III                  0.031034
ovarianCancer_IV           0.029885
PDAC_III                   0.026437
headAndNeck_III            0.025287
ovarianCancer_I            0.025287
renalCellCarcinoma_IV      0.024138
urothelialCarcinoma_IV     0.021839
PDAC_IV                    0.021839
endometrialCancer_I        0.020690
Cholangiocarcinoma_II      0.018391
breastCancer_II            0.014943
ovarianCancer_II           0.013793
prostateCancer_IV          0.012644
breastCancer_I             0.010345
HCC_IV                     0.009195
esophagus_III              0.008046
Cholangiocarcinoma_III     0.008046
NSCLC_I                    0.008046
endometrialCancer_III      0.006897
NSCLC_II                   0

In [16]:
# Relative frequency of each "<Group>_<Stage>" combination in the test split.
test_labels = [
    f"{group.replace(' ', '_')}_{stage}"
    for group, stage in zip(test_patients['Group'], test_patients['Stage'])
]
pd.Series(test_labels, index=test_patients.index).value_counts(normalize=True)

NSCLC_IV                  0.250859
PDAC_II                   0.058419
CRC_IV                    0.051546
Melanoma_IV               0.051546
headAndNeck_IV            0.051546
breastCancer_IV           0.048110
ovarianCancer_III         0.041237
Cholangiocarcinoma_IV     0.037801
ovarianCancer_IV          0.030928
NSCLC_III                 0.030928
ovarianCancer_I           0.027491
headAndNeck_III           0.027491
PDAC_III                  0.027491
renalCellCarcinoma_IV     0.024055
urothelialCarcinoma_IV    0.024055
Cholangiocarcinoma_II     0.020619
endometrialCancer_I       0.020619
PDAC_IV                   0.020619
breastCancer_II           0.017182
prostateCancer_IV         0.013746
ovarianCancer_II          0.013746
esophagus_III             0.010309
breastCancer_I            0.010309
NSCLC_I                   0.010309
HCC_IV                    0.010309
headAndNeck_II            0.006873
endometrialCancer_III     0.006873
Cholangiocarcinoma_III    0.006873
endometrialCancer_II

In [17]:
# Report how many patients ended up in each split.
n_train, n_test = len(train_patients), len(test_patients)
print(f'Train set size: {n_train}')
print(f'Test set size: {n_test}')

Train set size: 870
Test set size: 291


In [18]:
# Attach the full marker table to each patient split (inner join on 'ID').
train, test = (
    split_patients.merge(markers, on='ID', how='inner')
    for split_patients in (train_patients, test_patients)
)

# Persist both splits next to the full dataset.
train_csv = os.path.join(cleaned_data_path, 'train.csv')
test_csv = os.path.join(cleaned_data_path, 'test.csv')
train.to_csv(train_csv, index=False, sep=';')
test.to_csv(test_csv, index=False, sep=';')

In [19]:
# Attach the stable-feature marker table to each patient split (inner join on 'ID').
train_filtered, test_filtered = (
    split_patients.merge(markers_filtered, on='ID', how='inner')
    for split_patients in (train_patients, test_patients)
)

# Persist both splits next to the stable-feature dataset.
train_filtered_csv = os.path.join(cleaned_filtered_data_path, 'train.csv')
test_filtered_csv = os.path.join(cleaned_filtered_data_path, 'test.csv')
train_filtered.to_csv(train_filtered_csv, index=False, sep=';')
test_filtered.to_csv(test_filtered_csv, index=False, sep=';')

### Split samples of the most numerous cancer types into separate datasets

In [6]:
# Reload the all-features splits from disk so this section can be run without
# re-executing the preparation cells above.
train_csv = os.path.join(cleaned_data_path, 'train.csv')
test_csv = os.path.join(cleaned_data_path, 'test.csv')
train = pd.read_csv(train_csv, sep=';')
test = pd.read_csv(test_csv, sep=';')

In [10]:
# A group gets its own dataset only if it has more than this many samples in
# the training split (the count is taken on `train`, not on the full data).
min_train_samples = 80

group_counts = train['Group'].value_counts()
is_large_group = group_counts > min_train_samples
top_groups = group_counts[is_large_group]
top_groups

NSCLC            259
ovarianCancer     96
PDAC              93
Name: Group, dtype: int64

In [13]:
# Write a separate train/test pair for every group selected in `top_groups`.
for group_name in top_groups.index:
    # Rows of each split that belong to the current group.
    train_group = train.loc[train['Group'] == group_name]
    test_group = test.loc[test['Group'] == group_name]

    # One sub-directory per group. makedirs(exist_ok=True) replaces the
    # check-then-create pair: it is a no-op when the directory already exists
    # and also creates any missing parent directories.
    group_dataset_path = os.path.join(cleaned_data_path, group_name)
    os.makedirs(group_dataset_path, exist_ok=True)

    train_group.to_csv(os.path.join(group_dataset_path, 'train.csv'), index=False, sep=';')
    test_group.to_csv(os.path.join(group_dataset_path, 'test.csv'), index=False, sep=';')