# Data Preparation

Prepare the extracted image features, the clinical variables and the PET parameters for modelling.

In [1]:
import sys
sys.path.append('../src/')

#!pip install joblib

import os
import feature_postprep

import numpy as np
import pandas as pd

## Prep targets

In [2]:
# Load the response spreadsheet; its first column (the patient ID) becomes
# the index of the frame.
raw_targets = pd.read_excel(
    './../../data/target/Responsdata-Radiomics-10092018.xlsx', 
    index_col=0
)
raw_targets.head()

Unnamed: 0_level_0,Unnamed: 1,Lokal,Regional,LRC,Unnamed: 5,PFS,Unnamed: 7,Unnamed: 8,Unnamed: 9,Respons,Forklaring,Unnamed: 12,Unnamed: 13,Antall,Unnamed: 15
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2,,0,0,0,,0,,,,LRC,Lokoregional kontroll,,,49.0,(Dvs. 49 pasienter har lokalt og/eller regiona...
4,,0,0,0,,0,,,,PFS,Progresjonsfri overlevelse,,,64.0,(Dvs. 64 pasienter har lokalt og/eller regiona...
5,,0,0,0,,0,,,,,,,,,
8,,0,0,0,,0,,,,,,,,,
10,,0,0,0,,0,,,,,,,,,


In [3]:
# Inspect the raw column labels (note that some carry trailing whitespace).
raw_targets.columns

Index(['Unnamed: 1', 'Lokal', 'Regional ', 'LRC', 'Unnamed: 5', 'PFS',
       'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Respons', 'Forklaring',
       'Unnamed: 12', 'Unnamed: 13', 'Antall ', 'Unnamed: 15'],
      dtype='object')

In [4]:
# Inspect the patient IDs that have a target value.
raw_targets.index

Int64Index([  2,   4,   5,   8,  10,  11,  12,  13,  14,  15,
            ...
            243, 244, 246, 247, 248, 249, 250, 252, 253, 254],
           dtype='int64', name='ID', length=198)

In [5]:
# Pull the PFS and LRC outcome columns out of the spreadsheet and cast them to
# integer label arrays. The values are used as they are; no recoding is applied.
pfs = raw_targets.loc[:, 'PFS'].values
lrc = raw_targets.loc[:, 'LRC'].values

y_pfs = np.array(pfs, dtype=int)
y_lrc = np.array(lrc, dtype=int)

In [6]:
# Wrap the PFS labels in a single-column frame indexed by patient ID.
target_pfs = pd.DataFrame({'pfs': y_pfs}, index=raw_targets.index)
target_pfs.head()

Unnamed: 0_level_0,pfs
ID,Unnamed: 1_level_1
2,0
4,0
5,0
8,0
10,0


In [7]:
# Wrap the LRC labels in a single-column frame indexed by patient ID.
target_lrc = pd.DataFrame({'lrc': y_lrc}, index=raw_targets.index)
target_lrc.head()

Unnamed: 0_level_0,lrc
ID,Unnamed: 1_level_1
2,0
4,0
5,0
8,0
10,0


In [8]:
# Persist both target tables as comma-separated files for the analysis step.
target_lrc.to_csv('./../../data/to_analysis/target_lrc.csv', sep=',')
target_pfs.to_csv('./../../data/to_analysis/target_pfs.csv', sep=',')

In [9]:
# Sanity check: both target tables must list the same patients in the same
# order, i.e. the element-wise comparison finds no mismatching position.
(target_lrc.index != target_pfs.index).sum() == 0

True

## Prep clinical data and PET params

In [10]:
# Load the PET parameter spreadsheet. Besides the numeric parameters it holds
# an empty column and a free-text explanation column (see the preview).
pet_params = pd.read_excel('./../../data/tabular/pet_params.xlsx')
pet_params.head()

Unnamed: 0,'patientID','volume','SUVmax','SUVpeak','MTV','TLG',Unnamed: 6,%EXPLANATIONS:
0,1,0.0,,,,,,%volume: Volume of primary tumor [cm3](alread...
1,2,17.142,24.7412,21.616549,7.384,124.870726,,%SUVmax: Maximum SUV in the primary tumor vol...
2,3,0.0,,,,,,%SUVpeak: Maximum mean value of SUV in a spher...
3,4,9.661,18.6557,15.296275,3.406,41.554406,,% where the center of the sphere (regi...
4,5,16.214,16.7395,14.473272,7.934,86.22842,,% must belong to the tumor volume.


In [11]:
# Work on a copy so the frame loaded from disk stays untouched.
prep_pet_params = pet_params.copy()

In [12]:
# Use the first column (the patient ID) as the row index.
patient_ids = prep_pet_params.iloc[:, 0]
prep_pet_params.index = patient_ids
prep_pet_params.index.name = 'patient'

# Positional rename: 'X' is the ID column now duplicated in the index,
# 'Y' and 'Z' are the two trailing non-parameter columns.
short_names = ['X', 'volume', 'suv_max', 'suv_peak', 'mtv', 'tlg', 'Y', 'Z']
prep_pet_params.columns = short_names
prep_pet_params.head()

Unnamed: 0_level_0,X,volume,suv_max,suv_peak,mtv,tlg,Y,Z
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,0.0,,,,,,%volume: Volume of primary tumor [cm3](alread...
2,2,17.142,24.7412,21.616549,7.384,124.870726,,%SUVmax: Maximum SUV in the primary tumor vol...
3,3,0.0,,,,,,%SUVpeak: Maximum mean value of SUV in a spher...
4,4,9.661,18.6557,15.296275,3.406,41.554406,,% where the center of the sphere (regi...
5,5,16.214,16.7395,14.473272,7.934,86.22842,,% must belong to the tumor volume.


In [13]:
# Discard the placeholder columns ('X', 'Y', 'Z') together with the
# 'suv_max' and 'volume' parameters.
prep_pet_params = prep_pet_params.drop(
    columns=['X', 'Y', 'Z', 'suv_max', 'volume']
)

In [14]:
# Remove every patient (row) that lacks at least one PET parameter.
prep_pet_params = prep_pet_params.dropna(axis=0)
prep_pet_params.head()

Unnamed: 0_level_0,suv_peak,mtv,tlg
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,21.616549,7.384,124.870726
4,15.296275,3.406,41.554406
5,14.473272,7.934,86.22842
8,10.510859,26.926,205.413389
10,7.21319,6.041,32.10377


In [15]:
# Filtering with target cases
# Select the rows of the patients that have a target value and save them.
# NOTE(review): this rebinds `pet_params`, which until here held the raw
# spreadsheet; re-running the earlier copy cell afterwards would start from
# the filtered table instead.
pet_params = prep_pet_params.loc[target_lrc.index, :]
pet_params.to_csv('./../../data/to_analysis/pet_params.csv')

In [16]:
# Load the clinical spreadsheet: first row is the header, first column (the
# patient ID) is the index.
raw_clinical = pd.read_excel(
    './../../data/tabular/clinical_params.xlsx', index_col=0, header=0
)
raw_clinical.head()

Unnamed: 0_level_0,Alder,Kjønn,ICD10 kort,T-klassifisering,N-klassifisering,Stadium,Histologi,HPV-status,ECOG,Charlson,Pakkeår,Naxogin dager,Cisplatin
Pasient-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,57.854795,M,C01,4,2,4a,1.0,2,1,0,32.141096,29.0,0
2,60.832877,M,C09,2,2,4a,0.0,2,0,0,0.0,39.0,6
3,75.663014,M,C32,2,0,2,1.0,2,0,0,55.663014,41.0,0
4,49.906849,K,C09,2,2,4a,0.0,2,0,0,4.786027,33.0,5
5,54.238356,K,C09,2,0,2,0.0,0,0,0,0.0,42.0,0


In [17]:
# Work on a copy so the frame loaded from disk stays untouched.
prep_clinical = raw_clinical.copy()

In [18]:
# Replace the Norwegian spreadsheet headers with short English names. The
# assignment is positional, so the list order must match the column order of
# the spreadsheet.
# NOTE(review): 'year_smoking' replaces 'Pakkeår', which presumably means
# pack-years rather than years of smoking -- confirm with the data owner.
prep_clinical.index.name = 'patient'
prep_clinical.columns = [
    'age', 'sex', 'icd10', 't_class', 'n_class', 'stage', 'histology', 'hpv', 
    'ecog', 'charlson', 'year_smoking', 'days_naxogin', 'cisplatin'
]
prep_clinical.head()

Unnamed: 0_level_0,age,sex,icd10,t_class,n_class,stage,histology,hpv,ecog,charlson,year_smoking,days_naxogin,cisplatin
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,57.854795,M,C01,4,2,4a,1.0,2,1,0,32.141096,29.0,0
2,60.832877,M,C09,2,2,4a,0.0,2,0,0,0.0,39.0,6
3,75.663014,M,C32,2,0,2,1.0,2,0,0,55.663014,41.0,0
4,49.906849,K,C09,2,2,4a,0.0,2,0,0,4.786027,33.0,5
5,54.238356,K,C09,2,0,2,0.0,0,0,0,0.0,42.0,0


In [19]:
# Categorical variables to one-hot encode.
# NOTE: 'stage' is left out here because its labels need cleaning first.
to_encode = [
    'sex', 'icd10', 't_class', 'n_class', 'histology', 'hpv',
    'ecog', 'charlson', 'cisplatin'
]

for var_to_encode in to_encode:
    # The first level is dropped and acts as the reference category; the
    # remaining indicator columns are named '<variable>_<level>'.
    dummy_var = pd.get_dummies(
        prep_clinical[var_to_encode], drop_first=True
    ).add_prefix('{}_'.format(var_to_encode))
    prep_clinical[dummy_var.columns] = dummy_var

In [20]:
# Inspect the column set after the indicator columns were appended.
prep_clinical.columns

Index(['age', 'sex', 'icd10', 't_class', 'n_class', 'stage', 'histology',
       'hpv', 'ecog', 'charlson', 'year_smoking', 'days_naxogin', 'cisplatin',
       'sex_M', 'icd10_C02', 'icd10_C03', 'icd10_C04', 'icd10_C05',
       'icd10_C06', 'icd10_C09', 'icd10_C10', 'icd10_C12', 'icd10_C13',
       'icd10_C32', 't_class_2', 't_class_3', 't_class_4', 'n_class_1',
       'n_class_2', 'n_class_3', 'histology_0.5', 'histology_1.0',
       'histology_2.0', 'histology_3.0', 'hpv_1', 'hpv_2', 'ecog_1', 'ecog_2',
       'ecog_3', 'charlson_1', 'charlson_2', 'charlson_3', 'charlson_4',
       'charlson_5', 'charlson_6', 'cisplatin_1', 'cisplatin_2', 'cisplatin_3',
       'cisplatin_4', 'cisplatin_5', 'cisplatin_6', 'cisplatin_7'],
      dtype='object')

In [21]:
# Preview: the original categorical columns are still present at this point.
prep_clinical.head()

Unnamed: 0_level_0,age,sex,icd10,t_class,n_class,stage,histology,hpv,ecog,charlson,...,charlson_4,charlson_5,charlson_6,cisplatin_1,cisplatin_2,cisplatin_3,cisplatin_4,cisplatin_5,cisplatin_6,cisplatin_7
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,57.854795,M,C01,4,2,4a,1.0,2,1,0,...,0,0,0,0,0,0,0,0,0,0
2,60.832877,M,C09,2,2,4a,0.0,2,0,0,...,0,0,0,0,0,0,0,0,1,0
3,75.663014,M,C32,2,0,2,1.0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,49.906849,K,C09,2,2,4a,0.0,2,0,0,...,0,0,0,0,0,0,0,1,0,0
5,54.238356,K,C09,2,0,2,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Drop the original categorical columns now that their indicator columns exist.
prep_clinical = prep_clinical.drop(columns=to_encode)
prep_clinical.head()

Unnamed: 0_level_0,age,stage,year_smoking,days_naxogin,sex_M,icd10_C02,icd10_C03,icd10_C04,icd10_C05,icd10_C06,...,charlson_4,charlson_5,charlson_6,cisplatin_1,cisplatin_2,cisplatin_3,cisplatin_4,cisplatin_5,cisplatin_6,cisplatin_7
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,57.854795,4a,32.141096,29.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,60.832877,4a,0.0,39.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,75.663014,2,55.663014,41.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,49.906849,4a,4.786027,33.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,54.238356,2,0.0,42.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Collapse the stage-4 sub-labels ('4a', '4b', '4c' and the uncertain '4a?')
# into a single stage 4. The result is assigned back to the column instead of
# calling an in-place method on `prep_clinical.stage`: that chained in-place
# form operates on an intermediate Series, triggers a warning in recent pandas
# and does not update the frame once copy-on-write is enabled.
prep_clinical['stage'] = prep_clinical['stage'].replace(
    ['4a', '4b', '4c', '4a?'], 4
)

# One-hot encode the cleaned stage; the first level is dropped and acts as the
# reference category.
stadium_enc = pd.get_dummies(prep_clinical['stage'], drop_first=True)
for col in stadium_enc:
    prep_clinical['stage_{}'.format(col)] = stadium_enc[col]

prep_clinical.head()

Unnamed: 0_level_0,age,stage,year_smoking,days_naxogin,sex_M,icd10_C02,icd10_C03,icd10_C04,icd10_C05,icd10_C06,...,cisplatin_2,cisplatin_3,cisplatin_4,cisplatin_5,cisplatin_6,cisplatin_7,stage_1,stage_2,stage_3,stage_4
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,57.854795,4,32.141096,29.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,60.832877,4,0.0,39.0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,75.663014,2,55.663014,41.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,49.906849,4,4.786027,33.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
5,54.238356,2,0.0,42.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [24]:
# The raw stage column is redundant once its indicator columns exist.
prep_clinical = prep_clinical.drop(columns='stage')
prep_clinical.head()

Unnamed: 0_level_0,age,year_smoking,days_naxogin,sex_M,icd10_C02,icd10_C03,icd10_C04,icd10_C05,icd10_C06,icd10_C09,...,cisplatin_2,cisplatin_3,cisplatin_4,cisplatin_5,cisplatin_6,cisplatin_7,stage_1,stage_2,stage_3,stage_4
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,57.854795,32.141096,29.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,60.832877,0.0,39.0,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
3,75.663014,55.663014,41.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,49.906849,4.786027,33.0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
5,54.238356,0.0,42.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [25]:
# Filtering with target indices
# Select the clinical rows of the patients that have a target value.
prep_clinical_reidx = prep_clinical.loc[target_lrc.index, :]

In [26]:
# Save the encoded clinical variables (without PET parameters).
prep_clinical_reidx.to_csv('./../../data/to_analysis/clinical_vars.csv')

In [27]:
# Concatenate with PET params
# Column-wise concatenation; rows are aligned on the patient index. The
# combined table is saved and reused when the image features are exported.
clinical_params = pd.concat((prep_clinical_reidx, pet_params), axis=1)
clinical_params.to_csv('./../../data/to_analysis/all_clinical_vars.csv')

## Prep image data

In [28]:
dir_pet_features = './../../data/outputs/pet_features/'
dir_ct_features = './../../data/outputs/ct_features/'

# Collect the feature files of each modality in sorted order so that feature
# set numbers are stable between runs. `sorted` is used instead of a trailing
# `a.sort(), b.sort()` expression, which only displayed a `(None, None)` tuple.
path_pet_features = sorted(
    os.path.join(dir_pet_features, fname)
    for fname in os.listdir(dir_pet_features) if 'pet' in fname
)
path_ct_features = sorted(
    os.path.join(dir_ct_features, fname)
    for fname in os.listdir(dir_ct_features) if 'ct' in fname
)

(None, None)

In [29]:
# List the collected CT feature files under an underlined header.
print('CT feature sets:\n{}'.format('-' * 50))
for path_ct in path_ct_features:
    print(path_ct)

CT feature sets:
--------------------------------------------------
./../../data/outputs/ct_features/raw_ct_features1.csv
./../../data/outputs/ct_features/raw_ct_features2.csv
./../../data/outputs/ct_features/raw_ct_features3.csv
./../../data/outputs/ct_features/raw_ct_features4.csv
./../../data/outputs/ct_features/raw_ct_features5.csv


In [30]:
# List the collected PET feature files under an underlined header.
print('PET feature sets:\n{}'.format('-' * 50))
for path_pet in path_pet_features:
    print(path_pet)

PET feature sets:
--------------------------------------------------
./../../data/outputs/pet_features/raw_pet_features1.csv
./../../data/outputs/pet_features/raw_pet_features2.csv
./../../data/outputs/pet_features/raw_pet_features3.csv
./../../data/outputs/pet_features/raw_pet_features4.csv
./../../data/outputs/pet_features/raw_pet_features5.csv


In [31]:
# Read every feature file into a frame (first CSV column as index) and key the
# frames by their position in the sorted path lists.
raw_pet_feats = dict(enumerate(
    pd.read_csv(path_pet, index_col=0) for path_pet in path_pet_features
))
raw_ct_feats = dict(enumerate(
    pd.read_csv(path_ct, index_col=0) for path_ct in path_ct_features
))

In [32]:
# Report the shape of the first feature set of each modality. The arguments
# follow the order of the labels in the template: CT first, then PET.
print('Shapes feature sets:\nCT: {}\nPET: {}'.format(
    raw_ct_feats[0].shape, raw_pet_feats[0].shape
))

Shapes feature sets:
CT: (198, 1849)
PET: (198, 1849)


* Number of missing PET values:

In [33]:
# For each PET feature set, list every column that has missing values together
# with the number of missing entries.
for feat_key, feat_set in raw_pet_feats.items():
    print('Feature set: {}\n{}'.format(feat_key, '-' * 20))
    missing_counts = feat_set.isnull().sum()
    incomplete = missing_counts[missing_counts > 0]
    for feature, num_missing in incomplete.items():
        print('* Feature: {}\n* Num missing: {}'.format(feature, num_missing))
    print()

Feature set: 0
--------------------
* Feature: Reader
* Num missing: 198
* Feature: gradient_ngtdm_Contrast
* Num missing: 3
* Feature: label
* Num missing: 198

Feature set: 1
--------------------
* Feature: Reader
* Num missing: 198
* Feature: label
* Num missing: 198

Feature set: 2
--------------------
* Feature: Reader
* Num missing: 198
* Feature: label
* Num missing: 198

Feature set: 3
--------------------
* Feature: Reader
* Num missing: 198
* Feature: label
* Num missing: 198

Feature set: 4
--------------------
* Feature: Reader
* Num missing: 198
* Feature: label
* Num missing: 198



NOTE: 
* Drop only `gradient_ngtdm_Contrast` from first feature set.

In [34]:
# Build the PET post-processor from the feature file paths, then drop the one
# texture column with missing values from feature set 0 only (`keys=[0]`).
pet_processor = feature_postprep.PostProcessor(path_pet_features, verbose=1)
pet_processor.filter_columns(
    keys=[0], columns=['gradient_ngtdm_Contrast']
)

Dropped columns: 1


<feature_postprep.PostProcessor at 0x111c034e0>

* Number of missing CT values:

In [35]:
# For each CT feature set, list every column that has missing values together
# with the number of missing entries.
for feat_num, feat_set in enumerate(raw_ct_feats.values()):
    print('Feature set: {}\n{}'.format(feat_num, '-' * 20))
    missing_counts = feat_set.isnull().sum()
    incomplete = missing_counts[missing_counts > 0]
    for feature, num_missing in incomplete.items():
        print('* Feature: {}\n* Num missing: {}'.format(feature, num_missing))
    print()

Feature set: 0
--------------------
* Feature: Reader
* Num missing: 198
* Feature: exponential_ngtdm_Contrast
* Num missing: 129
* Feature: lbp-3D-m1_ngtdm_Contrast
* Num missing: 198
* Feature: lbp-3D-m2_ngtdm_Contrast
* Num missing: 198
* Feature: label
* Num missing: 198

Feature set: 1
--------------------
* Feature: Reader
* Num missing: 198
* Feature: exponential_ngtdm_Contrast
* Num missing: 88
* Feature: lbp-3D-m1_ngtdm_Contrast
* Num missing: 198
* Feature: lbp-3D-m2_ngtdm_Contrast
* Num missing: 198
* Feature: label
* Num missing: 198

Feature set: 2
--------------------
* Feature: Reader
* Num missing: 198
* Feature: exponential_ngtdm_Contrast
* Num missing: 61
* Feature: lbp-3D-m2_ngtdm_Contrast
* Num missing: 32
* Feature: label
* Num missing: 198

Feature set: 3
--------------------
* Feature: Reader
* Num missing: 198
* Feature: exponential_ngtdm_Contrast
* Num missing: 42
* Feature: label
* Num missing: 198

Feature set: 4
--------------------
* Feature: Reader
* Num m

NOTE:
* Drop `exponential_ngtdm_Contrast` from the first four feature sets.
* Drop `lbp-3D-m1_ngtdm_Contrast` from the first and second feature sets, and `lbp-3D-m2_ngtdm_Contrast` from the first three feature sets.

In [36]:
# Build the CT post-processor from the feature file paths and drop the texture
# columns that were reported with missing values.
ct_processor = feature_postprep.PostProcessor(path_ct_features, verbose=1)
# No `keys` given: judging by the logged output this call is applied to every
# feature set -- TODO confirm against PostProcessor.filter_columns.
ct_processor.filter_columns(columns=['exponential_ngtdm_Contrast'])
# Restricted to feature sets 0 and 1.
ct_processor.filter_columns(
    keys=[0, 1], columns=["lbp-3D-m1_ngtdm_Contrast"]
)
# Restricted to feature sets 0, 1 and 2 (set 2 also reported missing values).
ct_processor.filter_columns(
    keys=[0, 1, 2], columns=["lbp-3D-m2_ngtdm_Contrast"]
)

Dropped columns: 1
Dropped columns: 1
Dropped columns: 1
Dropped columns: 1
Dropped columns: 1
Dropped columns: 1
Dropped columns: 1
Dropped columns: 1
Dropped columns: 1
Dropped columns: 1


<feature_postprep.PostProcessor at 0x111c033c8>

Processing:
* Remove columns with too many missing values; impute the remaining missing values with zero.
* Drop columns with constant values.

In [37]:
# Filter default columns
# Called without arguments; per the logged output this removes the processor's
# default columns from each CT feature set.
# NOTE(review): the PET processor goes through `check_features()` instead --
# confirm whether the CT features also need the zero imputation step.
ct_processor.filter_columns()

Dropped 22 default columns
Dropped 22 default columns
Dropped 22 default columns
Dropped 22 default columns
Dropped 22 default columns


<feature_postprep.PostProcessor at 0x111c033c8>

In [38]:
# Filter default columns and replace missing values with zero.
# NOTE(review): only the default-column removal is visible in the logged
# output; the imputation is assumed from the comment above -- confirm against
# PostProcessor.check_features.
pet_processor.check_features()

Dropped 22 default columns
Dropped 22 default columns
Dropped 22 default columns
Dropped 22 default columns
Dropped 22 default columns


In [39]:
# Shapes of the first CT and PET feature sets after column filtering.
ct_processor.data[0].shape, pet_processor.data[0].shape

((198, 1828), (198, 1830))

In [40]:
# Remove constant-valued columns from every PET and CT feature set; the dashed
# line separates the PET log lines from the CT log lines.
pet_processor.filter_constant_features()
print('-' * 30)
ct_processor.filter_constant_features()

Dropped constant columns: 0
Dropped constant columns: 0
Dropped constant columns: 0
Dropped constant columns: 0
Dropped constant columns: 0
------------------------------
Dropped constant columns: 82
Dropped constant columns: 82
Dropped constant columns: 2
Dropped constant columns: 2
Dropped constant columns: 2


<feature_postprep.PostProcessor at 0x111c033c8>

In [41]:
# Number of constant columns dropped from each of the five PET feature sets
keys = ['{}_constant'.format(set_num) for set_num in range(5)]
for key in keys:
    print(key, ':', len(pet_processor.dropped_cols[key]))

0_constant : 0
1_constant : 0
2_constant : 0
3_constant : 0
4_constant : 0


In [42]:
# Number of constant columns dropped from each of the five CT feature sets
keys = ['{}_constant'.format(set_num) for set_num in range(5)]
for key in keys:
    print(key, ':', len(ct_processor.dropped_cols[key]))

0_constant : 82
1_constant : 82
2_constant : 2
3_constant : 2
4_constant : 2


In [43]:
# Check the 'Patient' identifier column of both processors against the target
# index.
# NOTE(review): presumably this also sets the patient ID as the row index so
# the frames align with the clinical table later -- confirm against
# PostProcessor.check_identifiers.
pet_processor.check_identifiers(id_col='Patient', target_id=target_lrc.index)
ct_processor.check_identifiers(id_col='Patient', target_id=target_lrc.index)

# Tag every feature name with its modality so PET and CT columns stay distinct
# after concatenation.
pet_processor.rename_columns(add_extend='pet')
ct_processor.rename_columns(add_extend='ct')

<feature_postprep.PostProcessor at 0x111c033c8>

## To Analysis

In [44]:
ref_path = './../../data/to_analysis/'

# Every sub-directory of `ref_path` names one filter type and receives the
# feature matrices for that filter. Hidden entries and CSV files are skipped as
# before; the extra directory check keeps any other stray file from being
# treated as an output folder, which would make the write below fail.
target_dirs = [
    dir_label for dir_label in os.listdir(ref_path)
    if not dir_label.startswith('.')
    and not dir_label.endswith('.csv')
    and os.path.isdir(os.path.join(ref_path, dir_label))
]

for ct_key, ct_data in ct_processor.data.items():

    # NOTE: Same shape features for each filter set as
    # shape features are independent of filters. Shape features are
    # calculated from the mask which is identical to PET and CT.
    # Hence, PET and CT shape features are identical and only the CT
    # copy is written out. It depends on the CT set alone, so it is
    # computed once per CT set.
    ct_shape_features = ct_data.filter(regex='shape')

    for pet_key, pet_data in pet_processor.data.items():

        # Not exported; kept so the PET shape features can be inspected
        # after the loop.
        pet_shape_features = pet_data.filter(regex='shape')

        # One file per (CT set, PET set) combination in every filter folder.
        fname = 'ct{}_pet{}_clinical.csv'.format(ct_key, pet_key)

        # NOTE: Same dir name as filter label regex.
        # NOTE(review): the directory name is used as an unanchored regex, so
        # a name that is a prefix of another filter label would match both --
        # confirm the folder names are unambiguous.
        for target_dir in target_dirs:

            ct_filtered = ct_data.filter(regex=target_dir)
            pet_filtered = pet_data.filter(regex=target_dir)

            parts = [ct_filtered, pet_filtered, clinical_params]
            if target_dir != 'original':
                # Shape features are prepended for every filter except
                # 'original' (presumably because the 'original' columns
                # already include them -- TODO confirm).
                parts.insert(0, ct_shape_features)
            data_combined = pd.concat(parts, axis=1)

            data_combined.to_csv(os.path.join(ref_path, target_dir, fname))

In [45]:
# Count the 'gldm' columns in `ct_filtered`, which is whatever the export loop
# left behind on its final iteration.
ct_filtered.filter(regex='gldm').columns.size

14

In [46]:
# Number of shape features in the last CT and PET sets handled by the export
# loop; the two counts are expected to agree.
ct_shape_features.columns.size, pet_shape_features.columns.size

(10, 10)

In [47]:
"./../../data/to_analysis/target_pfs.csv"

'./../../data/to_analysis/target_pfs.csv'