# Examples of feature set usage:

In [1]:
from features_set import features_set
import pandas as pd

The `features.xlsx` file (`./data/features/features.xlsx`) contains radiomics features extracted from the Lung1 NSCLC dataset (https://wiki.cancerimagingarchive.net/display/Public/NSCLC-Radiomics). The clinical data for this dataset is available via the same link.

In [2]:
# loading file with Lung1 clinical data
clinical_df = pd.read_csv('./data/features/NSCLC Radiomics Lung1.clinical-version3-Oct 2019.csv')

# Add 1-, 1.5- and 2-year survival labels.
# Each label is True/False when the status at the horizon is known and None otherwise.
survival_days = clinical_df['Survival.time']
no_death_event = clinical_df['deadstatus.event'] == 0

for label, years in (('1yearsurvival', 1), ('1.5yearsurvival', 1.5), ('2yearsurvival', 2)):
    horizon_days = 365 * years

    # Cast to object up front so that None can be stored explicitly instead of
    # relying on an implicit bool -> object upcast during assignment.
    survived = (survival_days > horizon_days).astype(object)

    # The label is unknown when follow-up stopped at or before the horizon without
    # a death event, or when the survival time itself is missing (a NaN time
    # compares as False above and would otherwise be counted as "did not survive").
    unknown = (no_death_event & (survival_days <= horizon_days)) | survival_days.isna()
    survived.loc[unknown] = None

    clinical_df[label] = survived

In [3]:
# write the clinical table, including the survival columns added above, to an Excel file
clinical_df.to_excel(excel_writer='./data/features/extended_clinical_df.xlsx')

In [4]:
# keyword arguments that are passed to features_set when the feature set is created
parameters = dict(
    feature_path='./data/features/features.xlsx',  # csv/xls file with the features
    outcome_path='./data/features/extended_clinical_df.xlsx',  # csv/xls file with the outcome
    patient_column='Patient',  # patient id column in the features file
    patient_in_outcome_column='PatientID',  # patient id column in the clinical data file
    outcome_column='1yearsurvival',  # column holding the outcome
)

In [5]:
# initialize the feature set: every entry of the `parameters` dictionary is
# passed to features_set as a keyword argument
fs = features_set(**parameters)

In [6]:
# show the class labels and the patient names held by the feature set
for caption, values in (
    ('Class labels: ', fs._class_label),
    ('Patients names: ', fs._patient_name),
):
    print(caption, values)

Class labels:  [0. 1.]
Patients names:  ['LUNG1-001_20180209_CT_2_GTV-1_mask', 'LUNG1-002_000000_GTV-1_mask', 'LUNG1-002_20180526_CT_1_GTV-1_mask', 'LUNG1-003_000000_GTV-1_mask', 'LUNG1-003_20180209_CT_1_GTV-1_mask', 'LUNG1-003_20180209_CT_1_GTV-2_mask', 'LUNG1-003_20180209_CT_1_GTV-3_mask', 'LUNG1-004_000000_GTV-1_mask', 'LUNG1-006_000000_GTV-1_mask', 'LUNG1-008_000000_GTV-1_mask', 'LUNG1-009_000000_GTV-1_mask', 'LUNG1-010_000000_GTV-1_mask', 'LUNG1-013_000000_GTV-1_mask', 'LUNG1-014_000000_GTV-1_mask', 'LUNG1-016_000000_GTV-1_mask', 'LUNG1-017_000000_GTV-1_mask', 'LUNG1-020_000000_GTV-1_mask', 'LUNG1-021_000000_GTV-1_mask', 'LUNG1-023_000000_GTV-1_mask', 'LUNG1-025_000000_GTV-1_mask', 'LUNG1-026_000000_GTV-1_mask', 'LUNG1-028_000000_GTV-1_mask', 'LUNG1-029_000000_GTV-1_mask', 'LUNG1-039_000000_GTV-1_mask', 'LUNG1-041_000000_GTV-1_mask', 'LUNG1-043_000000_GTV-1_mask', 'LUNG1-053_000000_GTV-1_mask', 'LUNG1-059_000000_GTV-1_mask', 'LUNG1-060_000000_GTV-1_mask', 'LUNG1-064_000000_GTV-1_m

In [7]:
# display the feature set's dataframe; as the last expression of the cell it is
# rendered as a table (feature columns followed by the ROI and outcome columns)
fs._feature_outcome_dataframe

Unnamed: 0_level_0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,...,wavelet-LLL_gldm_HighGrayLevelEmphasis,wavelet-LLL_gldm_LargeDependenceEmphasis,wavelet-LLL_gldm_LargeDependenceHighGrayLevelEmphasis,wavelet-LLL_gldm_LargeDependenceLowGrayLevelEmphasis,wavelet-LLL_gldm_LowGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceEmphasis,wavelet-LLL_gldm_SmallDependenceHighGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceLowGrayLevelEmphasis,ROI,1yearsurvival
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LUNG1-001_20180209_CT_2_GTV-1_mask,0.732658,0.548834,46.151744,84.090588,95.336247,83.186537,95.425364,96.104110,155379.500000,61.609664,...,14462.536758,33.609142,548084.071741,0.002759,0.000139,0.267761,2959.571494,0.000058,GTV-1_mask,1
LUNG1-002_000000_GTV-1_mask,0.878035,0.755488,70.110114,92.801132,116.931604,101.833197,104.316825,125.674182,358446.791667,81.482668,...,13208.125493,55.600107,832176.248523,0.004497,0.000162,0.188931,1733.836805,0.000053,GTV-1_mask,0
LUNG1-002_20180526_CT_1_GTV-1_mask,0.878035,0.755488,70.110114,92.801132,116.931604,101.833197,104.316825,125.674182,358446.791667,81.482668,...,13208.125493,55.600107,832176.248523,0.004497,0.000162,0.188931,1733.836805,0.000053,GTV-1_mask,0
LUNG1-003_000000_GTV-1_mask,0.544631,0.356597,25.559022,71.674815,56.639209,83.528438,62.265560,84.011904,34987.000000,39.036358,...,9142.646956,17.909008,209143.444093,0.002673,0.000367,0.402930,2838.784544,0.000191,GTV-1_mask,0
LUNG1-003_20180209_CT_1_GTV-1_mask,0.544631,0.356597,25.559022,71.674815,56.639209,83.528438,62.265560,84.011904,34987.000000,39.036358,...,9142.646956,17.909008,209143.444093,0.002673,0.000367,0.402930,2838.784544,0.000191,GTV-1_mask,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LUNG1-342_000000_GTV-1_mask,0.974104,0.705922,31.213709,44.216933,57.245087,55.659680,55.659680,60.183054,38986.875000,43.071892,...,8810.961714,10.006351,104482.381761,0.001182,0.000208,0.412647,3056.038922,0.000126,GTV-1_mask,0
LUNG1-343_000000_GTV-1_mask,0.849862,0.612734,40.937354,66.810979,82.377181,79.246451,71.400280,88.600226,100987.458333,56.780113,...,7974.212295,49.051197,402459.929585,0.006028,0.000158,0.124182,838.463963,0.000049,GTV-1_mask,0
LUNG1-345_000000_GTV-1_mask,0.851225,0.478765,28.770221,60.092622,72.801099,64.815122,67.268120,74.155243,58463.250000,51.152343,...,9674.701245,36.001759,394648.688745,0.003500,0.000198,0.259454,1811.133205,0.000108,GTV-1_mask,1
LUNG1-347_000000_GTV-1_mask,0.551685,0.447612,13.158653,29.397432,21.260292,34.481879,24.041631,36.124784,3131.208333,16.218112,...,5185.837607,3.649573,25163.108895,0.001458,0.000878,0.663799,2953.583448,0.000784,GTV-1_mask,1


In [8]:
# exclude patients with an unknown outcome (if there are any)
# NOTE(review): axis=1 presumably selects patient-wise NaN handling - confirm against features_set.handle_nan
fs.handle_nan(axis=1)

In [9]:
# visualization of the distribution of feature values in the classes (in .html report)
# only the first 100 entries of fs._feature_column are passed to the plot
fs.plot_binary_distribution(fs._feature_column[:100])

Example of plotted distributions of feature values in classes:
![](./data/images/distr.png)

In [10]:
# visualization of the matrix of mutual (Spearman) correlation coefficients between features (in .html report)
# only the first 100 entries of fs._feature_column are passed to the plot
fs.plot_correlation_matrix(fs._feature_column[:100])

Example of feature correlation matrix:
![](./data/images/corr.png)

In [11]:
# visualization of Bonferroni-corrected Mann-Whitney p-values for the binary-class test (in .html report)
# only the first 100 entries of fs._feature_column are passed to the plot
fs.plot_MW_p(fs._feature_column[:100])

Example of Mann-Whitney p-values:
![](./data/images/p_MW.png)

In [12]:
# visualization of univariate ROC curves (in .html report)
# only the first 100 entries of fs._feature_column are passed to the plot
fs.plot_univariate_roc(fs._feature_column[:100])

Example of univariate feature ROC:
![](./data/images/roc.png)

In [13]:
# calculation of basic statistics for each feature (saved to .xlsx):
# number of NaN, mean, std, min, max; if applicable: MW-p, univariate ROC AUC, volume correlation
# 'original_shape_VoxelVolume' is passed as the name of the volume feature
fs.calculate_basic_stats(volume_feature='original_shape_VoxelVolume')

In [None]:
# read the statistics spreadsheet back in; as the last expression of the cell
# the resulting table is displayed
print('Basic statistics for each feature')
pd.read_excel(io='./data/features/features_basic_stats.xlsx')

In [None]:
# volume analysis, with 'original_shape_VoxelVolume' passed as the name of the volume feature
fs.volume_analysis(volume_feature='original_shape_VoxelVolume')

Example of volume precision-recall curve:
![](./data/images/vol_prc.png)

Example of volume Spearman correlation coefficients:
![](./data/images/vol_corr.png)