# Examples of feature set class usage:

In [3]:
from features_set import features_set
import pandas as pd

To demonstrate the functionality of the class, we generated binary outcomes from the open-source survival data of the Lung1 NSCLC dataset.
The 'Features.xlsx' file contains radiomics features extracted from the Lung1 NSCLC dataset (https://wiki.cancerimagingarchive.net/display/Public/NSCLC-Radiomics). Clinical data for this dataset is available via the same link.

In [22]:
# Generate the binary survival outcomes for the 1-, 1.5- and 2-year boundaries.
# Load the file with the Lung1 clinical data.
clinical_df = pd.read_csv('./data/features/NSCLC Radiomics Lung1.clinical-version3-Oct 2019.csv')
# Add one outcome column per boundary: '1yearsurvival', '1.5yearsurvival', '2yearsurvival'.
for years in [1, 1.5, 2]:
    outcome_name = '%syearsurvival' % years
    boundary_days = 365 * years
    # True when the recorded survival time exceeds the boundary.
    # The column is cast to object up front so that the missing marker assigned
    # below does not rely on pandas silently upcasting a bool column
    # (deprecated since pandas 2.1).
    clinical_df[outcome_name] = (clinical_df['Survival.time'] > boundary_days).astype(object)
    # Rows with a survival time at or below the boundary and deadstatus.event == 0
    # get no outcome value; they are left empty in the saved file.
    outcome_unknown = (clinical_df['Survival.time'] <= boundary_days) & (clinical_df['deadstatus.event'] == 0)
    clinical_df.loc[outcome_unknown, outcome_name] = None
# Save the extended clinical data.
clinical_df.to_excel('./data/features/extended_clinical_df.xlsx')

### Initialization of the feature class

In [31]:
# keyword arguments handed to the features_set constructor
parameters = dict(
    feature_path='./data/features/features.xlsx',  # path to csv/xls file with features
    outcome_path='./data/features/extended_clinical_df.xlsx',  # path to csv/xls file with outcome
    patient_column='Patient',  # name of column with patient id
    patient_in_outcome_column='PatientID',  # name of column with patient id in clinical data file
    outcome_column='1yearsurvival',  # name of outcome column
)

# create the feature set from the parameters above
rad_features = features_set(**parameters)
# drop patients whose outcome is unknown (if there are any)
rad_features.handle_nan(axis=1)

In [32]:
# show the first five rows of the combined feature/outcome dataframe
rad_features._feature_outcome_dataframe.head(n=5)

Unnamed: 0_level_0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,...,wavelet-LLL_gldm_HighGrayLevelEmphasis,wavelet-LLL_gldm_LargeDependenceEmphasis,wavelet-LLL_gldm_LargeDependenceHighGrayLevelEmphasis,wavelet-LLL_gldm_LargeDependenceLowGrayLevelEmphasis,wavelet-LLL_gldm_LowGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceEmphasis,wavelet-LLL_gldm_SmallDependenceHighGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceLowGrayLevelEmphasis,ROI,1yearsurvival
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LUNG1-001_20180209_CT_2_GTV-1_mask,0.732658,0.548834,46.151744,84.090588,95.336247,83.186537,95.425364,96.10411,155379.5,61.609664,...,14462.536758,33.609142,548084.071741,0.002759,0.000139,0.267761,2959.571494,5.8e-05,GTV-1_mask,1
LUNG1-002_000000_GTV-1_mask,0.878035,0.755488,70.110114,92.801132,116.931604,101.833197,104.316825,125.674182,358446.791667,81.482668,...,13208.125493,55.600107,832176.248523,0.004497,0.000162,0.188931,1733.836805,5.3e-05,GTV-1_mask,0
LUNG1-002_20180526_CT_1_GTV-1_mask,0.878035,0.755488,70.110114,92.801132,116.931604,101.833197,104.316825,125.674182,358446.791667,81.482668,...,13208.125493,55.600107,832176.248523,0.004497,0.000162,0.188931,1733.836805,5.3e-05,GTV-1_mask,0
LUNG1-003_000000_GTV-1_mask,0.544631,0.356597,25.559022,71.674815,56.639209,83.528438,62.26556,84.011904,34987.0,39.036358,...,9142.646956,17.909008,209143.444093,0.002673,0.000367,0.40293,2838.784544,0.000191,GTV-1_mask,0
LUNG1-003_20180209_CT_1_GTV-1_mask,0.544631,0.356597,25.559022,71.674815,56.639209,83.528438,62.26556,84.011904,34987.0,39.036358,...,9142.646956,17.909008,209143.444093,0.002673,0.000367,0.40293,2838.784544,0.000191,GTV-1_mask,0


### Visualization of features distribution
The visualization functions are interactive. Each one generates a separate HTML file and opens it in your browser window. You can also save a plot as an image by pressing the <b>"download plot as a png"</b> button on the control panel in the top-right corner.

In [36]:
# visualization of feature values distribution in classes (in .html report)
# Only the first 100 feature columns are plotted. The column list is taken from
# `rad_features`, the feature set created above (the name `fs` was never defined).
rad_features.plot_binary_distribution(rad_features._feature_column[:100])

Example of plotted distributions of feature values in classes:
 
<img src="./data/images/distr.png" width="1000" />

### Visualization of features correlation

In [34]:
# visualization of feature mutual (Spearman) correlation coefficient matrix (in .html report)
# Only the first 100 feature columns are used. The column list is taken from
# `rad_features`, the feature set created above (the name `fs` was never defined).
rad_features.plot_correlation_matrix(rad_features._feature_column[:100])

Example of feature correlation matrix:
<img src="./data/images/corr.png" width="800" />

### Mann-Whitney Bonferroni corrected p-values test (binary classes)

In [35]:
# visualization of Mann-Whitney Bonferroni corrected p-values for binary classes test (in .html report)
# Only the first 100 feature columns are tested. The column list is taken from
# `rad_features`, the feature set created above (the name `fs` was never defined).
rad_features.plot_MW_p(rad_features._feature_column[:100])

Example of Mann-Whitney p-values:
<img src="./data/images/p_MW.png" width="800" />

### Univariate ROC-curves

In [37]:
# visualization of univariate ROC-curves (in .html report)
# Only the first 100 feature columns are plotted. The column list is taken from
# `rad_features`, the feature set created above (the name `fs` was never defined).
rad_features.plot_univariate_roc(rad_features._feature_column[:100])

Example of univariate feature ROC:
![](./data/images/roc.png)

### Features statistics & correlation with volume

In [39]:
# compute the basic per-feature statistics (written to .xlsx):
# number of NaN, mean, std, min, max; if applicable: MW-p, univariate ROC AUC, volume correlation
rad_features.calculate_basic_stats(volume_feature='original_shape_VoxelVolume')
# read the generated spreadsheet back and show its first rows
basic_stats_preview = pd.read_excel('./data/features/features_basic_stats.xlsx').head()
display(basic_stats_preview)

Unnamed: 0.1,Unnamed: 0,NaN,Mean,Std,Min,Max,p_MW_corrected,univar_auc,volume_corr
0,original_shape_Elongation,0,0.720328,0.161721,0.062127,0.974104,1.0,0.517996,0.037515
1,original_shape_Flatness,0,0.559677,0.154895,0.047315,0.856767,1.0,0.516611,0.099032
2,original_shape_LeastAxisLength,0,32.055804,15.98362,6.643777,85.49566,0.117975,0.686877,0.973238
3,original_shape_MajorAxisLength,0,61.595666,35.336125,13.611433,240.822486,0.018917,0.705795,0.842114
4,original_shape_Maximum2DDiameterColumn,0,63.064956,33.057474,15.620499,157.632484,0.161563,0.681525,0.950909


### Analysis of the performance of volume alone

In [42]:
# run the volume analysis, using 'original_shape_VoxelVolume' as the volume feature
rad_features.volume_analysis(
    volume_feature='original_shape_VoxelVolume',
)


<div id="volumeExamples" style="height:100%; width:100%; overflow: hidden;">
    <div id="leftThing" style="float: left; width:40%;"> Example of volume precision-recall curve:
        <img src="./data/images/vol_prc.png" width="500" />
    </div>
    <div id="rightThing" style="float: left; width:60%;"> Example of volume Spearman correlation coefficients:
        <img src="./data/images/vol_corr.png" width="500" />
    </div>
</div>