# Predicting lung cancer survival time by OWKIN

### Problem

- supervised survival prediction problem
- predict the survival time of a patient (remaining days to live) from one three-dimensional CT scan (grayscale image) and a set of pre-extracted quantitative imaging features, as well as clinical data

### Import

In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import autosklearn.regression
import time

# concordance index (C-index)
from metrics_t9gbvr2 import cindex



### Data

- x_train : data_Q0G7b5t
- y_train : output_VSVxRFU.csv
- x_test : data_9Cbe5hx

In [2]:
data_folder_path = "../data"
training_folder_path = os.path.join(data_folder_path, "data_Q0G7b5t")
test_folder_path = os.path.join(data_folder_path, "data_9Cbe5hx")


def list_npz_files(folder_path):
    """Return the sorted paths of every '.npz' file found under folder_path.

    The folder is walked recursively. os.walk yields entries in an arbitrary,
    platform-dependent order, so the result is sorted to keep positional
    accesses (e.g. ``names[0]``) reproducible from one run to the next.
    A folder that does not exist yields an empty list.
    """
    return sorted(
        os.path.join(root, file_name)
        for root, _, file_names in os.walk(folder_path)
        for file_name in file_names
        if file_name.endswith('.npz')
    )


training_ct_scan_names = list_npz_files(training_folder_path)
test_ct_scan_names = list_npz_files(test_folder_path)

print("Number of training ct scans : {}".format(len(training_ct_scan_names)))
print("Number of test ct scans : {}".format(len(test_ct_scan_names)))

# Folders holding clinical_data.csv and radiomics.csv for each split.
training_features_path = os.path.join(training_folder_path, "features")
test_features_path = os.path.join(test_folder_path, "features")

# Folder holding the example submission file used as a template later on.
submission_file_path = "../random_submission_example"

Number of training ct scans : 300
Number of test ct scans : 125


In [3]:
# np.load on a .npz file returns a lazy archive that keeps the file open;
# the context manager closes it once both arrays have been read into memory.
with np.load(training_ct_scan_names[0]) as archive:
    scan = archive['scan']
    mask = archive['mask']

# The segmentation mask is expected to cover the scan voxel for voxel.
assert scan.shape == mask.shape, "scan and mask should have the same shape"

In [4]:
# Training targets; the first CSV column becomes the index.
train_output_csv_path = os.path.join(data_folder_path, "output_VSVxRFU.csv")
df_train_output = pd.read_csv(train_output_csv_path, index_col=0)

# Look at the row labelled 202 as an example.
p0 = df_train_output.loc[202]
print("p0.Event", p0["Event"])  # prints 1 or 0
# SurvivalTime is the time to event in days
# (time to death, or time to last known alive).
print("p0.SurvivalTime", p0["SurvivalTime"])

p0.Event 0
p0.SurvivalTime 1378


In [5]:
# Five randomly drawn rows of the training targets.
df_train_output.sample(n=5)

Unnamed: 0_level_0,SurvivalTime,Event
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
356,632,1
329,582,0
399,340,1
20,476,1
348,706,1


### Interpretation

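The `Event` column tells how to read `SurvivalTime`: `1` = death observed (`SurvivalTime` is the time to death), `0` = patient left the study (`SurvivalTime` is the time to last known alive, i.e. the observation is censored).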

### Load training data

In [6]:
# Clinical table of the training set.
file_name = os.path.join(training_features_path, "clinical_data.csv")
df_training_clinical_data = pd.read_csv(file_name, delimiter=',')
print("Nb rows in df_training_clinical_data : {}".format(len(df_training_clinical_data)))

# Radiomics table of the training set: raw lines 0 and 2 are skipped, the
# remaining first line is the header, and the unnamed first column is
# renamed to PatientID.
file_name = os.path.join(training_features_path, "radiomics.csv")
df_training_radiomics = pd.read_csv(
    file_name,
    delimiter=',',
    skiprows=[0,2],
    header=[0],
).rename(columns={'Unnamed: 0': 'PatientID'})
print("Nb rows in df_training_radiomics : {}".format(len(df_training_radiomics)))

Nb rows in df_training_clinical_data : 300
Nb rows in df_training_radiomics : 300


### Load test data

In [7]:
# Clinical table of the test set.
file_name = os.path.join(test_features_path, "clinical_data.csv")
df_test_clinical_data = pd.read_csv(file_name, delimiter=',')
print("Nb rows in df_test_clinical_data : {}".format(len(df_test_clinical_data)))

# Radiomics table of the test set: raw lines 0 and 2 are skipped, the
# remaining first line is the header, and the unnamed first column is
# renamed to PatientID.
file_name = os.path.join(test_features_path, "radiomics.csv")
df_test_radiomics = pd.read_csv(
    file_name, delimiter=',', skiprows=[0,2], header=[0]
).rename(columns={'Unnamed: 0': 'PatientID'})
# Report the radiomics frame itself (the labels and the counted frame used to
# refer to the training / clinical frames by mistake).
print("Nb rows in df_test_radiomics : {}".format(len(df_test_radiomics)))

Nb rows in df_training_clinical_data : 125
Nb rows in df_training_radiomics : 125


### clinical_data.csv

In [8]:
# Five randomly drawn rows of the training clinical table.
df_training_clinical_data.sample(n=5)

Unnamed: 0,PatientID,Histology,Mstage,Nstage,SourceDataset,Tstage,age
198,400,squamous cell carcinoma,0,3,l1,2,64.1697
45,114,adenocarcinoma,0,3,l1,4,60.0383
186,300,squamous cell carcinoma,0,0,l1,3,77.5606
34,37,squamous cell carcinoma,0,0,l1,2,75.6934
98,151,Adenocarcinoma,0,2,l2,1,70.0


#### Are there NaN values in df_training_clinical_data ?

In [9]:
# Number of missing values in each column of the training clinical table.
df_training_clinical_data.isna().sum()

PatientID         0
Histology        20
Mstage            0
Nstage            0
SourceDataset     0
Tstage            0
age              16
dtype: int64

### Remark

There are NaN values in the `Histology` and `age` columns. Neither column is used as a feature in this study, so these missing values need no handling here.

### radiomics.csv

In [10]:
# Five randomly drawn rows of the training radiomics table.
df_training_radiomics.sample(n=5)

Unnamed: 0,PatientID,original_shape_Compactness1,original_shape_Compactness2,original_shape_Maximum3DDiameter,original_shape_SphericalDisproportion,original_shape_Sphericity,original_shape_SurfaceArea,original_shape_SurfaceVolumeRatio,original_shape_VoxelVolume,original_firstorder_Energy,...,original_glrlm_LongRunEmphasis,original_glrlm_GrayLevelNonUniformity,original_glrlm_RunLengthNonUniformity,original_glrlm_RunPercentage,original_glrlm_LowGrayLevelRunEmphasis,original_glrlm_HighGrayLevelRunEmphasis,original_glrlm_ShortRunLowGrayLevelEmphasis,original_glrlm_ShortRunHighGrayLevelEmphasis,original_glrlm_LongRunLowGrayLevelEmphasis,original_glrlm_LongRunHighGrayLevelEmphasis
96,78,0.030115,0.322239,39.293765,1.458615,0.685582,2303.582475,0.390333,5932.0,1601160000.0,...,1.689786,166.644658,3844.082296,0.848332,0.010899,571.705852,0.009558,528.86098,0.019415,838.419788
279,81,0.037435,0.497925,25.455844,1.261669,0.792601,1209.23814,0.433399,2810.0,220899700.0,...,1.678673,106.311399,1874.480001,0.853162,0.004426,919.55729,0.004263,795.79246,0.005333,1822.23624
160,309,0.024907,0.22041,92.827798,1.655475,0.604056,13612.316052,0.194153,70214.0,553839400.0,...,8.188323,6302.29606,14590.040956,0.501957,0.00086,1535.60171,0.000649,985.002708,0.00526,13333.694878
242,84,0.011653,0.048244,146.208755,2.746952,0.36404,67799.783468,0.185947,364882.0,51010160000.0,...,1.398895,12637.464595,267812.272378,0.89527,0.003177,1205.11898,0.003016,1095.237228,0.003941,1797.967771
33,143,0.027188,0.262628,21.931712,1.561539,0.640394,986.692571,0.660639,1517.0,364078900.0,...,1.129743,38.669687,1345.809488,0.960043,0.009541,631.675999,0.009377,608.128139,0.010233,733.317725


#### Are there NaN values in df_training_radiomics ?

In [11]:
# Total number of missing values over the whole training radiomics table.
df_training_radiomics.isna().sum().sum()

0

### Remark

There are no NaN values in df_training_radiomics.

### Make sure that PatientID are aligned in df_training_clinical_data and df_training_radiomics

In [12]:
# Row-by-row comparison of the PatientID columns of both training tables;
# the sum counts the rows on which they agree.
training_ids_match = df_training_clinical_data["PatientID"] == df_training_radiomics["PatientID"]
training_ids_match.sum()

300

$300$ means that all PatientIDs are aligned in both training dataframes

### Make sure that PatientID are aligned in df_test_clinical_data and df_test_radiomics

In [13]:
# Row-by-row comparison of the PatientID columns of both test tables;
# the sum counts the rows on which they agree.
test_ids_match = df_test_clinical_data["PatientID"] == df_test_radiomics["PatientID"]
test_ids_match.sum()

125

$125$ means that all PatientIDs are aligned in both test dataframes

### Baseline model for survival regression on NSCLC clinical data : Cox proportional hazard (Cox-PH) model

This baseline is trained on a selection of features from both clinical data file and radiomics file. A Cox-PH model was fitted on

- 1 - Tumor sphericity, a measure of the roundness of the shape of the tumor region relative to a sphere, regardless of its dimensions (size).
- 2 - The tumor's surface to volume ratio is a measure of the compactness of the tumor, related to its size.
- 3 - The tumor's maximum 3D diameter: the biggest diameter measurable from the tumor volume.
- 4 - The dataset of origin
- 5 - The N-tumoral stage grading of the tumor describing nearby (regional) lymph nodes involved
- 6 - The tumor's joint entropy, specifying the randomness in the image pixel values
- 7 - The tumor's inverse difference, a measure of the local homogeneity of the tumor
- 8 - The tumor's inverse difference moment is another measurement of the local homogeneity of the tumor

### Name of variables

- 1 - original_shape_Sphericity
- 2 - original_shape_SurfaceVolumeRatio
- 3 - original_shape_Maximum3DDiameter
- 4 - l1 (0) or l2 (1)
- 5 - Nstage
- 6 - original_firstorder_Entropy
- 7 - inverse difference (original_glcm_Id)
- 8 - inverse difference moment (original_glcm_Idm) (according to [here](https://static-content.springer.com/esm/art%3A10.1038%2Fncomms5006/MediaObjects/41467_2014_BFncomms5006_MOESM716_ESM.pdf), ctr+F IDMN and [here](https://github.com/cerr/CERR/wiki/GLCM_global_features))

### Remark

The baseline uses both quantitative and qualitative variables. Using the dataset of origin (l1 or l2) as a predictor makes no sense.

In [14]:
# First five rows of the training clinical table, before SourceDataset is encoded.
df_training_clinical_data.head(n=5)

Unnamed: 0,PatientID,Histology,Mstage,Nstage,SourceDataset,Tstage,age
0,202,Adenocarcinoma,0,0,l2,2,66.0
1,371,large cell,0,2,l1,4,64.5722
2,246,squamous cell carcinoma,0,3,l1,2,66.0452
3,240,nos,0,2,l1,3,59.3566
4,284,squamous cell carcinoma,0,3,l1,4,71.0554


### Encode SourceDataset ("dataset of origin") with value between 0 and n_datasets-1.

In [15]:
# Learn the SourceDataset labels on the training set and replace the column
# by its integer codes in one step; the fitted encoder is kept so the same
# mapping can be applied to the test set.
encoder = LabelEncoder()
df_training_clinical_data["SourceDataset"] = encoder.fit_transform(
    df_training_clinical_data["SourceDataset"]
)

In [16]:
# First five rows of the training clinical table, after SourceDataset is encoded.
df_training_clinical_data.head(n=5)

Unnamed: 0,PatientID,Histology,Mstage,Nstage,SourceDataset,Tstage,age
0,202,Adenocarcinoma,0,0,1,2,66.0
1,371,large cell,0,2,0,4,64.5722
2,246,squamous cell carcinoma,0,3,0,2,66.0452
3,240,nos,0,2,0,3,59.3566
4,284,squamous cell carcinoma,0,3,0,4,71.0554


In [17]:
# Columns taken from each training table to build the feature matrix.
training_radiomics_columns = [
    "original_shape_Sphericity",
    "original_shape_SurfaceVolumeRatio",
    "original_shape_Maximum3DDiameter",
    "original_firstorder_Entropy",
    "original_glcm_Id",
    "original_glcm_Idm",
]
training_clinical_columns = ["SourceDataset", "Nstage"]

# Side-by-side concatenation: radiomics columns first, clinical columns last.
df_X_train = pd.concat(
    [
        df_training_radiomics[training_radiomics_columns],
        df_training_clinical_data[training_clinical_columns],
    ],
    axis=1,
    sort=False,
)

In [18]:
# Cast every feature column to float64, then summarise the frame.
df_X_train = df_X_train.astype("float64")
df_X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 8 columns):
original_shape_Sphericity            300 non-null float64
original_shape_SurfaceVolumeRatio    300 non-null float64
original_shape_Maximum3DDiameter     300 non-null float64
original_firstorder_Entropy          300 non-null float64
original_glcm_Id                     300 non-null float64
original_glcm_Idm                    300 non-null float64
SourceDataset                        300 non-null float64
Nstage                               300 non-null float64
dtypes: float64(8)
memory usage: 18.9 KB


### AutoML sklearn

In [None]:
# Auto-sklearn regressor with a 5-fold cross-validation resampling strategy.
automl = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task=120, # in seconds
                                                     per_run_time_limit=30,
                                                     resampling_strategy='cv',
                                                     resampling_strategy_arguments={'folds': 5})

# Scorer built from the challenge's concordance index. It is NOT passed to
# fit() below (see the commented-out `metric=` argument), so the search
# currently runs with auto-sklearn's default regression metric.
cindex_scorer = autosklearn.metrics.make_scorer(
        name="cindex",
        score_func=cindex,
        optimum=1,
        greater_is_better=True,
        needs_proba=False,
        needs_threshold=False,
    )

# The feature rows follow the order of the clinical/radiomics files, whereas
# df_train_output is a separate file indexed by PatientID. Select the targets
# by PatientID so that each target lines up with its feature row even if the
# two files are not ordered identically.
# NOTE(review): censored patients (Event == 0) are treated like observed
# deaths by this plain regression on SurvivalTime.
y_train = df_train_output.loc[df_training_clinical_data["PatientID"].tolist(), "SurvivalTime"]
assert len(y_train) == len(df_X_train), "one target per training row is expected"
assert y_train.notnull().all(), "some training patients have no SurvivalTime"

start_time = time.time()
automl.fit(df_X_train.copy(), y_train.copy(), dataset_name='data_Q0G7b5t') #, metric=cindex_scorer
# Docs for metric in autoML skearln
# https://automl.github.io/auto-sklearn/master/examples/example_metrics.html
# (Metric must be instance of autosklearn.metrics.Scorer.)

# With the 'cv' resampling strategy the models must be refitted on the whole
# training set before predicting:
#https://automl.github.io/auto-sklearn/master/examples/example_crossvalidation.html
automl.refit(df_X_train.copy(), y_train.copy())

execution_time = time.time()-start_time
print("execution_time", execution_time)

# https://automl.github.io/auto-sklearn/master/manual.html
# Cross validation : https://scikit-learn.org/stable/modules/cross_validation.html



In [None]:
# Highest mean cross-validation score found during the search, two decimals.
best_mean_test_score = automl.cv_results_["mean_test_score"].max()
print(format(best_mean_test_score, "0.2f"))
print()
# Textual summary of the search produced by auto-sklearn.
print(automl.sprint_statistics())

In [None]:
# Number of entries recorded in the cross-validation results.
mean_test_scores = automl.cv_results_["mean_test_score"]
len(mean_test_scores)

In [None]:
# (weight, pipeline) pairs returned by auto-sklearn for the fitted model.
all_information = automl.get_models_with_weights()
# Position inside each pipeline of the step whose `.choice` is reported below.
index_regressor = 5
weights = []
print("Models used with corresponding weights :\n")
for model_weight, pipeline in all_information:
    regressor_name = type(pipeline[index_regressor].choice).__name__
    print(str(model_weight) + " : " + regressor_name)
    weights.append(model_weight)
print()
total_weight = np.round(sum(weights), 2)
print("sum(weights) = ", total_weight)

### Test

In [None]:
# Apply the mapping fitted on the training set to the test SourceDataset column.
test_source_dataset_codes = encoder.transform(df_test_clinical_data["SourceDataset"])
df_test_clinical_data["SourceDataset"] = test_source_dataset_codes

In [None]:
# Columns taken from each test table to build the feature matrix.
test_radiomics_columns = [
    "original_shape_Sphericity",
    "original_shape_SurfaceVolumeRatio",
    "original_shape_Maximum3DDiameter",
    "original_firstorder_Entropy",
    "original_glcm_Id",
    "original_glcm_Idm",
]
test_clinical_columns = ["SourceDataset", "Nstage"]

# Side-by-side concatenation: radiomics columns first, clinical columns last.
df_X_test = pd.concat(
    [
        df_test_radiomics[test_radiomics_columns],
        df_test_clinical_data[test_clinical_columns],
    ],
    axis=1,
    sort=False,
)

In [None]:
# Cast every feature column to float64, then summarise the frame.
df_X_test = df_X_test.astype("float64")
df_X_test.info()

In [None]:
# Predictions of the AutoML model for the rows of the test feature matrix.
y_hat = automl.predict(df_X_test)

In [None]:
# Load the example submission file; it serves as the template to fill in.
submission_template_path = os.path.join(submission_file_path, "random_submission_0vhlEZN.csv")
df_predicted_survival_time = pd.read_csv(submission_template_path)
df_predicted_survival_time.sample(n=5)

In [None]:
# The predictions in y_hat follow the row order of the TEST feature matrix,
# so the identifiers must come from the test clinical table (the training
# table was used here by mistake). `.values` assigns by position and raises
# if the template does not have one row per test patient.
df_predicted_survival_time["PatientID"] = df_test_clinical_data["PatientID"].values
df_predicted_survival_time["SurvivalTime"] = y_hat

In [None]:
# Five randomly drawn rows of the filled-in submission table.
df_predicted_survival_time.sample(n=5)

## $\color{red}{\text{To be continued}}$