# Predicting lung cancer survival time by OWKIN

### Problem

- supervised survival prediction problem
- predict the survival time of a patient (remaining days to live) from one three-dimensional CT scan (grayscale image) and a set of pre-extracted quantitative imaging features, as well as clinical data

### Import

In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import autosklearn.regression
import time

# concordance index (C-index)
from metrics_t9gbvr2 import cindex



### Data

- x_train : data_Q0G7b5t
- y_train : output_VSVxRFU.csv
- x_test : data_9Cbe5hx

In [2]:
data_folder_path = "../data"
training_folder_path = os.path.join(data_folder_path, "data_Q0G7b5t")
test_folder_path = os.path.join(data_folder_path, "data_9Cbe5hx")

training_ct_scan_names = [os.path.join(root,file_name) for root,_,file_names in os.walk(training_folder_path) for file_name in file_names if file_name.endswith('.npz')]
test_ct_scan_names = [os.path.join(root,file_name) for root,_,file_names in os.walk(test_folder_path) for file_name in file_names if file_name.endswith('.npz')]

print("Number of training ct scans : {}".format(len(training_ct_scan_names)))
print("Number of test ct scans : {}".format(len(test_ct_scan_names)))

training_features_path = os.path.join(training_folder_path, "features")
test_features_path = os.path.join(test_folder_path, "features")

submission_file_path = "../random_submission_example"

Number of training ct scans : 300
Number of test ct scans : 125


In [3]:
archive = np.load(training_ct_scan_names[0])
scan = archive['scan']
mask = archive['mask']
# scan.shape equals mask.shape

In [4]:
df_train_output = pd.read_csv(os.path.join(data_folder_path, "output_VSVxRFU.csv"), index_col=0)
p0 = df_train_output.loc[202]
print("p0.Event", p0.Event) # prints 1 or 0
print("p0.SurvivalTime", p0.SurvivalTime)
# prints time to event (time to death or time to last known alive) in days

p0.Event 0
p0.SurvivalTime 1378


In [5]:
df_train_output.sample(5)

Unnamed: 0_level_0,SurvivalTime,Event
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
144,886,1
53,1823,0
395,98,1
298,946,0
88,258,0


### Interpretation

(`1=death observed`, `0=escaped from study`)

### Load training data

In [6]:
file_name = os.path.join(training_features_path, "clinical_data.csv")
df_training_clinical_data = pd.read_csv(file_name, delimiter=',', index_col=0)
print("Nb rows in df_training_clinical_data : {}".format(len(df_training_clinical_data)))

file_name = os.path.join(training_features_path, "radiomics.csv")
df_training_radiomics = pd.read_csv(file_name, delimiter=',', index_col=0, skiprows=[0,2], header=[0])
df_training_radiomics.index.names = ["PatientID"]
print("Nb rows in df_training_radiomics : {}".format(len(df_training_radiomics)))

Nb rows in df_training_clinical_data : 300
Nb rows in df_training_radiomics : 300


### Load test data

In [7]:
file_name = os.path.join(test_features_path, "clinical_data.csv")
df_test_clinical_data = pd.read_csv(file_name, delimiter=',', index_col=0)
print("Nb rows in df_training_clinical_data : {}".format(len(df_test_clinical_data)))

file_name = os.path.join(test_features_path, "radiomics.csv")
df_test_radiomics = pd.read_csv(file_name, delimiter=',', index_col=0 , skiprows=[0,2], header=[0])
df_test_radiomics.index.names = ["PatientID"]
print("Nb rows in df_training_radiomics : {}".format(len(df_test_clinical_data)))

Nb rows in df_training_clinical_data : 125
Nb rows in df_training_radiomics : 125


### clinical_data.csv

In [8]:
df_training_clinical_data.sample(5)

Unnamed: 0_level_0,Histology,Mstage,Nstage,SourceDataset,Tstage,age
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
44,large cell,0,2,l1,2,65.4839
161,adenocarcinoma,0,2,l1,2,72.8761
395,squamous cell carcinoma,0,2,l1,2,68.961
92,nos,0,0,l1,1,
98,Squamous cell carcinoma,0,0,l2,2,76.0


#### Are there NaN values in df_training_clinical_data ?

In [9]:
#df_training_clinical_data.info()
df_training_clinical_data.isnull().sum()

Histology        20
Mstage            0
Nstage            0
SourceDataset     0
Tstage            0
age              16
dtype: int64

### Remark

There are NaN values in columns Histology and age. We will not use these in our study so no problem.

### radiomics.csv

In [10]:
df_training_radiomics.sample(5)

Unnamed: 0_level_0,original_shape_Compactness1,original_shape_Compactness2,original_shape_Maximum3DDiameter,original_shape_SphericalDisproportion,original_shape_Sphericity,original_shape_SurfaceArea,original_shape_SurfaceVolumeRatio,original_shape_VoxelVolume,original_firstorder_Energy,original_firstorder_Entropy,...,original_glrlm_LongRunEmphasis,original_glrlm_GrayLevelNonUniformity,original_glrlm_RunLengthNonUniformity,original_glrlm_RunPercentage,original_glrlm_LowGrayLevelRunEmphasis,original_glrlm_HighGrayLevelRunEmphasis,original_glrlm_ShortRunLowGrayLevelEmphasis,original_glrlm_ShortRunHighGrayLevelEmphasis,original_glrlm_LongRunLowGrayLevelEmphasis,original_glrlm_LongRunHighGrayLevelEmphasis
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
388,0.019524,0.135442,103.662915,1.947223,0.513552,24266.180921,0.185502,130944.0,914510800.0,2.288307,...,10.806165,12987.367683,22951.086138,0.457447,0.000902,1355.148132,0.000647,824.253098,0.007511,15975.433093
328,0.020752,0.153009,216.716405,1.869652,0.534859,26888.44202,0.1658,162335.0,1341964000.0,2.86776,...,5.846542,14899.517216,40995.2263,0.561617,0.000859,1419.364224,0.000646,958.993661,0.004144,8619.884295
333,0.013253,0.062408,75.239617,2.521084,0.396655,14028.401913,0.359421,39159.0,9224645000.0,5.118503,...,1.375428,1325.03134,29446.188457,0.901864,0.018416,1028.98616,0.016965,931.208848,0.025921,1576.572563
285,0.021479,0.163915,66.075714,1.827234,0.547275,10004.410334,0.262616,38180.0,5887096000.0,5.412531,...,1.248744,913.298357,30907.064249,0.92895,0.002256,1121.729938,0.002157,1052.402657,0.002703,1458.701611
409,0.036326,0.468863,23.937418,1.287215,0.776871,1216.541399,0.445287,2753.0,1283925000.0,4.686618,...,1.304326,113.755377,2136.355685,0.915225,0.011836,259.345076,0.010974,245.062735,0.016049,325.511805


#### Are there NaN values in df_training_radiomics ?

In [11]:
#df_training_radiomics.info()
df_training_radiomics.isnull().sum().sum()

0

### Remark

There are no NaN values in df_training_radiomics.

### Make sure that PatientID are aligned in df_training_clinical_data and df_training_radiomics

In [12]:
(df_training_clinical_data.index.values==df_training_radiomics.index.values).sum()

300

$300$ means that all PatientIDs are aligned in both training dataframes

### Make sure that PatientID are aligned in df_test_clinical_data and df_test_radiomics

In [13]:
(df_test_clinical_data.index.values==df_test_radiomics.index.values).sum()

125

$125$ means that all PatientIDs are aligned in both test dataframes

### Baseline model for survival regression on NSCLC clinical data : Cox proportional hazard (Cox-PH) model

This baseline is trained on a selection of features from both clinical data file and radiomics file. A Cox-PH model was fitted on

- 1 - Tumor sphericity, a measure of the roundness of the shape of the tumor region relative to a sphere, regardless its dimensions (size).
- 2 - The tumor's surface to volume ratio is a measure of the compactness of the tumor, related to its size.
- 3 - The tumor's maximum 3d diameter The biggest diameter measurable from the tumor volume
- 4 - The dataset of origin
- 5 - The N-tumoral stage grading of the tumor describing nearby (regional) lymph nodes involved
- 6 - The tumor's joint entropy, specifying the randomness in the image pixel values
- 7 - The tumor's inverse different, a measure of the local homogeneity of the tumor
- 8 - The tumor's inverse difference moment is another measurement of the local homogeneity of the tumor

### Name of variables

- 1 - original_shape_Sphericity
- 2 - original_shape_SurfaceVolumeRatio
- 3 - original_shape_Maximum3DDiameter
- 4 - l1 (0) or l2 (1)
- 5 - Nstage
- 6 - original_firstorder_Entropy
- 7 - inverse difference (original_glcm_Id)
- 8 - inverse difference moment (original_glcm_Idm) (according to [here](https://static-content.springer.com/esm/art%3A10.1038%2Fncomms5006/MediaObjects/41467_2014_BFncomms5006_MOESM716_ESM.pdf), ctr+F IDMN and [here](https://github.com/cerr/CERR/wiki/GLCM_global_features))

### Remark

Variables used in the baseline use quantitve and qualitive variables. (dataset of origin (l1 or l2)) makes no sens 

In [14]:
df_training_clinical_data.head(5)

Unnamed: 0_level_0,Histology,Mstage,Nstage,SourceDataset,Tstage,age
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
202,Adenocarcinoma,0,0,l2,2,66.0
371,large cell,0,2,l1,4,64.5722
246,squamous cell carcinoma,0,3,l1,2,66.0452
240,nos,0,2,l1,3,59.3566
284,squamous cell carcinoma,0,3,l1,4,71.0554


### Encode SourceDataset ("dataset of origin") with value between 0 and n_datasets-1.

In [15]:
encoder = LabelEncoder()
encoder.fit(df_training_clinical_data["SourceDataset"])
df_training_clinical_data["SourceDataset"] = encoder.transform(df_training_clinical_data["SourceDataset"])

In [16]:
df_training_clinical_data.head(5)

Unnamed: 0_level_0,Histology,Mstage,Nstage,SourceDataset,Tstage,age
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
202,Adenocarcinoma,0,0,1,2,66.0
371,large cell,0,2,0,4,64.5722
246,squamous cell carcinoma,0,3,0,2,66.0452
240,nos,0,2,0,3,59.3566
284,squamous cell carcinoma,0,3,0,4,71.0554


In [17]:
df_X_train = pd.concat([df_training_radiomics[["original_shape_Sphericity", 
                       "original_shape_SurfaceVolumeRatio", 
                       "original_shape_Maximum3DDiameter",
                       "original_firstorder_Entropy",
                       "original_glcm_Id",
                       "original_glcm_Idm"]],
                        df_training_clinical_data[["SourceDataset",
                                                  "Nstage"]]
                       ], sort=False, axis=1)

In [18]:
df_X_train = df_X_train.astype(float)
df_X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300 entries, 202 to 366
Data columns (total 8 columns):
original_shape_Sphericity            300 non-null float64
original_shape_SurfaceVolumeRatio    300 non-null float64
original_shape_Maximum3DDiameter     300 non-null float64
original_firstorder_Entropy          300 non-null float64
original_glcm_Id                     300 non-null float64
original_glcm_Idm                    300 non-null float64
SourceDataset                        300 non-null float64
Nstage                               300 non-null float64
dtypes: float64(8)
memory usage: 21.1 KB


### AutoML sklearn

In [19]:
data = {"PatientID" :  df_train_output.index.values,
        "SurvivalTime" : df_train_output["SurvivalTime"].values,
        "Event" :  df_train_output["Event"].values
       }
temp = pd.DataFrame(data)
temp = temp.set_index(["PatientID", "Event"])
temp["SurvivalTime"]

PatientID  Event
202        0        1378
371        1         379
246        1         573
240        0         959
284        0        2119
                    ... 
261        0        1540
298        0         946
129        0         559
273        0        1952
366        0         858
Name: SurvivalTime, Length: 300, dtype: int64

In [20]:
automl = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task=60, # in seconds
                                                     per_run_time_limit=30,
                                                     resampling_strategy='cv',
                                                     resampling_strategy_arguments={'folds': 5})

def my_cindex(solution, prediction, patient_ids, events):
    prediction = prediction.ravel()
       
    print("solution.shape", solution.shape)
    print("prediction.shape", prediction.shape)
    print("patient_ids.shape", patient_ids.shape)
    print("events.shape", events.shape)
    
    #print(autosklearn.metrics.r2(solution, prediction))
    #return autosklearn.metrics.r2(solution, prediction)

    data_solution = {"PatientID" :  patient_ids,
                     "SurvivalTime" : solution,
                     "Event" : events
                    }
    df_solution = pd.DataFrame(data_solution)
    df_solution = df_solution.set_index("PatientID")
    
    data_prediction = {"PatientID" :  patient_ids,
                       "SurvivalTime" : prediction,
                       "Event" : events*np.nan
                      }
    df_prediction = pd.DataFrame(data_prediction)
    df_prediction = df_prediction.set_index("PatientID")
    
    print(cindex(df_solution, df_prediction))
    return cindex(df_solution, df_prediction)

cindex_scorer = autosklearn.metrics.make_scorer(
        name="cindex",
        score_func=my_cindex,
        optimum=1,
        greater_is_better=True,
        needs_proba=False,
        needs_threshold=False,
        patient_ids = df_train_output.index.values,
        events = df_train_output["Event"].values      
    )

start_time = time.time()
automl.fit(df_X_train.copy(), df_train_output["SurvivalTime"].copy(), dataset_name='data_Q0G7b5t', metric=cindex_scorer) #, metric=cindex_scorer
# Docs for metric in autoML skearln
# https://automl.github.io/auto-sklearn/master/examples/example_metrics.html
# (Metric must be instance of autosklearn.metrics.Scorer.)

#https://automl.github.io/auto-sklearn/master/examples/example_crossvalidation.html
automl.refit(df_X_train.copy(), df_train_output["SurvivalTime"].copy())

execution_time = time.time()-start_time
print("execution_time", execution_time)

# https://automl.github.io/auto-sklearn/master/manual.html
# Cross validation : https://scikit-learn.org/stable/modules/cross_validation.html

solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.49417370760401463
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.49417370760401463
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6352460657507567
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.9028410528977149
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6352460657507567
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6352460657507567
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6534256997554745
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.7446216147808324
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6534256997554745
solution

events.shape (300,)
0.6534256997554745
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6525847915746396
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6534256997554745
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6525447483279331
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6534256997554745
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.652624834821346
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6534256997554745
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6525447483279331
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6534256997554745
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shap

prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6535458294955938
solution.shape (300,)
patient_ids.shape (300,)
prediction.shape (300,)
events.shape (300,)
0.6537060024824195
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6525447483279331
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6530252672884103
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6537860889758323
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6524246185878139
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6528650943015846
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6535057862488873
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6525047050812267
solution.shape (300,)
prediction

0.6533456132620616
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6537060024824195
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6522644456009882
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6533055700153553
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.653746045729126
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6522244023542817
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6532655267686488
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6538261322225388
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6521443158608689
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.653185440

patient_ids.shape (300,)
events.shape (300,)
0.6534256997554745
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6527850078081717
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6529852240417038
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6532655267686488
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6534256997554745
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6525847915746396
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6533055700153553
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6530653105351166
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6534256997554745
solution.shape (300,)
prediction.shape (300,)
patient_id

solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6521843591075753
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6533055700153553
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.653185440275236
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6536659592357131
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6522644456009882
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6533055700153553
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6533456132620616
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6535057862488873
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.652344532094401
solution.sha

events.shape (300,)
0.6529852240417038
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6537060024824195
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.652344532094401
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6533055700153553
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6529852240417038
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6536259159890067
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6522644456009882
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6532655267686488
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6531453970285296
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shap

prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6219917510909294
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6534256997554745
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6528650943015846
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6535458294955938
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6520241861207495
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.62411404316637
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6537060024824195
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6525447483279331
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6530252672884103
solution.shape (300,)
prediction.s

0.6528250510548781
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.639971168862115
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6537860889758323
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6521843591075753
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6533055700153553
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.653185440275236
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6405718175627115
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6536659592357131
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6522644456009882
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6533055700

patient_ids.shape (300,)
events.shape (300,)
0.6529852240417038
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6443358827531156
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6535858727423002
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.652344532094401
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.653465743002181
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6530252672884103
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6445360989866478
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6535057862488873
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6522644456009882
solution.shape (300,)
prediction.shape (300,)
patient_ids.

solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6560685540380987
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6578705001398879
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6515837104069789
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6104993392861848
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6605533976692185
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6597525327350899
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6574300244261172
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6587514515674293
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6551876026105573
solution.s

events.shape (300,)
0.6593921435147322
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6578705001398879
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6404116445758857
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6605533976692185
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.659472230008145
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6583910623470715
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6595122732548514
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6573899811794108
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6413326392501336
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shap

patient_ids.shape (300,)
events.shape (300,)
0.6492612020980061
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6604332679290993
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6606334841626313
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.659472230008145
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6605133544225121
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.65863132182731
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6499419372920153
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6604733111758057
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6602330516955671
solution.shape (300,)
prediction.shape (300,)
patient_ids.s

prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6596324029949707
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.660032835462035
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6591118407877872
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6524646618345202
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6606334841626313
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6601129219554478
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6591919272812
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6599127057219157
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6585912785806036
solution.shape (300,)
prediction.sha

0.6623553437710077
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6634765546787876
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
0.657309894685998
events.shape (300,)
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6521443158608689
0.6593921435147322
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6472189965159784
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6110999879867812
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6631562087051363
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6631562087051363
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.660833700

patient_ids.shape (300,)
events.shape (300,)
0.6630360789650169
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6615544788368792
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6392904336681058
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6641973331195034
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6641973331195034
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6637568574057326
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.662755776238072
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6631161654584299
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6628358627314849
solution.shape (300,)
prediction.shape (300,)
patient_ids

solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6637969006524391
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6629960357183106
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6637568574057326
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6639971168859712
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.647098866775859
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6638369438991455
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6640371601326777
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.663876987145852
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.663596684418907
solution.shap

events.shape (300,)
0.6533456132620616
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.664077203379384
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.664157289872797
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6640371601326777
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6634765546787876
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6640371601326777
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6636367276656134
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6533856565087681
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape (300,)
0.6639971168859712
solution.shape (300,)
prediction.shape (300,)
patient_ids.shape (300,)
events.shape

In [21]:
print("{:0.2f}".format(automl.cv_results_["mean_test_score"].max()))
print()
print(automl.sprint_statistics())
#automl._automl[0]._metric.name

0.66

auto-sklearn results:
  Dataset name: data_Q0G7b5t
  Metric: cindex
  Best validation score: 0.657450
  Number of target algorithm runs: 9
  Number of successful target algorithm runs: 8
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 1
  Number of target algorithms that exceeded the memory limit: 0



In [22]:
len(automl.cv_results_["mean_test_score"])

9

In [23]:
all_information = automl.get_models_with_weights()
index_regressor = 5
weights = []
print("Models used with corresponding weights :\n")
for weight, simple_regression_pipeline in all_information:
    print(str(weight)+" : "+simple_regression_pipeline[index_regressor].choice.__class__.__name__)
    weights.append(weight)
print()
print("sum(weights) = ", np.round(sum(weights),2))

Models used with corresponding weights :

0.62 : XGradientBoostingRegressor
0.3 : ExtraTreesRegressor
0.06 : RandomForest
0.02 : DecisionTree

sum(weights) =  1.0


### Test

In [24]:
df_test_clinical_data["SourceDataset"] = encoder.transform(df_test_clinical_data["SourceDataset"])

In [25]:
df_X_test = pd.concat([df_test_radiomics[["original_shape_Sphericity", 
                       "original_shape_SurfaceVolumeRatio", 
                       "original_shape_Maximum3DDiameter",
                       "original_firstorder_Entropy",
                       "original_glcm_Id",
                       "original_glcm_Idm"]],
                        df_test_clinical_data[["SourceDataset",
                                                  "Nstage"]]
                       ], sort=False, axis=1)

In [26]:
df_X_test = df_X_test.astype(float)
df_X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 125 entries, 13 to 274
Data columns (total 8 columns):
original_shape_Sphericity            125 non-null float64
original_shape_SurfaceVolumeRatio    125 non-null float64
original_shape_Maximum3DDiameter     125 non-null float64
original_firstorder_Entropy          125 non-null float64
original_glcm_Id                     125 non-null float64
original_glcm_Idm                    125 non-null float64
SourceDataset                        125 non-null float64
Nstage                               125 non-null float64
dtypes: float64(8)
memory usage: 8.8 KB


In [27]:
y_hat = automl.predict(df_X_test)

In [28]:
df_predicted_survival_time = pd.read_csv(os.path.join(submission_file_path, "random_submission_0vhlEZN.csv"), index_col=0)
df_predicted_survival_time.head(5)

Unnamed: 0_level_0,SurvivalTime,Event
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
13,788.417673,
155,427.650092,
404,173.587222,
407,389.877973,
9,1580.767244,


In [29]:
df_predicted_survival_time.index = df_test_clinical_data.index
df_predicted_survival_time["SurvivalTime"] = y_hat

In [30]:
df_predicted_survival_time.head(5)

Unnamed: 0_level_0,SurvivalTime,Event
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
13,432.096143,
155,786.511313,
404,566.460237,
407,639.784369,
9,1222.117695,


## $\color{red}{\text{To be continued}}$

### Annexe (just to check detail on training data)

In [31]:
y_hat = automl.predict(df_X_train)

df_hat = df_train_output.copy()
df_hat["SurvivalTime"] = y_hat
df_hat["Event"] = np.nan
cindex(df_train_output , df_hat)

0.7405720070865791