### Todo

- Add text, titles, sections etc.
- Add a reference to the paper and an MRI image for context
- A bit more data exploration
- Handle categorical attribute
- Mention at the end: 
    - Feature selection (e.g. MRMR)
    - Model selection (e.g. pycaret)
    - Model finetuning (e.g. optuna)

In [57]:
# Installing the repository in Google Colab
#
# NOTE: when enabling this cell, `%%capture` is a cell magic and must be the
# very first line of the cell (before any comment line), otherwise IPython
# rejects it.

# # Disable cell output
# %%capture

# # Download the repository from github
# !git clone https://github.com/5TuX/ml-radiomics-classification.git

# # Set working directory
# %cd ml-radiomics-classification

# # Install dependencies with uv
# !curl -LsSf https://astral.sh/uv/install.sh | sh
# import os
# # NOTE(review): recent uv installers appear to place the binary in
# # /root/.local/bin rather than /root/.cargo/bin — confirm and adjust below.
# os.environ["PATH"] = f"/root/.cargo/bin:{os.environ['PATH']}"
# !uv sync

In [58]:
# Imports

import polars as pl
from itertools import groupby
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Notebook settings:

In [59]:
# Tunable notebook parameters, gathered in a single dictionary.
settings = {}
# CSV file with the radiomic features (path relative to the working directory).
settings["dataset-path"] = "data/Radiomics_binWidth-15_ZScore_NETnNCR_T1CE.csv"

Load the data:

In [60]:
# Read the table from the location configured in `settings` and report its size.
df = pl.read_csv(source=settings["dataset-path"])
print("Dataset shape:", df.shape)

Dataset shape: (369, 1720)


Look at a few examples:

In [61]:
# Preview the first rows; with 1720 columns the display elides the middle ones ("…").
df.head()

Patient_ID,Group,Group_label,binWidth,Normalization,Age,Survival_days,Extent_of_Resection,Subregion,Sequence,diagnostics_Versions_PyRadiomics,diagnostics_Versions_Numpy,diagnostics_Versions_SimpleITK,diagnostics_Versions_PyWavelet,diagnostics_Versions_Python,diagnostics_Configuration_Settings,diagnostics_Configuration_EnabledImageTypes,diagnostics_Image-original_Hash,diagnostics_Image-original_Dimensionality,diagnostics_Image-original_Spacing,diagnostics_Image-original_Size,diagnostics_Image-original_Mean,diagnostics_Image-original_Minimum,diagnostics_Image-original_Maximum,diagnostics_Mask-original_Hash,diagnostics_Mask-original_Spacing,diagnostics_Mask-original_Size,diagnostics_Mask-original_BoundingBox,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,diagnostics_Mask-original_CenterOfMassIndex,diagnostics_Mask-original_CenterOfMass,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,…,wavelet-LLL_glrlm_GrayLevelNonUniformity,wavelet-LLL_glrlm_GrayLevelNonUniformityNormalized,wavelet-LLL_glrlm_GrayLevelVariance,wavelet-LLL_glrlm_HighGrayLevelRunEmphasis,wavelet-LLL_glrlm_LongRunEmphasis,wavelet-LLL_glrlm_LongRunHighGrayLevelEmphasis,wavelet-LLL_glrlm_LongRunLowGrayLevelEmphasis,wavelet-LLL_glrlm_LowGrayLevelRunEmphasis,wavelet-LLL_glrlm_RunEntropy,wavelet-LLL_glrlm_RunLengthNonUniformity,wavelet-LLL_glrlm_RunLengthNonUniformityNormalized,wavelet-LLL_glrlm_RunPercentage,wavelet-LLL_glrlm_RunVariance,wavelet-LLL_glrlm_ShortRunEmphasis,wavelet-LLL_glrlm_ShortRunHighGrayLevelEmphasis,wavelet-LLL_glrlm_ShortRunLowGrayLevelEmphasis,wavelet-LLL_glszm_GrayLevelNonUniformity,wavelet-LLL_glszm_GrayLevelNonUniformityNormalized,wavelet-LLL_glszm_GrayLevelVariance,wavelet-LLL_glszm_HighGrayLevelZoneEmphasis,wavelet-LLL_glszm_LargeAreaEmphasis,wavelet-LLL_glszm_LargeAreaHighGrayLevelEmphasis,wavelet-LLL_glszm_LargeAreaLowGrayLevelEmphasis,wavelet-LLL_glszm_Lo
wGrayLevelZoneEmphasis,wavelet-LLL_glszm_SizeZoneNonUniformity,wavelet-LLL_glszm_SizeZoneNonUniformityNormalized,wavelet-LLL_glszm_SmallAreaEmphasis,wavelet-LLL_glszm_SmallAreaHighGrayLevelEmphasis,wavelet-LLL_glszm_SmallAreaLowGrayLevelEmphasis,wavelet-LLL_glszm_ZoneEntropy,wavelet-LLL_glszm_ZonePercentage,wavelet-LLL_glszm_ZoneVariance,wavelet-LLL_ngtdm_Busyness,wavelet-LLL_ngtdm_Coarseness,wavelet-LLL_ngtdm_Complexity,wavelet-LLL_ngtdm_Contrast,wavelet-LLL_ngtdm_Strength
str,str,i64,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,f64,f64,str,str,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""BraTS20_Training_001""","""HGG""",1,"""binWidth-15""","""ZScore""",60.463,"""289""","""GTR""","""NETnNCR""","""T1CE""","""v3.0.1""","""1.20.3""","""2.1.1""","""1.1.1""","""3.9.7""","""{'minimumROIDimensions': 2, 'm…","""{'Original': {}, 'Exponential'…","""fcf31c6f56b4067eb28299303a0674…","""3D""","""(1.0, 1.0, 1.0)""","""(240, 240, 155)""",-6.4336e-16,-0.404772,11.492377,"""f3599f6c7ce9538e47d18beaef7292…","""(1.0, 1.0, 1.0)""","""(240, 240, 155)""","""(70, 102, 41, 50, 56, 37)""",15443.0,34.0,"""(89.23389237842387, 122.326749…","""(89.23389237842387, 122.326749…",0.731829,0.41979,21.139285,50.356792,50.358713,…,2837.692308,1.0,0.0,1.0,56.108317,56.108317,56.108317,1.0,3.683174,314.441224,0.108017,0.183753,23.691856,0.28482,0.28482,0.28482,34.0,1.0,0.0,1.0,6384000.0,6384000.0,6384000.0,1.0,5.470588,0.1609,0.379267,0.379267,0.379267,3.133984,0.002202,6177700.0,0.0,1000000.0,0.0,0.0,0.0
"""BraTS20_Training_002""","""HGG""",1,"""binWidth-15""","""ZScore""",52.263,"""616""","""GTR""","""NETnNCR""","""T1CE""","""v3.0.1""","""1.20.3""","""2.1.1""","""1.1.1""","""3.9.7""","""{'minimumROIDimensions': 2, 'm…","""{'Original': {}, 'Exponential'…","""58a926796869be2fcbfafa87f1e2d3…","""3D""","""(1.0, 1.0, 1.0)""","""(240, 240, 155)""",7.7498e-16,-0.438949,8.906679,"""f6ef775f5d12edcb4e8926782d7773…","""(1.0, 1.0, 1.0)""","""(240, 240, 155)""","""(67, 86, 37, 28, 32, 34)""",9160.0,9.0,"""(77.61626637554585, 101.946397…","""(77.61626637554585, 101.946397…",0.805201,0.596898,17.919111,30.020387,34.132096,…,931.230769,1.0,0.0,1.0,148.738015,148.738015,148.738015,1.0,4.063811,65.188726,0.068125,0.101663,37.847387,0.148596,0.148596,0.148596,9.0,1.0,0.0,1.0,9144900.0,9144900.0,9144900.0,1.0,1.0,0.111111,0.155282,0.155282,0.155282,3.169925,0.000983,8109000.0,0.0,1000000.0,0.0,0.0,0.0
"""BraTS20_Training_003""","""HGG""",1,"""binWidth-15""","""ZScore""",54.301,"""464""","""GTR""","""NETnNCR""","""T1CE""","""v3.0.1""","""1.20.3""","""2.1.1""","""1.1.1""","""3.9.7""","""{'minimumROIDimensions': 2, 'm…","""{'Original': {}, 'Exponential'…","""923aabc1a6c23ad6accc67bcd26fe1…","""3D""","""(1.0, 1.0, 1.0)""","""(240, 240, 155)""",2.7571e-16,-0.389427,11.927897,"""27f9116fc7420ab3c6eedf267830db…","""(1.0, 1.0, 1.0)""","""(240, 240, 155)""","""(160, 149, 61, 16, 19, 18)""",733.0,7.0,"""(168.64529331514325, 158.99317…","""(168.64529331514325, 158.99317…",0.768372,0.722447,11.550058,15.987407,18.681542,…,291.153846,1.0,0.0,1.0,10.77577,10.77577,10.77577,1.0,2.387787,77.714144,0.259495,0.397209,3.934461,0.500213,0.500213,0.500213,7.0,1.0,0.0,1.0,62367.571429,62367.571429,62367.571429,1.0,1.285714,0.183673,0.298127,0.298127,0.298127,2.521641,0.00955,51402.489796,0.0,1000000.0,0.0,0.0,0.0
"""BraTS20_Training_004""","""HGG""",1,"""binWidth-15""","""ZScore""",39.068,"""788""","""GTR""","""NETnNCR""","""T1CE""","""v3.0.1""","""1.20.3""","""2.1.1""","""1.1.1""","""3.9.7""","""{'minimumROIDimensions': 2, 'm…","""{'Original': {}, 'Exponential'…","""55fa7fba043c7c44a6e842b5a586a4…","""3D""","""(1.0, 1.0, 1.0)""","""(240, 240, 155)""",-5.9757e-16,-0.438407,11.005426,"""07742fbe85b3d48e3aa5ff6ad53873…","""(1.0, 1.0, 1.0)""","""(240, 240, 155)""","""(149, 150, 64, 31, 44, 44)""",10902.0,37.0,"""(162.29728490185286, 169.57576…","""(162.29728490185286, 169.57576…",0.880563,0.556199,21.603687,38.841633,42.720019,…,2986.769231,1.0,0.0,1.0,27.455009,27.455009,27.455009,1.0,3.051311,557.843589,0.182311,0.273965,13.085883,0.40605,0.40605,0.40605,37.0,1.0,0.0,1.0,3022800.0,3022800.0,3022800.0,1.0,5.594595,0.151205,0.360428,0.360428,0.360428,3.230669,0.003394,2936000.0,0.0,1000000.0,0.0,0.0,0.0
"""BraTS20_Training_005""","""HGG""",1,"""binWidth-15""","""ZScore""",68.493,"""465""","""GTR""","""NETnNCR""","""T1CE""","""v3.0.1""","""1.20.3""","""2.1.1""","""1.1.1""","""3.9.7""","""{'minimumROIDimensions': 2, 'm…","""{'Original': {}, 'Exponential'…","""6c94ee5878e0aca66cb21e828c50fe…","""3D""","""(1.0, 1.0, 1.0)""","""(240, 240, 155)""",-4.8643e-17,-0.416155,12.239193,"""5fec5dd1de3d85905a292c12f948d4…","""(1.0, 1.0, 1.0)""","""(240, 240, 155)""","""(123, 157, 84, 52, 34, 42)""",3624.0,37.0,"""(155.71136865342163, 172.22599…","""(155.71136865342163, 172.22599…",0.342747,0.309231,20.04694,64.828365,60.60528,…,1608.791873,0.985289,0.007356,1.022234,8.446223,8.469023,8.440523,0.994442,2.250301,508.396876,0.305674,0.450458,3.256455,0.552665,0.574758,0.547142,30.148936,0.641467,0.179267,1.702128,168949.234043,168950.12766,168949.010638,0.824468,17.851064,0.37981,0.634417,1.288673,0.470853,2.825271,0.012969,163003.839746,2.292387,0.220302,0.002505,1.2e-05,0.153538


Look at basic statistics:

In [62]:
# Per-column summary statistics. In the recorded output the null_count row shows
# 133 missing values for Age and Survival_days, 240 for Extent_of_Resection,
# and 1 for each displayed diagnostics / radiomic feature column.
df.describe()

statistic,Patient_ID,Group,Group_label,binWidth,Normalization,Age,Survival_days,Extent_of_Resection,Subregion,Sequence,diagnostics_Versions_PyRadiomics,diagnostics_Versions_Numpy,diagnostics_Versions_SimpleITK,diagnostics_Versions_PyWavelet,diagnostics_Versions_Python,diagnostics_Configuration_Settings,diagnostics_Configuration_EnabledImageTypes,diagnostics_Image-original_Hash,diagnostics_Image-original_Dimensionality,diagnostics_Image-original_Spacing,diagnostics_Image-original_Size,diagnostics_Image-original_Mean,diagnostics_Image-original_Minimum,diagnostics_Image-original_Maximum,diagnostics_Mask-original_Hash,diagnostics_Mask-original_Spacing,diagnostics_Mask-original_Size,diagnostics_Mask-original_BoundingBox,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,diagnostics_Mask-original_CenterOfMassIndex,diagnostics_Mask-original_CenterOfMass,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,…,wavelet-LLL_glrlm_GrayLevelNonUniformity,wavelet-LLL_glrlm_GrayLevelNonUniformityNormalized,wavelet-LLL_glrlm_GrayLevelVariance,wavelet-LLL_glrlm_HighGrayLevelRunEmphasis,wavelet-LLL_glrlm_LongRunEmphasis,wavelet-LLL_glrlm_LongRunHighGrayLevelEmphasis,wavelet-LLL_glrlm_LongRunLowGrayLevelEmphasis,wavelet-LLL_glrlm_LowGrayLevelRunEmphasis,wavelet-LLL_glrlm_RunEntropy,wavelet-LLL_glrlm_RunLengthNonUniformity,wavelet-LLL_glrlm_RunLengthNonUniformityNormalized,wavelet-LLL_glrlm_RunPercentage,wavelet-LLL_glrlm_RunVariance,wavelet-LLL_glrlm_ShortRunEmphasis,wavelet-LLL_glrlm_ShortRunHighGrayLevelEmphasis,wavelet-LLL_glrlm_ShortRunLowGrayLevelEmphasis,wavelet-LLL_glszm_GrayLevelNonUniformity,wavelet-LLL_glszm_GrayLevelNonUniformityNormalized,wavelet-LLL_glszm_GrayLevelVariance,wavelet-LLL_glszm_HighGrayLevelZoneEmphasis,wavelet-LLL_glszm_LargeAreaEmphasis,wavelet-LLL_glszm_LargeAreaHighGrayLevelEmphasis,wavelet-LLL_glszm_LargeAreaLowGrayLevelEmphasis,wavelet-LLL_glszm_LowGrayLevelZoneEmphasis,wavele
t-LLL_glszm_SizeZoneNonUniformity,wavelet-LLL_glszm_SizeZoneNonUniformityNormalized,wavelet-LLL_glszm_SmallAreaEmphasis,wavelet-LLL_glszm_SmallAreaHighGrayLevelEmphasis,wavelet-LLL_glszm_SmallAreaLowGrayLevelEmphasis,wavelet-LLL_glszm_ZoneEntropy,wavelet-LLL_glszm_ZonePercentage,wavelet-LLL_glszm_ZoneVariance,wavelet-LLL_ngtdm_Busyness,wavelet-LLL_ngtdm_Coarseness,wavelet-LLL_ngtdm_Complexity,wavelet-LLL_ngtdm_Contrast,wavelet-LLL_ngtdm_Strength
str,str,str,f64,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,f64,f64,str,str,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""369""","""369""",369.0,"""369""","""369""",236.0,"""236""","""129""","""369""","""369""","""368""","""368""","""368""","""368""","""368""","""368""","""368""","""368""","""368""","""368""","""368""",368.0,368.0,368.0,"""368""","""368""","""368""","""368""",368.0,368.0,"""368""","""368""",368.0,368.0,368.0,368.0,…,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0
"""null_count""","""0""","""0""",0.0,"""0""","""0""",133.0,"""133""","""240""","""0""","""0""","""1""","""1""","""1""","""1""","""1""","""1""","""1""","""1""","""1""","""1""","""1""",1.0,1.0,1.0,"""1""","""1""","""1""","""1""",1.0,1.0,"""1""","""1""",1.0,1.0,1.0,1.0,…,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"""mean""",,,0.794038,,,61.223203,,,,,,,,,,,,,,,,-1.3152e-17,-0.419098,10.345853,,,,,22179.336957,33.103261,,,0.723042,0.555079,24.485618,45.730192,…,2963.677357,0.994174,0.002913,1.352574,101.552119,169.249656,84.627828,0.911909,3.333423,422.079383,0.182812,0.256798,38.762551,0.368986,0.469273,0.343959,33.113254,0.945609,0.030401,1.31665,376380000.0,488460000.0,348360000.0,0.924942,9.287271,0.324657,0.428875,0.594118,0.38953,2.38682,0.009033,105550000.0,9.729446,760869.714043,0.0013,0.000169,0.100075
"""std""",,,0.404952,,,11.874114,,,,,,,,,,,,,,,,4.3709e-16,0.024681,2.671905,,,,,29878.14703,35.848002,,,0.155188,0.147826,10.167512,19.301596,…,2666.229821,0.037481,0.018741,0.954343,144.981048,351.244154,128.949013,0.23844,1.007408,403.022144,0.133684,0.167152,49.280637,0.171601,0.335728,0.190336,35.204095,0.129886,0.082513,0.822908,2086400000.0,2318300000.0,2074500000.0,0.190537,10.806416,0.216648,0.185001,0.522212,0.186838,0.913765,0.022325,541600000.0,120.791894,427133.013751,0.010525,0.002248,0.280222
"""min""","""BraTS20_Training_001""","""HGG""",0.0,"""binWidth-15""","""ZScore""",18.975,"""1020""","""GTR""","""NETnNCR""","""T1CE""","""v3.0.1""","""1.20.3""","""2.1.1""","""1.1.1""","""3.9.7""","""{'minimumROIDimensions': 2, 'm…","""{'Original': {}, 'Exponential'…","""00d17d31c60689868bffb781d60587…","""3D""","""(1.0, 1.0, 1.0)""","""(240, 240, 155)""",-1.0822e-15,-0.50596,5.281236,"""0025e0bf9a63fc731cbd4286d5090c…","""(1.0, 1.0, 1.0)""","""(240, 240, 155)""","""(100, 122, 78, 57, 54, 40)""",47.0,1.0,"""(100.07977976897334, 145.51592…","""(100.07977976897334, 145.51592…",0.222498,0.124907,3.738339,7.287142,…,24.230769,0.51188,0.0,1.0,1.401858,1.401858,1.011264,0.249277,0.463787,10.058603,0.025891,0.042804,0.16028,0.049052,0.049052,0.021336,1.0,0.333333,0.0,1.0,46.769231,46.769231,46.769231,0.252768,1.0,0.080332,4.0048e-11,4.0048e-11,4.0048e-11,-3.2034e-16,6e-06,0.0,0.0,0.000213,0.0,0.0,0.0
"""25%""",,,1.0,,,54.279,,,,,,,,,,,,,,,,-3.4967e-16,-0.434824,8.488869,,,,,3916.0,8.0,,,0.634073,0.464006,17.272473,33.002982,…,1073.153846,1.0,0.0,1.0,17.366738,17.480936,16.36676,1.0,2.708348,139.166628,0.090103,0.13737,7.674577,0.248304,0.279907,0.189201,8.285714,1.0,0.0,1.0,461810.015625,502626.625,442701.042373,1.0,2.142857,0.194471,0.337449,0.349895,0.284439,2.034941,0.000739,202644.637755,0.0,1000000.0,0.0,0.0,0.0
"""50%""",,,1.0,,,61.526,,,,,,,,,,,,,,,,-2.6079e-17,-0.419837,9.997395,,,,,10590.0,22.0,,,0.745096,0.575729,23.508105,43.981132,…,2347.154719,1.0,0.0,1.0,43.578971,46.046788,37.590433,1.0,3.366516,310.531785,0.15146,0.216342,20.198637,0.362393,0.393413,0.341103,22.0,1.0,0.0,1.0,4253200.0,4609100.0,3884900.0,1.0,5.625,0.273288,0.471941,0.5,0.420159,2.535858,0.00305,2097300.0,0.0,1000000.0,0.0,0.0,0.0
"""75%""",,,1.0,,,69.178,,,,,,,,,,,,,,,,3.1513e-16,-0.404016,12.003048,,,,,25394.0,46.0,,,0.841307,0.662864,30.729417,55.860815,…,4221.153846,1.0,0.0,1.0,112.832558,139.451056,98.480431,1.0,4.035562,568.203129,0.232698,0.327519,49.737264,0.472026,0.540567,0.469102,44.166667,1.0,0.0,1.0,29597000.0,40328000.0,27477000.0,1.0,12.322034,0.346939,0.553092,0.610795,0.530467,2.977079,0.007737,15137000.0,0.0,1000000.0,0.0,0.0,0.0
"""max""","""BraTS20_Training_369""","""LGG""",1.0,"""binWidth-15""","""ZScore""",86.652,"""ALIVE (361 days later)""","""STR""","""NETnNCR""","""T1CE""","""v3.0.1""","""1.20.3""","""2.1.1""","""1.1.1""","""3.9.7""","""{'minimumROIDimensions': 2, 'm…","""{'Original': {}, 'Exponential'…","""feca6e10bcc8b3b7a9097190e4cc2c…","""3D""","""(1.0, 1.0, 1.0)""","""(240, 240, 155)""",1.0307e-15,-0.331704,35.951043,"""fdf0ff3c0badd2e8a8a0881244d27f…","""(1.0, 1.0, 1.0)""","""(240, 240, 155)""","""(99, 84, 72, 44, 51, 50)""",189152.0,270.0,"""(99.95001372906317, 88.9456828…","""(99.95001372906317, 88.9456828…",0.984177,0.878147,59.587263,171.979417,…,16845.846154,1.0,0.24406,4.058931,879.823903,2593.162303,879.823903,1.0,5.492313,3019.874758,0.835281,0.898687,299.155169,0.929974,2.792085,0.929974,269.00738,1.0,0.734375,6.375,24970000000.0,24970000000.0,24970000000.0,1.0,75.723247,1.0,0.805556,3.087144,0.805556,4.384792,0.216667,7950500000.0,1937.165966,1000000.0,0.163813,0.037715,1.833333


Explore structure of column names:

In [63]:
# Report the total number of columns before breaking the names down by prefix.
print(f"{len(df.columns)} columns in total.\n")


def key(colname):
    """Return the part of ``colname`` that precedes its first underscore.

    A name that contains no underscore is returned unchanged.
    """
    prefix, _, _ = colname.partition("_")
    return prefix


def _subgroup_of(colname):
    """Return the second "_"-separated part of ``colname``.

    Returns None when the name has fewer than three parts, i.e. when nothing
    would be left to display after removing the group and sub-group prefixes.
    """
    parts = colname.split("_", 2)
    return parts[1] if len(parts) == 3 else None


# Maximum number of feature names listed per sub-group before truncating.
maxlen = 3

# NOTE: itertools.groupby only merges *consecutive* items, so this summary
# relies on related columns being adjacent in the table.
for column_group, columns in groupby(df.columns, key):
    print(column_group)
    columns = list(columns)
    if len(columns) == 1:
        print(f"\t-> {columns}")
        continue
    for column_subgroup, members in groupby(columns, _subgroup_of):
        members = list(members)
        if column_subgroup is None:
            # Names such as "Group" / "Group_label" have no feature part left
            # once the prefixes are stripped (they used to be printed as '');
            # list them by their full name instead.
            print(f"\t-> {members}")
            continue
        # Keep only the feature name, i.e. everything after "<group>_<subgroup>_".
        subcolumns = [colname.split("_", 2)[2] for colname in members]
        suffix = ""
        if (lensubcols := len(subcolumns)) > maxlen:
            subcolumns = subcolumns[:maxlen]
            suffix = f"... ({lensubcols} total)"
        print(f"\t{column_subgroup} -> {subcolumns} {suffix}")

1720 columns in total.

Patient
	-> ['Patient_ID']
Group
	 -> [''] 
	label -> [''] 
binWidth
	-> ['binWidth']
Normalization
	-> ['Normalization']
Age
	-> ['Age']
Survival
	-> ['Survival_days']
Extent
	-> ['Extent_of_Resection']
Subregion
	-> ['Subregion']
Sequence
	-> ['Sequence']
diagnostics
	Versions -> ['PyRadiomics', 'Numpy', 'SimpleITK'] ... (5 total)
	Configuration -> ['Settings', 'EnabledImageTypes'] 
	Image-original -> ['Hash', 'Dimensionality', 'Spacing'] ... (7 total)
	Mask-original -> ['Hash', 'Spacing', 'Size'] ... (8 total)
original
	shape -> ['Elongation', 'Flatness', 'LeastAxisLength'] ... (14 total)
	firstorder -> ['10Percentile', '90Percentile', 'Energy'] ... (18 total)
	glcm -> ['Autocorrelation', 'ClusterProminence', 'ClusterShade'] ... (24 total)
	gldm -> ['DependenceEntropy', 'DependenceNonUniformity', 'DependenceNonUniformityNormalized'] ... (14 total)
	glrlm -> ['GrayLevelNonUniformity', 'GrayLevelNonUniformityNormalized', 'GrayLevelVariance'] ... (16 total)
	gls

Select columns:

In [64]:
# Exclude metadata columns

df = df.select(
    pl.all()
    .exclude(
        [
            "Patient_ID",
            "Group_label",
            "binWidth",
            "Normalization",
            "Subregion",
            "Sequence",
        ]
    )
    .exclude("^diagnostics_.*$")
)

# Select inputs and targets columns

# Target variables: ignore Survival_days (too many missing values)
targets = ["Group"]
# Clinical inputs: ignore Age and Extent_of_Resection (too many missing values):
inputs_clinical = []
# Radiomic inputs: keep only basic radiomic features and ignore the rest:
radiomics_prefix = "original_"
inputs_radiomics = [col for col in df.columns if col.startswith(radiomics_prefix)]

# This cell overwrites `df` and strips the prefix, so running it a second time
# would find no radiomic columns and silently reduce `df` to the target only.
if not inputs_radiomics:
    raise ValueError(
        f"No column starts with {radiomics_prefix!r}; if this cell was already "
        "run, reload the dataset before running it again."
    )

# Strip the prefix using its actual length rather than a hard-coded offset.
df = df.select(targets + inputs_clinical + inputs_radiomics).rename(
    {col: col[len(radiomics_prefix) :] for col in inputs_radiomics}
)

Handle missing values:

In [65]:
# Remove every row that contains at least one null value.
# In the recorded run this drops a single row (369 -> 368).
df = df.drop_nulls()
print("Dataset shape after dropping nulls:", df.shape)

Dataset shape after dropping nulls: (368, 108)


Look again at a few examples and basic statistics:

In [66]:
# Preview the reduced frame: the target column followed by the renamed radiomic features.
df.head()

Group,shape_Elongation,shape_Flatness,shape_LeastAxisLength,shape_MajorAxisLength,shape_Maximum2DDiameterColumn,shape_Maximum2DDiameterRow,shape_Maximum2DDiameterSlice,shape_Maximum3DDiameter,shape_MeshVolume,shape_MinorAxisLength,shape_Sphericity,shape_SurfaceArea,shape_SurfaceVolumeRatio,shape_VoxelVolume,firstorder_10Percentile,firstorder_90Percentile,firstorder_Energy,firstorder_Entropy,firstorder_InterquartileRange,firstorder_Kurtosis,firstorder_Maximum,firstorder_MeanAbsoluteDeviation,firstorder_Mean,firstorder_Median,firstorder_Minimum,firstorder_Range,firstorder_RobustMeanAbsoluteDeviation,firstorder_RootMeanSquared,firstorder_Skewness,firstorder_TotalEnergy,firstorder_Uniformity,firstorder_Variance,glcm_Autocorrelation,glcm_ClusterProminence,glcm_ClusterShade,glcm_ClusterTendency,…,glrlm_GrayLevelNonUniformity,glrlm_GrayLevelNonUniformityNormalized,glrlm_GrayLevelVariance,glrlm_HighGrayLevelRunEmphasis,glrlm_LongRunEmphasis,glrlm_LongRunHighGrayLevelEmphasis,glrlm_LongRunLowGrayLevelEmphasis,glrlm_LowGrayLevelRunEmphasis,glrlm_RunEntropy,glrlm_RunLengthNonUniformity,glrlm_RunLengthNonUniformityNormalized,glrlm_RunPercentage,glrlm_RunVariance,glrlm_ShortRunEmphasis,glrlm_ShortRunHighGrayLevelEmphasis,glrlm_ShortRunLowGrayLevelEmphasis,glszm_GrayLevelNonUniformity,glszm_GrayLevelNonUniformityNormalized,glszm_GrayLevelVariance,glszm_HighGrayLevelZoneEmphasis,glszm_LargeAreaEmphasis,glszm_LargeAreaHighGrayLevelEmphasis,glszm_LargeAreaLowGrayLevelEmphasis,glszm_LowGrayLevelZoneEmphasis,glszm_SizeZoneNonUniformity,glszm_SizeZoneNonUniformityNormalized,glszm_SmallAreaEmphasis,glszm_SmallAreaHighGrayLevelEmphasis,glszm_SmallAreaLowGrayLevelEmphasis,glszm_ZoneEntropy,glszm_ZonePercentage,glszm_ZoneVariance,ngtdm_Busyness,ngtdm_Coarseness,ngtdm_Complexity,ngtdm_Contrast,ngtdm_Strength
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""HGG""",0.731829,0.41979,21.139285,50.356792,50.358713,59.539903,50.990195,63.85922,15226.333333,36.852581,0.333812,8899.775397,0.584499,15443.0,0.891341,2.406696,43805.647206,0.000994,0.76735,3.905945,4.857057,0.476079,1.57019,1.471689,-0.037217,4.894274,0.322622,1.684222,0.845454,43805.647206,0.99987,0.371107,3.999682,0.000159,-0.000159,0.000159,…,2837.693033,0.999275,0.000362,3.998912,56.052892,224.210478,14.013495,0.250272,3.685716,315.192629,0.108203,0.183882,23.686554,0.285304,1.140127,0.071598,33.057143,0.94449,0.027755,3.914286,6200800.0,24803000.0,1550200.0,0.271429,5.971429,0.170612,0.397002,1.502295,0.120679,3.231618,0.002266,6006100.0,0.300826,0.831125,0.000156,9.2377e-09,0.90777
"""HGG""",0.805201,0.596898,17.919111,30.020387,34.132096,34.525353,31.144823,35.496479,9073.541667,24.172434,0.73062,2879.455924,0.317346,9160.0,0.511066,1.20927,7040.180407,-3.2034e-16,0.263257,10.595436,3.429786,0.233776,0.801736,0.711371,0.013167,3.416619,0.120232,0.876686,2.380337,7040.180407,1.0,0.125799,1.0,0.0,0.0,0.0,…,931.230769,1.0,0.0,1.0,148.738015,148.738015,148.738015,1.0,4.063811,65.188726,0.068125,0.101663,37.847387,0.148596,0.148596,0.148596,9.0,1.0,0.0,1.0,9144900.0,9144900.0,9144900.0,1.0,1.0,0.111111,0.155282,0.155282,0.155282,3.169925,0.000983,8109000.0,0.0,1000000.0,0.0,0.0,0.0
"""HGG""",0.768372,0.722447,11.550058,15.987407,18.681542,18.384776,18.681542,22.135944,684.083333,12.284284,0.406249,924.196735,1.351,733.0,1.441948,3.355092,4630.563368,-3.2034e-16,1.090258,2.45011,4.360985,0.592579,2.407835,2.420584,0.713811,3.647174,0.43292,2.513419,0.055125,4630.563368,1.0,0.519605,1.0,0.0,0.0,0.0,…,291.153846,1.0,0.0,1.0,10.77577,10.77577,10.77577,1.0,2.387787,77.714144,0.259495,0.397209,3.934461,0.500213,0.500213,0.500213,7.0,1.0,0.0,1.0,62367.571429,62367.571429,62367.571429,1.0,1.285714,0.183673,0.298127,0.298127,0.298127,2.521641,0.00955,51402.489796,0.0,1000000.0,0.0,0.0,0.0
"""HGG""",0.880563,0.556199,21.603687,38.841633,42.720019,46.238512,44.384682,46.914816,10608.041667,34.202508,0.246069,9488.192833,0.894434,10902.0,0.862828,2.445411,32015.142395,-3.2034e-16,0.977685,2.574108,4.196802,0.524396,1.598139,1.523996,0.342334,3.854469,0.404497,1.71366,0.501066,32015.142395,1.0,0.382583,1.0,0.0,0.0,0.0,…,2986.769231,1.0,0.0,1.0,27.455009,27.455009,27.455009,1.0,3.051311,557.843589,0.182311,0.273965,13.085883,0.40605,0.40605,0.40605,37.0,1.0,0.0,1.0,3022800.0,3022800.0,3022800.0,1.0,5.594595,0.151205,0.360428,0.360428,0.360428,3.230669,0.003394,2936000.0,0.0,1000000.0,0.0,0.0,0.0
"""HGG""",0.342747,0.309231,20.04694,64.828365,60.60528,32.280025,48.754487,61.43289,3209.833333,22.219752,0.201071,5233.470765,1.630449,3624.0,1.890351,3.694297,29902.191747,-3.2034e-16,0.940189,3.291029,5.82697,0.555445,2.785681,2.769665,0.639021,5.187949,0.391193,2.872483,0.463672,29902.191747,1.0,0.491141,1.0,0.0,0.0,0.0,…,1629.384615,1.0,0.0,1.0,8.473932,8.473932,8.473932,1.0,2.199209,505.603602,0.304522,0.449609,3.261904,0.551107,0.551107,0.551107,37.0,1.0,0.0,1.0,216977.945946,216977.945946,216977.945946,1.0,12.027027,0.325055,0.582357,0.582357,0.582357,2.332735,0.01021,207384.537619,0.0,1000000.0,0.0,0.0,0.0


In [67]:
# Summary statistics of the reduced frame; the recorded null_count row is now all zeros.
df.describe()

statistic,Group,shape_Elongation,shape_Flatness,shape_LeastAxisLength,shape_MajorAxisLength,shape_Maximum2DDiameterColumn,shape_Maximum2DDiameterRow,shape_Maximum2DDiameterSlice,shape_Maximum3DDiameter,shape_MeshVolume,shape_MinorAxisLength,shape_Sphericity,shape_SurfaceArea,shape_SurfaceVolumeRatio,shape_VoxelVolume,firstorder_10Percentile,firstorder_90Percentile,firstorder_Energy,firstorder_Entropy,firstorder_InterquartileRange,firstorder_Kurtosis,firstorder_Maximum,firstorder_MeanAbsoluteDeviation,firstorder_Mean,firstorder_Median,firstorder_Minimum,firstorder_Range,firstorder_RobustMeanAbsoluteDeviation,firstorder_RootMeanSquared,firstorder_Skewness,firstorder_TotalEnergy,firstorder_Uniformity,firstorder_Variance,glcm_Autocorrelation,glcm_ClusterProminence,glcm_ClusterShade,…,glrlm_GrayLevelNonUniformity,glrlm_GrayLevelNonUniformityNormalized,glrlm_GrayLevelVariance,glrlm_HighGrayLevelRunEmphasis,glrlm_LongRunEmphasis,glrlm_LongRunHighGrayLevelEmphasis,glrlm_LongRunLowGrayLevelEmphasis,glrlm_LowGrayLevelRunEmphasis,glrlm_RunEntropy,glrlm_RunLengthNonUniformity,glrlm_RunLengthNonUniformityNormalized,glrlm_RunPercentage,glrlm_RunVariance,glrlm_ShortRunEmphasis,glrlm_ShortRunHighGrayLevelEmphasis,glrlm_ShortRunLowGrayLevelEmphasis,glszm_GrayLevelNonUniformity,glszm_GrayLevelNonUniformityNormalized,glszm_GrayLevelVariance,glszm_HighGrayLevelZoneEmphasis,glszm_LargeAreaEmphasis,glszm_LargeAreaHighGrayLevelEmphasis,glszm_LargeAreaLowGrayLevelEmphasis,glszm_LowGrayLevelZoneEmphasis,glszm_SizeZoneNonUniformity,glszm_SizeZoneNonUniformityNormalized,glszm_SmallAreaEmphasis,glszm_SmallAreaHighGrayLevelEmphasis,glszm_SmallAreaLowGrayLevelEmphasis,glszm_ZoneEntropy,glszm_ZonePercentage,glszm_ZoneVariance,ngtdm_Busyness,ngtdm_Coarseness,ngtdm_Complexity,ngtdm_Contrast,ngtdm_Strength
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""368""",368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,…,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0,368.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,0.723042,0.555079,24.485618,45.730192,45.057,51.053797,49.333276,56.808197,21862.7851,32.004908,0.376242,9237.61258,0.923609,22179.336957,1.374584,2.522678,74696.113733,0.007873,0.626598,3.996735,4.162478,0.36554,1.920441,1.882498,0.571002,3.591476,0.261605,1.985013,0.372539,74696.113733,0.996985,0.23679,1.530893,0.006383,-0.003187,…,2971.81061,0.994031,0.002985,1.526069,100.816023,186.329359,79.43769,0.868483,3.329075,428.802215,0.183117,0.2571,38.653377,0.369848,0.53148,0.32944,34.180723,0.959979,0.02001,1.353266,336950000.0,505500000.0,294810000.0,0.911684,9.541138,0.32483,0.43707,0.604474,0.395219,2.36478,0.008932,111430000.0,14.646119,826087.014728,0.0014,0.000218,0.055786
"""std""",,0.155188,0.147826,10.167512,19.301596,16.080009,20.17874,19.738679,21.27765,29860.458179,12.507605,0.155833,8357.631322,0.910963,29878.14703,0.524901,0.51373,101634.496284,0.067677,0.276941,3.48061,1.281748,0.143954,0.490192,0.526195,0.592212,1.55996,0.111509,0.474185,0.726591,101634.496284,0.032312,0.185438,1.140018,0.065065,0.030067,…,2678.237799,0.03868,0.01934,1.1305,143.823868,380.737352,123.451225,0.282625,1.00734,424.904352,0.133404,0.166938,49.160277,0.170914,0.427127,0.195481,35.946525,0.10927,0.054635,0.870037,1894300000.0,2763300000.0,1831100000.0,0.217509,11.072411,0.214116,0.182395,0.521023,0.192639,0.912152,0.022245,586740000.0,181.567308,379550.608458,0.013492,0.002967,0.199405
"""min""","""HGG""",0.222498,0.124907,3.738339,7.287142,8.062258,7.28011,5.385165,9.0,22.083333,5.415523,0.096556,68.848149,0.131759,47.0,-0.190334,0.736837,370.774559,-3.2034e-16,0.153756,1.752027,1.681907,0.093379,0.203953,0.105943,-0.454909,0.528899,0.064838,0.431033,-3.62028,370.774559,0.534089,0.01326,1.0,0.0,-0.472291,…,24.230769,0.50779,0.0,1.0,1.401858,1.401858,0.70266,0.250107,0.463787,10.058603,0.025888,0.042804,0.16028,0.049052,0.049052,0.021305,1.0,0.5,0.0,1.0,46.769231,46.769231,46.769231,0.255952,1.0,0.080332,4.0048e-11,4.0048e-11,4.0048e-11,-3.2034e-16,6e-06,0.0,0.0,0.000141,0.0,0.0,0.0
"""25%""",,0.634073,0.464006,17.272473,33.002982,33.615473,36.796739,35.171011,42.296572,3599.625,22.601235,0.255762,3351.436318,0.417864,3916.0,0.979455,2.190171,15821.058967,-3.2034e-16,0.434014,2.627947,3.155506,0.264655,1.598257,1.526641,0.12344,2.401487,0.185134,1.684222,-0.086409,15821.058967,1.0,0.118225,1.0,0.0,0.0,…,1073.153846,1.0,0.0,1.0,17.366738,17.68425,13.858298,1.0,2.709055,139.281601,0.09154,0.137531,7.674577,0.249261,0.285863,0.162384,9.0,1.0,0.0,1.0,461810.015625,518611.833333,358783.870588,1.0,2.166667,0.195924,0.346738,0.354169,0.277291,2.0,0.000764,206626.107085,0.0,1000000.0,0.0,0.0,0.0
"""50%""",,0.745096,0.575729,23.508105,43.981132,44.407207,50.219518,49.406477,56.471232,10340.5,31.362378,0.34617,7331.645871,0.695517,10590.0,1.327203,2.508456,41385.619211,-3.2034e-16,0.586345,3.262533,3.778643,0.346395,1.901583,1.847197,0.505102,3.35169,0.242905,1.964811,0.387561,41385.619211,1.0,0.190868,1.0,0.0,0.0,…,2348.384615,1.0,0.0,1.0,43.578971,52.818404,33.627387,1.0,3.366516,310.531785,0.153595,0.216342,20.198637,0.364618,0.412441,0.32097,22.0,1.0,0.0,1.0,4333200.0,4902100.0,3042000.0,1.0,5.769231,0.27388,0.478282,0.501914,0.432565,2.508817,0.002867,2111300.0,0.0,1000000.0,0.0,0.0,0.0
"""75%""",,0.841307,0.662864,30.729417,55.860815,55.713553,63.906181,61.188234,69.318107,25147.291667,39.560643,0.469189,12960.865504,1.102541,25394.0,1.757149,2.800596,90886.79294,-3.2034e-16,0.789395,4.172358,5.021085,0.461592,2.238754,2.250661,0.973464,4.641944,0.327086,2.290388,0.778114,90886.79294,1.0,0.312538,1.0,0.0,0.0,…,4221.153846,1.0,0.0,1.0,112.962502,171.722879,91.251669,1.0,4.035562,568.203129,0.232698,0.327519,49.737264,0.47274,0.609252,0.462379,46.0,1.0,0.0,1.0,30101000.0,41876000.0,20333272.0,1.0,12.742857,0.34375,0.559436,0.597117,0.535245,2.947703,0.00765,15384000.0,0.0,1000000.0,0.0,0.0,0.0
"""max""","""LGG""",0.984177,0.878147,59.587263,171.979417,105.475116,113.216607,117.885538,132.07195,188726.0,70.539774,0.831783,52815.73818,9.842839,189152.0,3.048159,4.654783,874135.428433,0.950246,2.081183,50.929231,8.818044,1.103804,3.364545,3.396505,2.232331,9.234909,0.839303,3.372971,4.427729,874135.428433,1.0,1.726145,4.0,0.914482,0.0,…,16845.846154,1.0,0.246105,3.99957,879.823903,2928.225613,879.823903,1.0,5.492323,3024.76271,0.835281,0.898687,299.155169,0.929974,2.972293,0.929974,267.116364,1.0,0.25,3.97619,24970000000.0,38177000000.0,24970000000.0,1.0,78.549091,1.0,0.857143,2.667181,0.857143,4.328883,0.216667,7950500000.0,3093.259142,1000000.0,0.199553,0.046437,1.236522


Train/test split:

In [68]:
print("Split the data in Train and Test subsets.")

# Hold out 20 % of the rows for testing; stratifying on the class label keeps
# the HGG/LGG ratio the same in both subsets, and the fixed seed makes the
# split reproducible.
split_options = {
    "test_size": 0.20,
    "stratify": df["Group"],
    "random_state": 42,
}
training_data, test_data = train_test_split(df, **split_options)


def check_class_proportions(df: pl.DataFrame, target: str = "Group") -> pl.DataFrame:
    """Count the rows of each class and compute their share of the total.

    Args:
        df: Frame containing the ``target`` column.
        target: Name of the class-label column (defaults to ``"Group"``).

    Returns:
        A frame with one row per class and the columns ``target``, ``count``
        and ``proportion``, ordered by decreasing count (ties are broken by
        class label so the output is deterministic).
    """
    summary = (
        df.group_by(target)
        .agg(pl.len().alias("count"))
        .with_columns((pl.col("count") / pl.col("count").sum()).alias("proportion"))
        # group_by does not guarantee row order, so break count ties on the label.
        .sort(["count", target], descending=[True, False])
    )
    return summary


# Print the class balance of each subset to verify that stratification worked.
reports = [
    ("\nCheck class proportions in training dataset:", training_data),
    ("\nCheck class proportions in test dataset:", test_data),
]
for heading, subset in reports:
    print(heading)
    print(check_class_proportions(subset))

Split the data in Train and Test subsets.

Check class proportions in training dataset:
shape: (2, 3)
┌───────┬───────┬────────────┐
│ Group ┆ count ┆ proportion │
│ ---   ┆ ---   ┆ ---        │
│ str   ┆ u32   ┆ f64        │
╞═══════╪═══════╪════════════╡
│ HGG   ┆ 233   ┆ 0.792517   │
│ LGG   ┆ 61    ┆ 0.207483   │
└───────┴───────┴────────────┘

Check class proportions in test dataset:
shape: (2, 3)
┌───────┬───────┬────────────┐
│ Group ┆ count ┆ proportion │
│ ---   ┆ ---   ┆ ---        │
│ str   ┆ u32   ┆ f64        │
╞═══════╪═══════╪════════════╡
│ HGG   ┆ 59    ┆ 0.797297   │
│ LGG   ┆ 15    ┆ 0.202703   │
└───────┴───────┴────────────┘


Feature scaling (fit the scaler on the training data only, so that no test-set statistics leak into the preprocessing):

In [None]:
# Standardise the Float64 feature columns; the scaler is fitted on the
# training subset only.
std_scaler = StandardScaler()

training_data_numerical = training_data.select(pl.col(pl.Float64))
scaled_values = std_scaler.fit_transform(training_data_numerical)
# Rebuild a polars frame with the original column names. `orient="row"` is
# passed explicitly: without it polars infers the orientation by comparing the
# schema length with the array dimensions, which would transpose the data if
# the number of rows ever equalled the number of features.
training_data_numerical_scaled = pl.DataFrame(
    scaled_values,
    schema=training_data_numerical.columns,
    orient="row",
)

# Visual sanity check on the first few scaled features.
# NOTE(review): `.plot` relies on polars' optional plotting backend being
# installed — confirm it is part of the project dependencies.
max_features_to_plot = 3
for feature in training_data_numerical_scaled.columns[:max_features_to_plot]:
    training_data_numerical_scaled[feature].plot.hist().show()