In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Summary: This dataset comprises features extracted from multiple magnetic resonance imaging (MRI) contrasts for a group of patients suffering from a specific type of brain tumor called Glioblastoma. The objective of this study is to identify imaging biomarkers that could inform on different aspects of the disease such as specific mutation status of the tumor (otherwise only accessible using an invasive biopsy) or overall survival of these patients. The features have already been extracted from the images and refer to ‘radiomic features’ characterising the image in the form of 145 characteristics of how the tumor appears on the image of a particular contrast. Such characteristics refer for example to the intensity distribution of the tumor image (mean/max/min, shape of the distribution etc) and are provided for a variety of imaging contrast such as T1, T2, FLAIR, and diffusion imaging.

In [3]:
!git clone https://github.com/A33ana/FDSG1Brainiacs.git

Cloning into 'FDSG1Brainiacs'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (6/6), 20.71 MiB | 4.27 MiB/s, done.


In [4]:
!unzip /content/FDSG1Brainiacs/Project_UPENN.zip

Archive:  /content/FDSG1Brainiacs/Project_UPENN.zip
   creating: Project_UPENN/
  inflating: Project_UPENN/.DS_Store  
  inflating: __MACOSX/Project_UPENN/._.DS_Store  
  inflating: Project_UPENN/UPENN-GBM_CaPTk_fe_params.csv  
  inflating: __MACOSX/Project_UPENN/._UPENN-GBM_CaPTk_fe_params.csv  
  inflating: Project_UPENN/clinFeatures_UPENN.csv  
  inflating: __MACOSX/Project_UPENN/._clinFeatures_UPENN.csv  
  inflating: Project_UPENN/radFeatures_UPENN.csv  
  inflating: __MACOSX/Project_UPENN/._radFeatures_UPENN.csv  
  inflating: Project_UPENN/~$ReadMe.docx  
  inflating: __MACOSX/Project_UPENN/._~$ReadMe.docx  
  inflating: Project_UPENN/ReadMe.pdf  
  inflating: __MACOSX/Project_UPENN/._ReadMe.pdf  


In [8]:
data = pd.read_csv("/content/Project_UPENN/clinFeatures_UPENN.csv")
rad_features = pd.read_csv("/content/Project_UPENN/radFeatures_UPENN.csv")
fe_params = pd.read_csv("/content/Project_UPENN/UPENN-GBM_CaPTk_fe_params.csv")

#Analyzing Data

In [17]:
display(data.head()) #seems like they are the same data
#display(rad_features.head())
print(data.shape)
#print(rad_features.shape)
print(fe_params.shape)
display(fe_params)
print(data.columns)
for i in fe_params["Comments"]:
  print(i)

Unnamed: 0,SubjectID,FLAIR_ED_Intensity_CoefficientOfVariation,FLAIR_ED_Intensity_Energy,FLAIR_ED_Intensity_InterQuartileRange,FLAIR_ED_Intensity_Kurtosis,FLAIR_ED_Intensity_Maximum,FLAIR_ED_Intensity_Mean,FLAIR_ED_Intensity_MeanAbsoluteDeviation,FLAIR_ED_Intensity_Median,FLAIR_ED_Intensity_MedianAbsoluteDeviation,...,DSC_PH_ED_GLSZM_Bins-16_Radius-1_ZoneSizeMean,DSC_PH_ED_GLSZM_Bins-16_Radius-1_ZoneSizeNonUniformity,DSC_PH_ED_GLSZM_Bins-16_Radius-1_ZoneSizeNoneUniformityNormalized,DSC_PH_ED_GLSZM_Bins-16_Radius-1_ZoneSizeVariance,DSC_PH_ED_NGTDM_Busyness,DSC_PH_ED_NGTDM_Coarsness,DSC_PH_ED_NGTDM_Complexity,DSC_PH_ED_NGTDM_Contrast,DSC_PH_ED_NGTDM_Strength,DSC_PH_ED_LBP_Radius-1_Bins-16_LBP
0,UPENN-GBM-00001_11,0.132367,496350786,28,2.614272,223,150.365238,-7.726173e-13,150,0.365238,...,59.930556,59.666667,0.165741,118680.7,159.674492,0.0001,544.462199,0.334864,0.012338,199744.8
1,UPENN-GBM-00002_11,0.183761,5059094750,50,2.191867,255,177.564146,-7.61875e-12,178,-0.435854,...,98.927342,273.911408,0.174577,1499080.0,1015.313449,1.3e-05,545.886998,0.197933,0.002102,1245890.0
2,UPENN-GBM-00003_11,0.178541,1285339782,48,2.011637,236,160.387047,-2.585516e-12,164,-3.612953,...,62.805447,153.547341,0.199153,690288.7,446.205555,3.2e-05,528.910262,0.104093,0.009445,401497.2
3,UPENN-GBM-00004_11,0.17811,604801952,36,2.666624,221,133.854148,7.141096e-13,135,-1.145852,...,46.08169,137.661972,0.19389,77845.22,257.02345,7e-05,527.780251,0.632939,0.006835,292316.8
4,UPENN-GBM-00005_11,0.161688,957903597,29,2.177048,171,117.705824,-8.040465e-13,116,1.705824,...,123.629358,103.957798,0.190748,522599.2,399.58187,2.4e-05,690.501983,0.801415,0.002235,503819.4


(611, 4753)
(28, 6)


Unnamed: 0,FeatureName,ParamName,Type,Range,Default,Comments
0,Generic,Quantization_Extent,String,Image:ROI,ROI,Whether the quantization of Intensities is sup...
1,Generic,Quantization_Type,String,FixedBinNumber:FixedBinSize:Equal,FixedBinNumber,FixedBinNumber (FBN): the bins are uniformly d...
2,Generic,Resampling,mm,0:10,1.0,Resamples all images and masks to this value o...
3,Generic,ResamplingInterpolator_Image,mm,Nearest:Linear:BSpline,Linear,Type of interpolator to use if resampling is h...
4,Generic,ResamplingInterpolator_Mask,mm,Nearest:NearestLabel:Linear:BSpline,Nearest,Type of interpolator to use if resampling is h...
5,Generic,SliceComputation,Int,(0:1),0,Controls whether non-Intensity features are ca...
6,Generic,NaN-Handling,String,Keep:Remove,Keep,Specify how to handle features with NaN values...
7,Intensity,,,,,
8,Morphologic,Range,Int,(0:25),0,0:largest connected component in ROI; N: all c...
9,Morphologic,Feret,Int,(0:1),0,Whether or not to calculate the Feret Diameter...


Index(['SubjectID', 'FLAIR_ED_Intensity_CoefficientOfVariation',
       'FLAIR_ED_Intensity_Energy', 'FLAIR_ED_Intensity_InterQuartileRange',
       'FLAIR_ED_Intensity_Kurtosis', 'FLAIR_ED_Intensity_Maximum',
       'FLAIR_ED_Intensity_Mean', 'FLAIR_ED_Intensity_MeanAbsoluteDeviation',
       'FLAIR_ED_Intensity_Median',
       'FLAIR_ED_Intensity_MedianAbsoluteDeviation',
       ...
       'DSC_PH_ED_GLSZM_Bins-16_Radius-1_ZoneSizeMean',
       'DSC_PH_ED_GLSZM_Bins-16_Radius-1_ZoneSizeNonUniformity',
       'DSC_PH_ED_GLSZM_Bins-16_Radius-1_ZoneSizeNoneUniformityNormalized',
       'DSC_PH_ED_GLSZM_Bins-16_Radius-1_ZoneSizeVariance',
       'DSC_PH_ED_NGTDM_Busyness', 'DSC_PH_ED_NGTDM_Coarsness',
       'DSC_PH_ED_NGTDM_Complexity', 'DSC_PH_ED_NGTDM_Contrast',
       'DSC_PH_ED_NGTDM_Strength', 'DSC_PH_ED_LBP_Radius-1_Bins-16_LBP'],
      dtype='object', length=4753)
Whether the quantization of Intensities is supposed to happen on a per-image basis or on a per-ROI basis
FixedBinNumb

In [7]:
display(data.describe())

Unnamed: 0,FLAIR_ED_Intensity_CoefficientOfVariation,FLAIR_ED_Intensity_Energy,FLAIR_ED_Intensity_InterQuartileRange,FLAIR_ED_Intensity_Kurtosis,FLAIR_ED_Intensity_Maximum,FLAIR_ED_Intensity_Mean,FLAIR_ED_Intensity_MeanAbsoluteDeviation,FLAIR_ED_Intensity_Median,FLAIR_ED_Intensity_MedianAbsoluteDeviation,FLAIR_ED_Intensity_Minimum,...,DSC_PH_ED_GLSZM_Bins-16_Radius-1_ZoneSizeMean,DSC_PH_ED_GLSZM_Bins-16_Radius-1_ZoneSizeNonUniformity,DSC_PH_ED_GLSZM_Bins-16_Radius-1_ZoneSizeNoneUniformityNormalized,DSC_PH_ED_GLSZM_Bins-16_Radius-1_ZoneSizeVariance,DSC_PH_ED_NGTDM_Busyness,DSC_PH_ED_NGTDM_Coarsness,DSC_PH_ED_NGTDM_Complexity,DSC_PH_ED_NGTDM_Contrast,DSC_PH_ED_NGTDM_Strength,DSC_PH_ED_LBP_Radius-1_Bins-16_LBP
count,611.0,611.0,611.0,611.0,611.0,611.0,611.0,611.0,611.0,611.0,...,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0
mean,0.177191,1127312000.0,35.837971,3.032956,229.0982,135.454958,2.925181e-14,134.855974,0.598984,45.02455,...,58.045716,176.416912,0.205802,607586.4,427.045984,6.9e-05,589.350564,0.228271,0.010954,407506.5
std,0.038226,1031441000.0,11.801011,1.7171,26.825426,26.218925,2.643037e-12,27.521536,3.525931,26.146518,...,28.852851,81.635495,0.031226,765498.6,323.308102,0.000101,104.676187,0.154988,0.012995,268763.3
min,0.068381,9612786.0,12.0,1.639673,126.0,65.147024,-1.236613e-11,61.0,-10.643813,0.0,...,7.011331,24.636364,0.115425,253.226,12.411156,8e-06,397.386313,0.023591,0.001321,22019.67
25%,0.151365,364593400.0,27.0,2.1687,210.5,117.551357,-9.703633e-13,116.0,-1.715516,25.5,...,35.958477,116.520142,0.186067,74591.86,164.049969,2.1e-05,524.03611,0.115718,0.00421,184251.9
50%,0.178013,832762200.0,34.0,2.526522,236.0,133.769685,-2.018293e-14,133.0,1.116505,45.0,...,56.924859,169.423355,0.201311,292503.1,340.344573,3.5e-05,574.004535,0.187562,0.007113,351812.4
75%,0.200565,1562309000.0,43.0,3.2932,255.0,151.565585,7.766749e-13,152.0,2.95894,63.0,...,75.464692,218.775342,0.22111,906460.5,635.443716,7.2e-05,632.036656,0.302372,0.01215,570852.3
max,0.344314,7881971000.0,75.0,25.310021,255.0,211.227767,2.237678e-11,219.0,13.528168,142.0,...,182.59596,594.034079,0.332567,5225610.0,1601.244686,0.000897,1256.282063,1.098542,0.109703,1256777.0


In [None]:
display(data.isna().sum())

SubjectID                                      0
FLAIR_ED_Intensity_CoefficientOfVariation      0
FLAIR_ED_Intensity_Energy                      0
FLAIR_ED_Intensity_InterQuartileRange          0
FLAIR_ED_Intensity_Kurtosis                    0
                                            ... 
DSC_PH_ED_NGTDM_Coarsness                    137
DSC_PH_ED_NGTDM_Complexity                   137
DSC_PH_ED_NGTDM_Contrast                     137
DSC_PH_ED_NGTDM_Strength                     137
DSC_PH_ED_LBP_Radius-1_Bins-16_LBP           137
Length: 4753, dtype: int64