In [193]:
%matplotlib inline
import pandas as pd
import numpy as np

In [194]:
# Read the raw GBM export into `df`; the cells below rename, clean and extend this frame.
raw_csv_path = 'GBM_analysis.csv'
df = pd.read_csv(raw_csv_path)

* Sex = 0 = M, 1 = F
* DxAge = Age at Diagnosis
* Time = How many days they survived from diagnosis
* Status = alive or dead, 0 = alive
* SurgicalAim = B = biopsy, DB = debulking, PR = Partial Resection, GTR = Gross Total Resection (ideally we change PR to STR, subtotal resection)
* Tside = 0 = left, 1 = right
* Tlocation = location of tumour, thal = thalamus, in = insula, fron = frontal, pari = parietal, temp = temporal, occi = occipital, ext = external capsule, corp = corpus callosum, cere = cerebellum
* IDHSatua = IDH-1 Status, 1 = +ve
* MeanMGMT = Mean MGMT level
* MGMT>10 = whether Mean MGMT is >10 - standard used by Addies
* Pspreop = Performance status pre-op
* Pspostop = Performance status post-op
* DurationSx = Duration of symptoms prior to presentation
* SxDeficit = Neurological deficit
* SxHeadache = Headache
* SxSeizure = Seizure
* Chemo = 0 = none, 1 = TMZ (all patients having 60 Gy RT will have chemo)
* RTCourse = N = No, 30, 40, 60

In [195]:
# Replace the raw CSV headers with snake_case names.
# The assignment is positional: the order of this list has to match the
# column order of the file exactly, and the lengths must agree.
cols = [
    # patient
    'pt_number', 'sex', 'diagnosis_age',
    # outcome
    'days_survived', 'survived_18months', 'dead',
    # surgery and tumour
    'treatment', 'gross_total_resection', 'tumor_side', 'tumor_location',
    # molecular markers
    'idh_status', 'mgmt_mean', 'mgmt_threshold',
    # performance status before / after the operation
    'ps_preop', 'ps_postop',
    # presenting symptoms
    'sx_duration', 'sx_deficit', 'sx_headache', 'sx_seizure',
    # chemotherapy / radiotherapy
    'chemo', 'radiotherapy',
]
df.columns = cols
df.head()

Unnamed: 0,pt_number,sex,diagnosis_age,days_survived,survived_18months,dead,treatment,gross_total_resection,tumor_side,tumor_location,...,mgmt_mean,mgmt_threshold,ps_preop,ps_postop,sx_duration,sx_deficit,sx_headache,sx_seizure,chemo,radiotherapy
0,487,0,67,3,N,1,B,0,0,thal,...,,,2,,10,1,0,0,0,N
1,228,0,66,5,N,1,B,0,0,in,...,,,1,4.0,91,0,0,1,0,N
2,480,0,61,5,N,1,B,0,0,in,...,,,0,,14,0,1,0,0,N
3,186,0,64,7,N,1,GTR,1,1,fron,...,64.0,1.0,1,,28,1,0,0,0,N
4,239,0,27,8,N,1,B,0,1,pari,...,,,4,4.0,56,1,0,0,0,N


In [196]:
# Discard the columns that are not carried forward, then remove every row
# that lacks either outcome field.
unused_columns = ['pt_number', 'gross_total_resection', 'mgmt_threshold', 'survived_18months']
outcome_columns = ['days_survived', 'dead']
df = df.drop(unused_columns, axis=1).dropna(subset=outcome_columns)

# Missing IDH status and missing radiotherapy entries are filled with 0.
for column in ('idh_status', 'radiotherapy'):
    df[column] = df[column].fillna(0)

In [197]:
# Decode the coded columns into readable labels.
# Each entry is (column, {raw code: replacement}); entries are applied in
# the order listed.
label_maps = [
    ('sex', {0: 'male', 1: 'female'}),
    ('tumor_side', {0: 'left', 1: 'right'}),
    ('treatment', {
        'B': 'Biopsy',
        'GTR': 'Gross Total Resection',
        'DB': 'Debulking',
        'PR': 'Subtotal Resection',
    }),
    ('tumor_location', {
        'thal': 'Thalamus',
        'in': 'Insula',
        'fron': 'Frontal lobe',
        'temp': 'Temporal lobe',
        'occi': 'Occipital lobe',
        'ext': 'External capsule',
        'corp': 'Corpus callosum',
        'pari': 'Parietal lobe',
        'cere': 'Cerebellum',
    }),
    # 'N' (no radiotherapy) becomes 0 and the remaining codes become integers.
    ('radiotherapy', {'N': 0, '30': 30, '40': 40, '60': 60}),
    # Second pass over tumor_side for side codes that are stored as text.
    ('tumor_side', {'0': 'left', '1': 'right'}),
]
for column, mapping in label_maps:
    df[column] = df[column].replace(mapping)

In [198]:
# Collapse the three symptom flags into a single label per row.
# The flags are applied in sequence, so a later one overwrites an earlier
# one: seizure takes precedence over headache, which takes precedence over
# deficit. Rows with none of the flags equal to 1 keep the empty string.
df['sx_main'] = ''
for symptom in ('deficit', 'headache', 'seizure'):
    has_symptom = df['sx_' + symptom] == 1.0
    df.loc[has_symptom, 'sx_main'] = symptom

# The individual flags are no longer kept once sx_main has been derived.
df = df.drop(['sx_deficit', 'sx_headache', 'sx_seizure'], axis='columns')

In [199]:
# Render floats as whole numbers with thousands separators in every table
# shown from here on (fractions are rounded away in the display only), then
# list the dtype each column ended up with.
pd.set_option('display.float_format', '{:,.0f}'.format)
df.dtypes

sex                object
diagnosis_age       int64
days_survived     float64
dead              float64
treatment          object
tumor_side         object
tumor_location     object
idh_status        float64
mgmt_mean         float64
ps_preop          float64
ps_postop         float64
sx_duration       float64
chemo               int64
radiotherapy        int64
sx_main            object
dtype: object

In [200]:
# days_survived, dead and idh_status no longer contain missing values
# (rows were dropped / gaps were filled earlier), so store them as integers.
# ps_preop, ps_postop and sx_duration are deliberately left as float64:
# they still contain missing values, which int64 cannot represent.
df = df.astype({
    'days_survived': 'int64',
    'dead': 'int64',
    'idh_status': 'int64',
})

In [201]:
# Spot-check five cleaned rows. The seed is fixed so that re-running the
# notebook shows the same rows instead of a fresh random draw each time.
df.sample(5, random_state=0)

Unnamed: 0,sex,diagnosis_age,days_survived,dead,treatment,tumor_side,tumor_location,idh_status,mgmt_mean,ps_preop,ps_postop,sx_duration,chemo,radiotherapy,sx_main
18,male,83,28,1,Biopsy,left,Frontal lobe,0,18,,2,42,0,0,deficit
477,female,73,1113,1,Subtotal Resection,right,Frontal lobe,0,46,2.0,2,14,1,60,deficit
305,male,76,298,1,Gross Total Resection,right,Occipital lobe,0,5,1.0,3,1,1,60,deficit
270,female,63,250,1,Biopsy,left,Frontal lobe,0,42,0.0,0,7,1,60,deficit
79,female,75,74,0,Debulking,left,Frontal lobe,0,47,,2,1,0,30,deficit


In [202]:
# Survival flags: 1 when days_survived reaches the cutoff, otherwise 0.
# A month is counted as 30 days.
# NOTE(review): a row with dead == 0 and days_survived below a cutoff is
# also coded 0, although its status at that cutoff is not recorded here -
# confirm this is the intended treatment of such rows.
days_per_month = 30
survival_cutoffs = (
    ('alive_6months', 6),
    ('alive_12months', 12),
    ('alive_18months', 18),
)
for flag, months in survival_cutoffs:
    df[flag] = (df['days_survived'] >= days_per_month * months).astype('int64')



In [203]:
# Summary statistics for the numeric columns, computed over all rows
# (each column's count reflects its own non-missing values).
df.describe()

Unnamed: 0,diagnosis_age,days_survived,dead,idh_status,mgmt_mean,ps_preop,ps_postop,sx_duration,chemo,radiotherapy,alive_6months,alive_12months,alive_18months
count,489,489,489,489,464,429,403,421,489,489,489,489,489
mean,62,296,1,0,19,1,1,39,0,36,1,0,0
std,11,267,0,0,20,1,1,76,1,25,0,0,0
min,18,3,0,0,1,0,0,0,0,0,0,0,0
25%,55,107,1,0,4,0,1,7,0,0,0,0,0
50%,63,224,1,0,7,1,1,14,0,40,1,0,0
75%,70,384,1,0,35,1,2,42,1,60,1,1,0
max,84,1585,1,1,93,4,4,730,1,60,1,1,1


In [204]:
# Write the cleaned frame to disk without the row index.
# NOTE(review): the input was read from the working directory, but this
# writes under data/ - that directory has to exist already.
clean_csv_path = 'data/GBM_clean.csv'
df.to_csv(clean_csv_path, index=False)

In [205]:
# Same summary restricted to complete cases: any row with a missing value
# in any column is dropped before the statistics are computed.
df.dropna().describe()

Unnamed: 0,diagnosis_age,days_survived,dead,idh_status,mgmt_mean,ps_preop,ps_postop,sx_duration,chemo,radiotherapy,alive_6months,alive_12months,alive_18months
count,304,304,304,304,304,304,304,304,304,304,304,304,304
mean,61,326,1,0,20,1,1,34,1,41,1,0,0
std,12,270,0,0,20,1,1,61,0,23,0,0,0
min,19,11,0,0,2,0,0,0,0,0,0,0,0
25%,54,133,1,0,4,0,0,7,0,30,0,0,0
50%,63,256,1,0,7,1,1,14,1,60,1,0,0
75%,69,442,1,0,36,1,2,42,1,60,1,1,0
max,84,1585,1,1,93,4,4,730,1,60,1,1,1
