# Import data

In [2]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from glob import glob
import plotly.graph_objs as go
import plotly
plotly.offline.init_notebook_mode(connected=True)
import torch
import torchtuples as tt
from pycox.evaluation import EvalSurv
%load_ext autoreload
%autoreload 2

In [3]:
# Collect every CSV file located directly inside ../data as Path objects.
data_files = list(Path('../data').glob('*.csv'))

In [4]:
# Display the list of CSV paths that were found.
data_files

[PosixPath('../data/all_subjects_cleaned.csv'),
 PosixPath('../data/all_scans.csv'),
 PosixPath('../data/all_subjects.csv'),
 PosixPath('../data/final_preoperative_dataset.csv')]

In [5]:
# Load the raw subject table by file name instead of by its position in
# `data_files`: glob() yields files in arbitrary, filesystem-dependent order,
# so index 2 is not guaranteed to be all_subjects.csv on every machine.
all_subjects_df = pd.read_csv(Path('../data') / 'all_subjects.csv')

In [6]:
# Remove the "Unnamed: 0" column (presumably an index saved with the CSV --
# TODO confirm) so that only real variables remain.
all_subjects_df = all_subjects_df.drop(columns=["Unnamed: 0"])

In [None]:
# Preview the first five rows of the subject table.
all_subjects_df.head(n=5)

In [None]:
# Display the column labels available in the subject table.
all_subjects_df.columns

In [8]:
# Report how many rows (patients) the raw table contains.
n_raw_patients = len(all_subjects_df)
print("Total patients: {}".format(n_raw_patients))

Total patients: 1556


# Filter

## Filter hospitals

In [9]:
# Distinct values of the 'group' column, in order of first appearance
# (a missing value, if present, appears once).
hospitals = all_subjects_df['group'].drop_duplicates().tolist()

In [11]:
# Keep every group whose string form does not contain 'LGG'.
# str() is applied first so non-string entries (e.g. NaN) are kept as well.
filtered_groups = list(filter(lambda name: 'LGG' not in str(name), hospitals))

In [12]:
# Keep only the patients belonging to one of the retained groups.
# .copy() makes this an independent frame: later cells assign new/changed
# columns to it, which on a bare .loc slice triggers SettingWithCopyWarning
# and may not reliably write the values.
in_kept_group = all_subjects_df['group'].isin(filtered_groups)
all_subjects_filtered = all_subjects_df.loc[in_kept_group, :].copy()

In [13]:
# Report how many patients remain after the group filter.
n_after_group_filter = len(all_subjects_filtered)
print("Total patients: {}".format(n_after_group_filter))

Total patients: 1430


In [None]:
# Store each row's current index label in an explicit 'index' column so it
# can be used as a countable column later on.
row_labels = all_subjects_filtered.index.tolist()
all_subjects_filtered.loc[:, 'index'] = row_labels

In [15]:
# Replace every non-string group entry (e.g. NaN) with the label 'Unknown'.
# isinstance() is the idiomatic type test and also accepts str subclasses,
# which an exact `type(x) is str` comparison would wrongly mark as unknown.
all_subjects_filtered.loc[:, 'group'] = all_subjects_filtered['group'].apply(
    lambda x: x if isinstance(x, str) else 'Unknown'
)

## Number of subjects per hospital

In [16]:
# Discard the patients whose group was labelled 'Unknown'.
has_known_group = all_subjects_filtered['group'].ne('Unknown')
all_subjects_filtered = all_subjects_filtered.loc[has_known_group, :]

In [17]:
# Count the non-null 'index' entries per group, i.e. the subjects per group.
index_and_group = all_subjects_filtered[['index', 'group']]
group_count = index_and_group.groupby('group').count()

In [19]:
# Order the groups by subject count, largest first (ties broken by group
# name, descending). Sorting the (count, name) pairs once yields both axes.
ranked_groups = sorted(zip(group_count['index'], group_count.index), reverse=True)
sortx = [name for _, name in ranked_groups]
sorty = [count for count, _ in ranked_groups]

# Bar chart of the number of subjects in each group.
trace = go.Bar(x=sortx, y=sorty)
layout = go.Layout(title='Subjects per group',
                   title_x=0.5, xaxis_title='Hospital',
                   yaxis_title='Subject Count')

fig = go.Figure([trace], layout=layout)
# Leave the figure as the cell's last expression so the notebook renders it;
# previously the figure was built but never displayed.
fig

## Exclusion Reasons

In [21]:
# Tabulate exclusion reasons: keep only rows that have one, then count the
# non-null 'group' entries recorded under each reason.
has_reason = all_subjects_filtered['ExclusionReason'].notnull()
reason_and_group = all_subjects_filtered.loc[has_reason, ['ExclusionReason', 'group']]
reason_and_group.groupby('ExclusionReason').count()

Unnamed: 0_level_0,group
ExclusionReason,Unnamed: 1_level_1
gliosarcoma,2
infratentorialGBM,12
noCElesion,5
noClinVars,43
noMRI,110
noUsableMRI,8
notIncluded,8
pediatric,2
reORGBM,33
secondaryGBM,40


In [22]:
# Index labels of every patient that carries an exclusion reason.
reason_present = all_subjects_filtered['ExclusionReason'].notnull().to_numpy()
excluded_subjects_indx = all_subjects_filtered.index[reason_present]

In [23]:
# Remove the excluded patients (rows) from the working table.
all_subjects_filtered = all_subjects_filtered.drop(index=excluded_subjects_indx)

In [24]:
# Renumber the rows 0..n-1 and discard the old index labels.
# drop=True replaces the previous reset_index() + drop('level_0') pair, which
# only worked because an 'index' column already existed (forcing pandas to
# name the inserted column 'level_0') and raised KeyError otherwise.
# The existing 'index' column is left untouched.
all_subjects_filtered = all_subjects_filtered.reset_index(drop=True)

In [25]:
# Report how many patients remain after removing the excluded ones.
n_after_exclusion = len(all_subjects_filtered)
print("Total Patients: {}".format(n_after_exclusion))

Total Patients: 1142


In [26]:
# Recount the subjects per group now that excluded patients are gone.
index_and_group = all_subjects_filtered[['index', 'group']]
group_count = index_and_group.groupby('group').count()

In [1]:
# Order the groups by subject count, largest first (ties broken by group
# name, descending).
subject_counts = group_count['index']
count_name_pairs = sorted(zip(subject_counts, group_count.index), reverse=True)
sortx = [group_name for _, group_name in count_name_pairs]
sorty = sorted(subject_counts, reverse=True)

# Bar chart of the number of subjects in each group.
trace = go.Bar(x=sortx, y=sorty)
layout_options = dict(
    title='Subjects per group',
    title_x=0.5,
    xaxis_title='Hospital',
    yaxis_title='Subject Count',
)
layout = go.Layout(**layout_options)

fig = go.Figure(data=[trace], layout=layout)
# Last expression of the cell, so the notebook renders the figure.
fig

## Fix missing age of some patients

**Convert SurgeryDate to datetime**

In [28]:
# Parse the SurgeryDate column into datetime values and write it back.
surgery_dates = pd.DatetimeIndex(all_subjects_filtered['SurgeryDate'])
all_subjects_filtered['SurgeryDate'] = surgery_dates

In [29]:
# Patients with no recorded age whose age can still be derived, i.e. both
# BirthYear and SurgeryDate are present.
age_is_missing = all_subjects_filtered['age'].isnull()
has_birth_year = all_subjects_filtered['BirthYear'].notnull()
has_surgery_date = all_subjects_filtered['SurgeryDate'].notnull()
# .copy() gives an independent frame: later cells add columns to it, which on
# a bare .loc slice triggers SettingWithCopyWarning.
missing_age = all_subjects_filtered.loc[
    age_is_missing & has_birth_year & has_surgery_date, :
].copy()

In [None]:
# Calendar year of surgery, taken with the vectorized .dt accessor instead of
# a per-row Python lambda (SurgeryDate was converted to datetime above).
missing_age.loc[:, 'surg_year'] = missing_age['SurgeryDate'].dt.year

**Calculate age at surgery**

In [31]:
# Age at surgery = surgery year - birth year.
# Both operands now come from `missing_age`; the previous version subtracted
# BirthYear from the full table and relied on index alignment to discard the
# unrelated rows.
missing_age.loc[:, 'age'] = missing_age['surg_year'] - missing_age['BirthYear']

**Fix age**

In [32]:
# Write the derived ages back into the main table in place; update() aligns
# on the index and only overwrites with non-missing values.
derived_age = missing_age[['age']]
all_subjects_filtered.update(derived_age)

**Drop patients with no age**

In [33]:
# Index labels of the patients whose age is still missing.
age_still_missing = all_subjects_filtered['age'].isnull().to_numpy()
no_age = all_subjects_filtered.index[age_still_missing]

In [34]:
# Remove the patients (rows) without an age.
all_subjects_filtered = all_subjects_filtered.drop(index=no_age)

# Preoperative

## Important variables

In [35]:
# Binary-encode SurgeryExtend: 0 for 'biopsy', 1 for every other value
# (missing values included, since they do not equal 'biopsy').
# NOTE: re-running this cell turns every value into 1, because the encoded
# integers no longer equal 'biopsy'.
not_biopsy = all_subjects_filtered['SurgeryExtend'].ne('biopsy')
all_subjects_filtered.loc[:, 'SurgeryExtend'] = not_biopsy.astype('int64')

In [36]:
# Columns kept for the final preoperative dataset, in output order.
preop_variables = [
    'age',
    'ENTvolML',
    'ENTside',
    'GenderV2',
    'KPSpre',
    'Chemo',
    'SurgeryExtend',
    'surv',
    'DeathObserved',
    'ID',
]

**Remove patients with no survival data or whose survival is not greater than 0**

In [37]:
# Treat a missing DeathObserved flag as "death not observed" (False).
death_unrecorded = all_subjects_filtered['DeathObserved'].isna()
all_subjects_filtered.loc[death_unrecorded, 'DeathObserved'] = False

In [38]:
# Keep only patients with a recorded, strictly positive survival value and
# restrict the table to the preoperative variables.
surv_values = all_subjects_filtered['surv']
has_valid_surv = surv_values.notnull() & (surv_values > 0)
final_df = all_subjects_filtered.loc[has_valid_surv, preop_variables]

In [42]:
# Write the final dataset to a CSV file in the current working directory,
# without the row index.
output_csv = 'final_preoperative_dataset_ids.csv'
final_df.to_csv(output_csv, index=False)

In [41]:
# Report the number of patients in the final dataset.
n_final_patients = len(final_df)
print("Total Patients: {}".format(n_final_patients))

Total Patients: 1036
