# Imports

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.graph_objects as go

# Load Data

In [3]:
# Folder that holds the exported CSV files, given relative to the working directory.
path_folder = Path("../data")

In [4]:
# Collect every CSV in the data folder whose file name starts with "all_".
files = [*path_folder.glob('all_*.csv')]

In [None]:
# Show the matched paths; later cells pick a file from this list by position.
files

# Subject Data

In [6]:
# Load the subject export by file name rather than by its position in `files`.
# The glob result has no guaranteed ordering, so an index such as files[3] can
# resolve to a different CSV between machines or runs (the listing printed
# later in this notebook shows all_subjects.csv at position 2, not 3).
subject_df = pd.read_csv(path_folder / 'all_subjects.csv')
# 'Unnamed: 0' is presumably a row index saved along with the CSV - TODO confirm.
subject_df = subject_df.drop(columns='Unnamed: 0')

In [None]:
# Preview the first rows of the subject table.
subject_df.head()

In [None]:
# List the column names available in the subject table.
subject_df.columns

In [None]:
# Summary statistics for the subject table, as produced by pandas' describe().
subject_df.describe()

In [8]:
# Report the number of rows currently in the subject table.
print(f"Total patients: {len(subject_df)}")

Total patients: 1451


In [3]:
# Count the non-null subject IDs recorded for each GenderV2 value.
subject_df.groupby('GenderV2')[['ID']].count()

In [9]:
# Keep only the subjects for which a `surv` value is present.
has_survival = subject_df['surv'].notna()
subject_df = subject_df[has_survival]

## Filter

In [10]:
# Distinct values of the `group` column, in order of first appearance.
group_column = subject_df['group']
hospitals = group_column.unique().tolist()

**Filter out low-grade tumours (groups labelled LGG)**

In [12]:
# Keep every group whose label does not contain 'LGG'. Labels are passed
# through str() first so that non-string entries can be tested as well.
filtered_groups = list(filter(lambda label: 'LGG' not in str(label), hospitals))

In [14]:
# Restrict the cohort to subjects whose group passed the LGG filter.
# .copy() makes the result an independent frame: the following cells assign
# columns on it, and doing that on a bare slice of subject_df triggers
# SettingWithCopyWarning and leaves it unclear which frame is being modified.
group_is_kept = subject_df['group'].isin(filtered_groups)
all_subjects_filtered = subject_df.loc[group_is_kept, :].copy()

In [15]:
# Report how many subjects remain after the group filter.
print(f"Total patients: {len(all_subjects_filtered)}")

Total patients: 1239


In [16]:
# Store each row's current index label in an explicit 'index' column;
# later cells count this column per group.
row_labels = all_subjects_filtered.index.tolist()
all_subjects_filtered.loc[:, 'index'] = row_labels

In [17]:
# Replace every `group` entry that is not a str with the label 'Unknown'.
group_labels = all_subjects_filtered['group'].apply(
    lambda entry: entry if type(entry) is str else 'Unknown'
)
all_subjects_filtered.loc[:, 'group'] = group_labels

# Descriptives

## Number of patients per hospital

In [18]:
# Discard the subjects whose group is labelled 'Unknown'.
known_group = all_subjects_filtered['group'].ne('Unknown')
all_subjects_filtered = all_subjects_filtered[known_group]

In [19]:
# Number of subjects per group, exposed in a single column named 'count'.
group_count = (
    all_subjects_filtered
    .groupby('group')[['index']]
    .count()
    .rename(columns={'index': 'count'})
)

In [21]:
# Bar chart of subjects per hospital, largest group first.
# Sorting the (count, name) pairs in reverse orders by count and breaks ties
# by name, both descending; x and y are then read from the same ranking.
ranked = sorted(zip(group_count['count'], group_count.index), reverse=True)
sortx = [hospital for _, hospital in ranked]
sorty = [count for count, _ in ranked]

trace = go.Bar(x=sortx, y=sorty)
layout = go.Layout(title='Subjects per hospital',
                   title_x=0.5, xaxis_title='Hospital',
                   yaxis_title='Subject Count')

fig = go.Figure([trace], layout=layout)
# End the cell with the figure so it is actually rendered; without this the
# chart is built and then discarded.
fig

## Exclusion Reasons

In [23]:
# Tabulate how many subjects carry each exclusion reason.
has_reason = all_subjects_filtered['ExclusionReason'].notnull()
(
    all_subjects_filtered.loc[has_reason, ['ExclusionReason', 'group']]
    .groupby('ExclusionReason')
    .count()
    .rename(columns={'group': 'count'})
)

Unnamed: 0_level_0,count
ExclusionReason,Unnamed: 1_level_1
gliosarcoma,2
infratentorialGBM,8
noCElesion,5
noClinVars,1
noMRI,100
noUsableMRI,6
notIncluded,3
reORGBM,23
secondaryGBM,30
surgElsewhere,18


In [24]:
# Index labels of all subjects that have an exclusion reason filled in.
excluded_subjects_indx = all_subjects_filtered.index[
    all_subjects_filtered['ExclusionReason'].notnull()
]

In [25]:
# Remove the rows of the excluded subjects from the cohort.
all_subjects_filtered = all_subjects_filtered.drop(index=excluded_subjects_indx)

In [26]:
# Renumber the rows 0..n-1 after the drops above.
# drop=True discards the old index directly. The previous form materialised
# it as a 'level_0' column (a name pandas only uses because an 'index' column
# already exists) and dropped it again, which fails with a KeyError whenever
# that 'index' column is absent.
all_subjects_filtered = all_subjects_filtered.reset_index(drop=True)

In [27]:
# Report the cohort size after removing excluded subjects.
print(f"Total Patients: {len(all_subjects_filtered)}")

Total Patients: 1043


In [43]:
# Recount the patients per group; the counts stay in the column named 'index'.
group_count = all_subjects_filtered.groupby('group')[['index']].count()

In [1]:
# Bar chart of patients per hospital, largest group first.
# The (count, name) pairs are ranked once in descending order and both axes
# are read from that ranking.
ranked = sorted(zip(group_count['index'], group_count.index), reverse=True)
sortx = [hospital for _, hospital in ranked]
sorty = [count for count, _ in ranked]

fig = go.Figure(
    [go.Bar(x=sortx, y=sorty)],
    layout=go.Layout(
        title='Patients per hospital',
        title_x=0.5,
        xaxis_title='Hospital',
        yaxis_title='Patient Count',
    ),
)
fig

## Fix missing age of some patients

**Convert SurgeryDate to datetime**

In [31]:
# Parse the SurgeryDate column into datetime values and write it back.
surgery_dates = pd.DatetimeIndex(all_subjects_filtered['SurgeryDate'])
all_subjects_filtered['SurgeryDate'] = surgery_dates

In [32]:
# Subjects without an age for whom it can be reconstructed: both the birth
# year and the surgery date are known.
# .copy() makes this an independent frame, because the next cells add and
# overwrite columns on it; doing that on a bare slice of the cohort raises
# SettingWithCopyWarning.
age_is_recoverable = (
    all_subjects_filtered['age'].isnull()
    & all_subjects_filtered['BirthYear'].notnull()
    & all_subjects_filtered['SurgeryDate'].notnull()
)
missing_age = all_subjects_filtered.loc[age_is_recoverable, :].copy()

In [None]:
# Extract the calendar year of surgery with the vectorised .dt accessor
# instead of a row-wise apply(lambda). assign() returns a new frame, so no
# column is written into a slice of the cohort.
missing_age = missing_age.assign(surg_year=missing_age['SurgeryDate'].dt.year)

**Calculate age at surgery**

In [34]:
# Age at surgery = surgery year - birth year.
# Both operands now come from missing_age itself. The previous form subtracted
# the BirthYear of the whole cohort and relied on implicit index alignment to
# discard every row that is not in missing_age.
missing_age.loc[:, 'age'] = missing_age['surg_year'] - missing_age['BirthYear']

**Fix age**

In [35]:
# Write the reconstructed ages back into the cohort. DataFrame.update modifies
# all_subjects_filtered in place, matching rows by index label and the column
# by the Series name ('age').
all_subjects_filtered.update(missing_age['age'])

**Drop patients with no age**

In [36]:
# Index labels of the subjects whose age is still missing.
no_age = all_subjects_filtered.index[all_subjects_filtered['age'].isnull()]

In [37]:
# Remove the subjects without an age from the cohort.
all_subjects_filtered = all_subjects_filtered.drop(index=no_age)

In [38]:
# Report the cohort size after dropping subjects without an age.
print(f"Total Patients Final: {len(all_subjects_filtered)}")

Total Patients Final: 1038


In [39]:
# Distinct subject IDs of the cohort at this point, in order of appearance.
# NOTE(review): this list is built before the `surv > 0` filter that follows
# further down, so it may still contain IDs which that filter removes -
# confirm whether that is intended.
id_column = all_subjects_filtered['ID']
complete_ids = id_column.unique().tolist()

In [None]:
# List the column names of the filtered cohort table.
all_subjects_filtered.columns

In [41]:
# Keep only the subjects whose `surv` value is strictly positive.
positive_survival = all_subjects_filtered['surv'].gt(0)
all_subjects_filtered = all_subjects_filtered[positive_survival]

In [None]:
# Summary statistics for the filtered cohort, as produced by pandas' describe().
all_subjects_filtered.describe()

In [2]:
# Count the non-null subject IDs per GenderV2 value in the filtered cohort.
all_subjects_filtered.groupby('GenderV2')[['ID']].count()

# Scans

In [103]:
# Show the matched CSV paths again, together with their positions in the list.
files

[PosixPath('../data/all_resources.csv'),
 PosixPath('../data/all_scans.csv'),
 PosixPath('../data/all_subjects.csv'),
 PosixPath('../data/all_experiments.csv')]

In [109]:
# Load the scan export by file name rather than by its position in `files`;
# the glob result has no guaranteed ordering, so files[3] is not a stable
# reference to any particular CSV.
# NOTE(review): the listing printed above shows position 3 holding
# all_experiments.csv. all_scans.csv is chosen here to match this section and
# the variable name - confirm that this is the intended file.
scans_df = pd.read_csv(path_folder / 'all_scans.csv')

In [None]:
# List the column names available in the scan table.
scans_df.columns

In [None]:
# Show the scan rows whose ID appears in the list of complete subject IDs.
id_is_complete = scans_df['ID'].isin(complete_ids)
scans_df[id_is_complete]