# Data Exploration

author = Caroline Magg <br>
date = 31 March 2020 <br>

___________________________________
history: <br>
2020-03-31 <br>
inspect single folder content <br>
write methods for reading structure, contour names and contour content <br>
2020-04-02 <br>
generate list of contours <br>
2020-04-12 <br>
change utils_read to utils_explore and adapt to folder structure <br>
2020-09-16 <br>
generate list of contours for all folders <br>

In [None]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import cv2
import pydicom
import vtk

In [None]:
%matplotlib inline

### Add paths and dependencies
This can vary depending on your environment

In [None]:
# Make the project root (two levels up) and the sibling utils folder importable,
# so that `utils_explore` can be imported by the cells below.
sys.path.extend(
    os.path.abspath(relative_dir) for relative_dir in ('../../', '../utils/')
)

In [None]:
# Root directory of the data, relative to this notebook; adjust to your environment.
# The cells below expect one sub-folder per number in here ('1', '2', ...).
path_data = "../../Data/" 

In [None]:
from utils_explore import read_structure,read_contour,read_contour_names,read_contour_row

# Inspect single folder content

In [None]:
# Select one numbered folder (idx) and list its content.
# `idx` is also used by the CT / MRI / contour cells below.
idx = 1
folder_name = os.path.join(path_data, str(idx))
folder = os.listdir(folder_name)
# trailing expression: shown as the cell output
folder

### CT

In [None]:
# List the 'CT' sub-folder of the selected folder and preview the first five entries.
# os.listdir returns names in arbitrary order, so the preview is not necessarily sorted.
folder_ct_name = os.path.join(path_data, str(idx),'CT')
folder_ct = os.listdir(folder_ct_name)
folder_ct[:5]

In [None]:
# Read the CT folder with read_structure (from utils_explore) and show the size of the result.
# presumably a DataFrame with one row per file, judging by the name - TODO confirm in utils_explore
df_ct = read_structure(folder_ct_name)
len(df_ct)

### MRI

In [None]:
# List the MRI sub-folder of the selected folder and preview the first five entries.
# NOTE(review): the sub-folder name is hard-coded and looks specific to this folder
# (it contains a date) - confirm it exists before changing idx.
folder_mri_name = os.path.join(path_data, str(idx),'T1 +C 3-15-16')
folder_mri = os.listdir(folder_mri_name)
folder_mri[:5]

In [None]:
# Read the MRI folder with read_structure (from utils_explore) and show the size of the result.
# presumably a DataFrame with one row per file, judging by the name - TODO confirm in utils_explore
df_mri = read_structure(folder_mri_name)
len(df_mri)

### Contours

In [None]:
# Path of the contour file of the selected folder.
# NOTE(review): the file name is hard-coded for this folder; the all-folders loop
# further down instead looks for a file whose name contains 'RS'.
path_contours = os.path.join(path_data, str(idx), 'RS.Jacks1.dcm')

In [None]:
# Read the contour names of the file with read_contour_names (from utils_explore)
# and show how many entries it returned. The result is used as a table with a
# 'RoiName' column further down in this notebook.
df_contours = read_contour_names(path_contours)
len(df_contours)

In [None]:
# Read the contour content for the names listed in df_contours and show the size of the result.
# presumably one entry per contour - TODO confirm against read_contour in utils_explore
contours = read_contour(path_contours, df_contours)
len(contours)

# Inspect 20 folders and generate contours list

In [None]:
# Collect the contour-name table of every numbered data folder 1..n_folders.
# The position in `contours_all` is (folder number - 1); the cell that builds
# `df_all` relies on this to recover the folder number.
# NOTE: the loop reuses the notebook-global `idx`, so after this cell `idx` is
# n_folders and no longer the folder selected for the single-folder cells above.
n_folders = 20
contours_all = []
for idx in range(1, n_folders + 1):
    folder_name = os.path.join(path_data, str(idx))
    # the contour file is found by 'RS' in its name (cf. 'RS.Jacks1.dcm' above)
    rs_file = [x for x in os.listdir(folder_name) if 'RS' in x]
    if not rs_file:
        # name the offending folder instead of failing with a bare IndexError
        raise FileNotFoundError(
            "no file with 'RS' in its name found in {}".format(folder_name)
        )
    # if several files match, the first one returned by os.listdir is used
    path_contours = os.path.join(folder_name, rs_file[0])
    contours_all.append(read_contour_names(path_contours))
len(contours_all)

In [None]:
# Build one table of the unique contour (ROI) names over all folders.
# 'Count' holds the list of 1-based folder numbers in which a name occurs
# (position in `contours_all` + 1); all other columns are taken from the first
# folder in which the name was seen. df_all gets a fresh 0..n-1 index.
#
# Rows are gathered in a plain dict and converted to a DataFrame once:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# growing a frame row by row copies it on every iteration.
base_columns = ['ID', 'RoiNumber', 'RoiName', 'Count']
unique_rois = {}  # RoiName -> row dict; insertion order = first-seen order
for folder_number, df_single in enumerate(contours_all, start=1):
    # iterrows does not depend on the frame having a default 0..n-1 index
    for _, row in df_single.iterrows():
        roi_name = row['RoiName']
        if roi_name not in unique_rois:
            record = row.to_dict()
            record['Count'] = []
            unique_rois[roi_name] = record
        # a name listed twice within one folder is recorded twice, as before
        unique_rois[roi_name]['Count'].append(folder_number)

df_all = pd.DataFrame(list(unique_rois.values()))
# Keep the fixed leading column order (and the columns themselves when no
# contour was found at all); any extra columns of the source tables follow.
extra_columns = [c for c in df_all.columns if c not in base_columns]
df_all = df_all.reindex(columns=base_columns + extra_columns)

In [None]:
len(df_all)

In [None]:
df_all

In [None]:
# Show the sorted unique contour names (np.unique returns the unique values in sorted order).
np.unique(df_all['RoiName'])

In [None]:
# Write the table to the current working directory as a ';'-separated CSV without the index column.
# The 'Count' lists are written in their string form.
df_all.to_csv('all_unique_contours_all_folders.csv',index=False, sep=';')

### Overview of structure occurrences for the report

In [None]:
# Number of recorded occurrences per unique structure (length of its 'Count' list)
counts = list(map(len, df_all['Count']))

In [None]:
# Histogram of the occurrence counts with integer bin edges 1..21, i.e. 20 unit-wide bins.
# np.histogram returns (counts per bin, bin edges); the last bin also includes its right edge.
h = np.histogram(counts, bins=range(1,22))
# show the result together with the lengths of both arrays (20 counts, 21 edges)
h, len(h[0]), len(h[1])

In [None]:
# Print one line per bin: left bin edge (occurrence value) followed by the
# number of structures in that bin. zip stops after the 20 bin counts, so the
# final right edge is not printed.
for o, c in zip(h[1], h[0]):
    print(o, c)

In [None]:
h

In [None]:
# Hand-typed sum, presumably of selected histogram bin counts for the report - TODO confirm which bins
149+16+10+11+7

In [None]:
# Hand-typed sum, presumably of selected histogram bin counts for the report - TODO confirm which bins
43+3+2

In [None]:
# Histogram for the report: number of structures (y) per number of occurrences (x),
# using the same 20 unit-wide bins (edges 1..21) as the np.histogram cell.
occurrence_values = list(range(1, 21))
plt.hist(counts, bins=range(1,22))
# One tick centred under each bar, labelled with its occurrence value. Tick
# locations and labels must have the same length: the previous 21 labels for
# 20 locations raise a ValueError in current matplotlib.
plt.xticks([x + 0.5 for x in occurrence_values], occurrence_values)
plt.xlabel('# Occurrence')
plt.ylabel('# Structures')
plt.title('Occurrences of structures in dataset')
plt.show()