In [1]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Prepare sagittal data

In [2]:
# Vertebra labels in the order used downstream: thoracic T4-T12, then lumbar L1-L4.
_thoracic_levels = range(4, 13)
_lumbar_levels = range(1, 5)
ordered_verts = [f'T{n}' for n in _thoracic_levels] + [f'L{n}' for n in _lumbar_levels]

In [7]:
# Directory whose entries get_id() lists and get_points() reads as point files.
# NOTE(review): absolute, machine-specific path -- must be adjusted to run elsewhere.
sag_path = '/home/donal/PhD/initial_spines/CT_models/data/all_verts/'
# Text file opened by get_id(); its 'pts:img' entries sit between a '{' and a '}' line.
# NOTE(review): absolute, machine-specific path -- must be adjusted to run elsewhere.
data_list = '/home/donal/PhD/initial_spines/CT_models/data_lists/data_list_all_forviewing.txt'

In [31]:
def get_mask(pts_file):
    """
    Read the landmark coordinates stored in one points file.

    Coordinates are taken from the lines between the first line that is
    exactly '{' and the first line that is exactly '}' (after stripping
    surrounding whitespace). Each coordinate line must hold at least two
    whitespace-separated numbers: x first, then y. Any further fields on the
    line are ignored, and blank lines inside the braces are skipped.

    Parameters
    ----------
    pts_file : str or path-like
        Path of the points file to read.

    Returns
    -------
    tuple of (list of float, list of float)
        ``(x, y)``: the x coordinates and the y coordinates, in file order.

    Raises
    ------
    ValueError
        If the file has no '{' or '}' line, or a field is not a number.
    IndexError
        If a coordinate line holds fewer than two fields.
    """
    with open(pts_file, 'r') as f:
        lines = [line.strip() for line in f]
    start = lines.index('{')
    end = lines.index('}')
    x, y = [], []
    for line in lines[start + 1:end]:
        if not line:
            # Tolerate empty lines between the braces instead of failing on float('')
            continue
        # split() with no argument accepts tabs and runs of spaces as separators
        fields = line.split()
        x.append(float(fields[0]))
        y.append(float(fields[1]))
    return (x, y)

def get_id():
    """
    Build a dict mapping each ID in the data list to its point-file names.

    The data list (module-level ``data_list``) is read and the entries between
    the '{' line and the '}' line are split on ':'. The ID is the text before
    the first '.' of the left-hand part. Every entry of the ``sag_path``
    directory whose name contains that ID as a substring is collected.

    Returns
    -------
    dict
        ``{id: [file names in sag_path containing id]}``.
    """
    available = os.listdir(sag_path)
    with open(data_list, 'r') as f:
        entries = [raw.strip() for raw in f.readlines()]
    first = entries.index('{')
    last = entries.index('}')

    pts_files = {}
    for entry in entries[first + 1:last]:
        # Each entry must contain exactly one ':'; only the left-hand part is used
        pts_name, _img_name = entry.split(':')
        id_ = pts_name.split('.')[0]
        # Plain substring match against every file name in the directory
        pts_files[id_] = [fname for fname in available if id_ in fname]
    return pts_files

def get_points(pts_files):
    """
    Load and organise the landmark annotations for every ID.

    Parameters
    ----------
    pts_files : dict
        ``{id: [point-file names]}``, as returned by ``get_id``.

    Returns
    -------
    dict
        ``{f'{id}_kj': {vertebra: points}}`` where ``points`` is whatever
        ``get_mask`` returns for that file. An ID with no files maps to an
        empty dict.
    """
    annotations = {}  # All landmark annotations, keyed by '<id>_kj' then by vertebra
    for scan_id, filenames in pts_files.items():
        per_vertebra = {}
        annotations[f'{scan_id}_kj'] = per_vertebra
        for filename in filenames:
            stem = os.path.splitext(filename)[0]
            # Cut the stem into chunks that each start at an upper-case letter
            # (or at the first alphanumeric character); the vertebra label is the
            # last chunk, up to its first underscore.
            chunks = re.findall('[0-9a-zA-Z][^A-Z]*', stem)
            vertebra = chunks[-1].split('_')[0]
            per_vertebra[vertebra] = get_mask(os.path.join(sag_path, filename))
    return annotations


In [25]:
# Collect the point-file names per ID, then load the annotations for each ID.
pts_files = get_id()
print(f'Found {len(pts_files)} points files')
mask_dict = get_points(pts_files)
# NOTE(review): this counts every key of mask_dict; nothing here checks that a
# patient's vertebra set is complete.
print(f'Found {len(mask_dict)} patients w. full vert. annotations.')

Found 402 points files
Found 402 patients w. full vert. annotations.


In [44]:
# Peek at the first key to check the naming scheme
list(mask_dict)[0]

'01_06_2014_363_Sag_midline_ij_5mm_flop_kj'

In [28]:
from sklearn.preprocessing import LabelBinarizer

In [52]:
def filter_masks(mask_dict, cor_points='../formatted_pts.csv', cor_dir='../data/parent_data/coronal_midline/mips/'):
    """
    Keep only the annotations that have a matching coronal midline file.

    An index entry of the ``cor_points`` CSV counts as matched when, after
    removing '_kj' from it, it equals the name (text before the first '.') of
    an entry in ``cor_dir``. The number of matched index entries is printed.

    Parameters
    ----------
    mask_dict : dict
        Annotations keyed by file name. Assumed to use the same names as the
        CSV index (e.g. ending in '_kj') -- TODO confirm against the CSV.
    cor_points : str
        Path to the CSV of coronal points; its first column is used as index.
    cor_dir : str
        Directory holding the coronal midline images.

    Returns
    -------
    dict
        New dict with the entries of ``mask_dict`` whose key is a matched
        index entry. ``mask_dict`` itself is not modified.
    """
    # Remove files that don't have matching coronal midline annotations
    # Since we can't extract sagittal midline
    df = pd.read_csv(cor_points, index_col=0, header=0)
    # A set gives constant-time membership tests in the loops below
    coronal_files = {file.split('.')[0] for file in os.listdir(cor_dir)}
    files = [file for file in df.index if file.replace('_kj', '') in coronal_files]
    print(len(files))
    matched = set(files)
    out_dict = {key: val for key, val in mask_dict.items() if key in matched}
    return out_dict

In [53]:
# Gather every vertebra label that occurs in the annotations
vert_list = []
for val in mask_dict.values():
    vert_list.extend(list(val.keys()))
all_verts = list(np.unique(vert_list))
# Convert vertebra names to one-hot
enc = LabelBinarizer()
enc.fit(all_verts)
# NOTE(review): this replaces the classes learned by fit() with ordered_verts, so
# the encoding follows that order; labels missing from ordered_verts are
# presumably not encoded -- confirm this is intended.
enc.classes_ = ordered_verts
# Remove files that don't have corresponding midline annotations.
# The filtered dict is bound to a new name so the result is not discarded and
# mask_dict stays unchanged if this cell is re-run.
filtered_mask_dict = filter_masks(mask_dict)

300
