In [1]:
import os
import sys
os.chdir('/local/home/dhaziza/entrack')
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="4"
sys.path.append('/local/home/dhaziza/entrack/')

import src.features as ft_def
import csv
import re
import glob
import pickle

def intvals(s):
    vals = re.findall(r'\d+', s)
    return [int(i) for i in vals]

def intval(s):
    vals = intvals(s)
    return vals[0]

def boolval(cond):
    return 1 if cond else 0

def print_stats(dataset, name, field, cond):
    subject_ids = set()
    num = 0.0
    total_value = 0.0
    for row in dataset:
        if cond(row):
            num += 1
            total_value += row[field]
            subject_ids.add(row[ft_def.STUDY_PATIENT_ID])

    if num == 0:
        print('!! NO DATA FOR %s' % name)
        return
    value_mean = total_value/num

    value_std = 0.0
    for row in dataset:
        if cond(row):
            value_std += (row[field] - value_mean)*(row[field] - value_mean)
    value_std /= num
    print('%s Mean %s %f, variance %f [%d entries / %d unique patients]' % (
        name, field, value_mean, value_std, num, len(subject_ids)))

In [2]:
# Koln data
csv_orig = 'data/raw/csv/orig/koln.csv'
csv_output = 'data/raw/csv/koln.csv'

data = []
with open(csv_orig) as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        data.append({
            ft_def.AGE: int(row['Alter']),
            'health_pd': 1,
            ft_def.HEALTHY: 0,
            ft_def.SEX: int(row['Geschlecht (male =1; female =2)'])-1,
            ft_def.STUDY_PATIENT_ID: intval(row['ID'])
        })

with open(csv_output, "wb+") as csvfile:
    writer = csv.DictWriter(csvfile, data[0].keys(), dialect='excel')
    writer.writeheader()
    for row in data:
        writer.writerow(row)

print_stats(data, 'KOLN', 'age', lambda r: True)

KOLN Mean age 63.546875, variance 83.904053 [128 entries / 128 unique patients]


In [3]:
# ADNI AIBL
csv_orig = 'data/raw/csv/orig/adni_aibl.csv'

ADNI_AIBL_GROUPS = ['Normal', 'AD', 'EMCI', 'LMCI', 'MCI', 'SMC']
def convert_adni_aibl(csv_output, cond):
    print('ADNI_AIBL: Converting to %s' % (csv_output))
    data = []
    with open(csv_orig) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Reconstruct a single int subject id from something like '002_S_0295'
            subject_id = intvals(row['Subject ID'])
            subject_id = subject_id[0]*10000 + subject_id[1]
            group = row['DX Group']
            if group not in ADNI_AIBL_GROUPS:
                continue
            cur_data = {
                ft_def.AGE: intval(row['Age']),
                'health_ad': boolval(group == 'AD'),
                'health_emci': boolval(group == 'EMCI'),
                'health_lmci': boolval(group == 'LMCI'),
                'health_mci': boolval(group == 'MCI'),
                'health_smc': boolval(group == 'SMC'),
                ft_def.HEALTHY: boolval(group == 'Normal'),
                ft_def.SEX: 0 if row['Sex'] == 'M' else 1,
                ft_def.STUDY_IMAGE_ID: int(row['Image ID']),
                ft_def.STUDY_PATIENT_ID: subject_id,
            }
            if not cond(row, cur_data):
                continue
            data.append(cur_data)

    with open(csv_output, "wb+") as csvfile:
        writer = csv.DictWriter(csvfile, data[0].keys(), dialect='excel')
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    name = os.path.basename(csv_output).split('.csv')[0]
    print_stats(data, '*', 'age', lambda x: True)
    for ft in ['health_ad', ft_def.HEALTHY]:
        print_stats(data, ft, 'age', lambda row: row[ft] == 1)
    print('')

def is_ad_or_hc(f):
    return f['health_ad'] == 1 or f[ft_def.HEALTHY] == 1

convert_adni_aibl('data/raw/csv/adni_aibl.csv', lambda r, f: True)
convert_adni_aibl('data/raw/csv/adni_aibl__ad_hc.csv', lambda r, f: is_ad_or_hc(f))
convert_adni_aibl('data/raw/csv/adni_aibl__ad_hc__1.5T.csv', lambda r, f: is_ad_or_hc(f) and r['Imaging Protocol'] == 'Field Strength=1.5')

ADNI_AIBL: Converting to data/raw/csv/adni_aibl.csv
* Mean age 74.984580, variance 53.581205 [19001 entries / 1827 unique patients]
health_ad Mean age 75.367903, variance 62.092505 [2642 entries / 359 unique patients]
healthy Mean age 76.501196, variance 35.881413 [5437 entries / 432 unique patients]

ADNI_AIBL: Converting to data/raw/csv/adni_aibl__ad_hc.csv
* Mean age 76.130585, variance 44.735640 [8079 entries / 791 unique patients]
health_ad Mean age 75.367903, variance 62.092505 [2642 entries / 359 unique patients]
healthy Mean age 76.501196, variance 35.881413 [5437 entries / 432 unique patients]

ADNI_AIBL: Converting to data/raw/csv/adni_aibl__ad_hc__1.5T.csv
* Mean age 77.207583, variance 38.483667 [4167 entries / 431 unique patients]
health_ad Mean age 75.707483, variance 57.387601 [1323 entries / 200 unique patients]
healthy Mean age 77.905415, variance 28.155962 [2844 entries / 231 unique patients]



In [4]:
# Erasmus data
csv_orig = '/local/ERSM/ADNI_Diagnosis_mciconv.csv'

ERASMUS_ADNI_GROUPS = ['CN', 'AD', 'MCI']
def convert_erasmus_adni(csv_output, cond):
    print('ERASMUS_ADNI: Converting to %s' % (csv_output))
    data = []
    i = 0
    with open(csv_orig) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Reconstruct a single int subject id from something like '002_S_0295'
            subject_label = row['PTID']
            group = row['Diagnosis']
            if group not in ADNI_AIBL_GROUPS:
                continue
            cur_data = {
                'health_ad': boolval(group == 'AD'),
                'health_mci': boolval(group == 'MCI'),
                ft_def.HEALTHY: boolval(group == 'CN'),
                ft_def.IMAGE_LABEL: subject_label,
                ft_def.STUDY_PATIENT_ID: i,
            }
            i += 1
            if not cond(row, cur_data):
                continue
            data.append(cur_data)

    with open(csv_output, "wb+") as csvfile:
        writer = csv.DictWriter(csvfile, data[0].keys(), dialect='excel')
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    name = os.path.basename(csv_output).split('.csv')[0]
    print_stats(data, '*', 'healthy', lambda x: True)
    for ft in ['health_ad', 'health_mci', ft_def.HEALTHY]:
        print_stats(data, ft, ft, lambda row: row[ft] == 1)
    print('')

convert_erasmus_adni('data/raw/csv/erasmus_adni.csv', lambda r, d: True)

ERASMUS_ADNI: Converting to data/raw/csv/erasmus_adni.csv
* Mean healthy 0.000000, variance 0.000000 [1320 entries / 1320 unique patients]
health_ad Mean health_ad 1.000000, variance 0.000000 [342 entries / 342 unique patients]
health_mci Mean health_mci 1.000000, variance 0.000000 [978 entries / 978 unique patients]
!! NO DATA FOR healthy



In [5]:
# ADNI/AIBL: sai dataset
train_ad_nc = pickle.load(open('/local/ADNI_AIBL/ADNI_AIBL_T1_normalized/py2/AIBL_ADNI_train_T1_NC_AD.pkl', 'r'))
valid_ad_nc = pickle.load(open('/local/ADNI_AIBL/ADNI_AIBL_T1_normalized/py2/AIBL_ADNI_valid_T1_NC_AD.pkl', 'r'))
csv_output = 'data/raw/csv/adni_aibl__ad_hc__sai.csv'

print('ADNI_AIBL[sai_ad_nc]: Converting to %s' % (csv_output))
data = []
for image_label, is_ad in train_ad_nc.items():
    data.append({
        ft_def.IMAGE_LABEL: image_label,
        'health_ad': is_ad,
        ft_def.HEALTHY: boolval(is_ad == 0),
        ft_def.SEX: -1,
        ft_def.AGE: -1,
        ft_def.DATASET: 'train',
    })
for image_label, is_ad in valid_ad_nc.items():
    data.append({
        ft_def.IMAGE_LABEL: image_label,
        'health_ad': is_ad,
        ft_def.HEALTHY: boolval(is_ad == 0),
        ft_def.SEX: -1,
        ft_def.AGE: -1,
        ft_def.DATASET: 'test',
    })

with open(csv_output, "wb+") as csvfile:
    writer = csv.DictWriter(csvfile, data[0].keys(), dialect='excel')
    writer.writeheader()
    for row in data:
        writer.writerow(row)

ADNI_AIBL[sai_ad_nc]: Converting to data/raw/csv/adni_aibl__ad_hc__sai.csv


In [6]:
# PPMI - datakey is image_id
# Example:
# <subject_id>/Axial_PD-T2_TSE_FS/2013-04-09_09_24_46.0/S<serie_id>/
# ... PPMI_4139_MR_Axial_PD-T2_TSE_FS_br_raw_20130625124532171_76_S<serie_id>_I<image_id>.nii
import xml.etree.ElementTree as ET
csv_output = 'data/raw/csv/ppmi.csv'

image_sizes = {}
ppmi_path = '/local/PPMI/raw/'
data = []
for f in glob.glob(ppmi_path + '*.xml'):
    tree = ET.parse(f)
    root = tree.getroot()
    def elem_unique(path):
        e = root.findall(path)
        assert(len(e) == 1)
        return e[0]
    sex_text = elem_unique("./project/subject/subjectSex").text
    age_text = elem_unique("./project/subject/study/subjectAge").text
    research_group = elem_unique("./project/subject/researchGroup").text
    image_id_text = elem_unique("./project/subject/study/imagingProtocol/imageUID").text
    subject_id_text = elem_unique("./project/subject/subjectIdentifier").text
    post_mortem_text = elem_unique("./project/subject/study/postMortem").text
    weighting = elem_unique("./project/subject/study/imagingProtocol/protocolTerm/protocol[@term='Weighting']").text
    assert(post_mortem_text == 'F')
    assert(weighting in ['T1', 'T2'])
    if research_group not in ['Control', 'PD', 'GenCohort PD', 'GenCohort Unaff', 'Prodromal', 'SWEDD']:
        continue
    data.append({
        ft_def.AGE: intval(age_text),
        'health_pd': boolval(research_group == 'PD'),
        'health_prodromal': boolval(research_group == 'Prodromal'),
        'health_swedd': boolval(research_group == 'SWEDD'),
        'health_gencohort_unaff': boolval(research_group == 'GenCohort Unaff'),
        'health_gencohort_pd': boolval(research_group == 'GenCohort PD'),
        ft_def.HEALTHY: boolval(research_group == 'Control'),
        ft_def.SEX: 0 if sex_text == 'M' else 1,
        ft_def.STUDY_IMAGE_ID: int(image_id_text),
        ft_def.STUDY_PATIENT_ID: int(subject_id_text),
    })
    # Also count how many samples per image size
    img_size = (
        elem_unique("./project/subject/study/imagingProtocol/protocolTerm/protocol[@term='Matrix X']").text,
        elem_unique("./project/subject/study/imagingProtocol/protocolTerm/protocol[@term='Matrix Y']").text,
        elem_unique("./project/subject/study/imagingProtocol/protocolTerm/protocol[@term='Matrix Z']").text,
    )
    if not img_size in image_sizes:
        image_sizes[img_size] = 0
    image_sizes[img_size] += 1

with open(csv_output, "wb+") as csvfile:
    writer = csv.DictWriter(csvfile, data[0].keys(), dialect='excel')
    writer.writeheader()
    for row in data:
        writer.writerow(row)

print('%s different sizes found!' % (len(image_sizes)))
print_stats(data, 'PPMI', 'age', lambda x: True)
print_stats(data, 'PPMI/PD', 'age', lambda x: x['health_pd'])
print_stats(data, 'PPMI/Control', 'age', lambda x: x[ft_def.HEALTHY])

171 different sizes found!
PPMI Mean age 61.554399, variance 99.253863 [2785 entries / 976 unique patients]
PPMI/PD Mean age 61.562108, variance 94.287548 [1594 entries / 402 unique patients]
PPMI/Control Mean age 60.155372, variance 128.127926 [605 entries / 182 unique patients]
