In [1]:
import json
import os
import os.path
import pandas as pd

In [2]:
raw_age_at_visit = pd.read_csv(os.path.join('..', 'data', 'csv', 'Age_at_visit.csv'))
raw_u3_score = pd.read_csv(os.path.join('..', 'data', 'csv', 'MDS_UPDRS_Part_III_CAL.csv'))
#Deprecated
#raw_u3_on_off = pd.read_csv(os.path.join('..', 'data', 'csv', 'MDS-UPDRS_Part_III_ON_OFF_Determination___Dosing.csv'))
raw_demographic = pd.read_csv(os.path.join('..', 'data', 'csv', 'Demographics.csv'))
raw_img_info = pd.read_csv(os.path.join('..', 'data', 'T1PD.csv'))
raw_diag = pd.read_csv(os.path.join('..', 'data', 'csv', 'PD_Diagnosis_History.csv'))
raw_ledd = pd.read_csv(os.path.join('..', 'data', 'csv', 'LEDD_Concomitant_Medication_Log.csv'))

  raw_u3_score = pd.read_csv(os.path.join('..', 'data', 'csv', 'MDS_UPDRS_Part_III_CAL.csv'))


In [3]:
def apply_filter(x):
    # MED_ON
    on_score = list(x[x['PDSTATE'] == 'ON']['NP3TOT'])
    if len(on_score) == 0:
        on_score = [None]
    # MED_OFF
    off_score = list(x[x['PDSTATE'] == 'OFF']['NP3TOT'])
    if len(off_score) == 0:
        off_score = [None]
    return pd.Series({'INFODT': list(x['INFODT'])[0], 'NUPDR3OF': off_score[0], 'NUPDR3ON': on_score[0]})

In [4]:
# Keep duplicate index
u3_dup_idx = raw_u3_score.duplicated(subset=['PATNO', 'EVENT_ID'], keep=False)
# Get duplicate records
u3_rec = raw_u3_score[u3_dup_idx][['PATNO', 'EVENT_ID', 'INFODT', 'PDSTATE', 'NP3TOT']].dropna().reset_index(drop=True)
# Generate U3 ON/OFF records by PATNO and EVENT_ID
u3_rec = u3_rec.groupby(['PATNO', 'EVENT_ID']).apply(apply_filter).reset_index().dropna().reset_index(drop=True)
# Get image id
image_meta = raw_img_info.rename(columns={'Image Data ID': 'IMG_ID', 'Subject': 'PATNO', 'Visit': 'EVENT_ID'})
# Merge U3 records and image id
data = pd.merge(u3_rec, image_meta, on=['PATNO', 'EVENT_ID'])[['PATNO', 'EVENT_ID', 'INFODT', 'NUPDR3OF', 'NUPDR3ON', 'IMG_ID']].reset_index(drop=True)

In [5]:
# Get image metadata file path
xmllist = []
for dirpath, dirnames, filenames in os.walk(os.path.join('..', 't1meta')):
    for filename in filenames:
        if dirpath == os.path.join('..', 't1meta'):
            continue
        xmllist.append(os.path.join(dirpath, filename))

In [6]:
# Generate image dataframe by PATNO and IMG_ID
img_path = []
from xml.dom import minidom
for xml in xmllist:
    root = minidom.parse(xml).documentElement
    subject = root.getElementsByTagName('subject')[0].getAttribute('id')
    study = root.getElementsByTagName('study')[0].getAttribute('uid')
    series = root.getElementsByTagName('series')[0].getAttribute('uid')
    image = root.getElementsByTagName('image')[0].getAttribute('uid')
    relative_path = os.path.join(xml.split(os.sep)[2], xml.split(os.sep)[3], xml.split(os.sep)[4], series)
    img_path.append({'PATNO': int(subject), 'IMG_ID': str(image), 'SERIES': str(series), 'IMG_REL_PATH': str(relative_path)})
img_path = pd.DataFrame(img_path)

In [7]:
# Merge image df and main df
data = pd.merge(data, img_path, on=['PATNO', 'IMG_ID'])[['PATNO', 'EVENT_ID', 'INFODT', 'NUPDR3OF', 'NUPDR3ON', 'IMG_ID', 'IMG_REL_PATH']].reset_index(drop=True)

In [8]:
# Merge age_at_visit df and main df to extract age by EVENT_ID
data = pd.merge(data, raw_age_at_visit, on=['PATNO', 'EVENT_ID'])[['PATNO', 'EVENT_ID', 'INFODT', 'NUPDR3OF', 'NUPDR3ON', 'IMG_ID', 'IMG_REL_PATH', 'AGE_AT_VISIT']].reset_index(drop=True)

In [9]:
# Merge demographic df and main df
data = pd.merge(data, raw_demographic.drop(labels=['EVENT_ID', 'INFODT'], axis=1), on=['PATNO'])[['PATNO', 'EVENT_ID', 'INFODT', 'NUPDR3OF', 'NUPDR3ON', 'IMG_ID', 'IMG_REL_PATH', 'AGE_AT_VISIT', 'SEX', 'ORIG_ENTRY']].reset_index(drop=True)

In [10]:
# Duration calculation function
def get_duration(rec):
    visit = rec.EVENT_ID
    id = rec.PATNO
    visit = rec.INFODT
    diag = raw_diag[raw_diag['PATNO'] == id]['PDDXDT'].iloc[0]
    visit = visit.split('/')
    diag = diag.split('/')
    return 12 * (int(visit[1]) - int(diag[1])) + int(visit[0]) - int(diag[0])

In [11]:
# Calculate score
data['SCORE'] = (data['NUPDR3OF'] - data['NUPDR3ON']) / data['NUPDR3OF']
# Calculate duration
data['DURATION'] = data.apply(get_duration, axis=1)
# Calculate categories
data['CAT'] = 1 * (data['SCORE'] >= 0.3)
# Generate unique key
data['KEY'] = data['PATNO'].astype(str) + data['EVENT_ID'] + data['IMG_ID']
# Reformat INFODT
data['INFODT'] = pd.to_datetime(data['INFODT'])

In [12]:
# LEDD extraction
# ???
ledd_rec = raw_ledd[['PATNO', 'LEDTRT', 'STARTDT', 'STOPDT', 'LEDD']].copy()
# Convert to date
ledd_rec['STARTDT'] = pd.to_datetime(ledd_rec['STARTDT'])
ledd_rec['STOPDT'] = pd.to_datetime(ledd_rec['STOPDT'])
# Fill blank stop date with current date
ledd_rec['STOPDT'] = ledd_rec['STOPDT'].fillna(pd.Timestamp.now())
# Drop duplicate records
ledd_rec = ledd_rec.dropna().drop_duplicates(subset=['LEDTRT', 'STARTDT', 'STOPDT', 'LEDD']).reset_index(drop=True)

from functools import reduce
def get_ledd(rec):
    date = rec.INFODT
    id = rec.PATNO
    ledd_history = ledd_rec[ledd_rec['PATNO'] == id]
    # Filter by date, records at start date are dropped
    ledd_history = ledd_history[(ledd_history['STARTDT'] < date) & (ledd_history['STOPDT'] >= date)]
    ledd_list = ledd_history['LEDD']
    # Check if value is float
    ledd_isfloat = list(map(lambda x: x.replace('.','',1).isdigit(), ledd_list))
    # Generate string index list
    ledd_notfloat = [not e for e in ledd_isfloat]
    ld = 0
    # Drop records without baseline ld value
    if len(ledd_list[ledd_isfloat]) == 0:
        return None
    # Sum all float values
    ld = float(reduce(lambda x, y: float(x)+float(y), ledd_list[ledd_isfloat]))
    # Return if no inhibitor is used
    if len(ledd_list[ledd_notfloat]) == 0:
        return ld
    # Replace LD in inhibitor string with ld value
    ledd_eval = list(map(lambda s: s.replace('LD', str(ld)), ledd_list[ledd_notfloat]))
    # Calculate inhibitor values
    ledd_eval = list(map(lambda s: s.replace('x', '*'), ledd_eval))
    ledd_eval = list(map(lambda s: float(eval(s)), ledd_eval))
    # Sum all available values
    ld += sum(ledd_eval)
    return ld

# Get LEDD for all records
data['LEDD'] = data.apply(get_ledd, axis=1)

In [13]:
# BL value
data.loc[data['KEY'] == '3826V04I395598', 'LEDD'] = 300 # Drop?
data = data.dropna().reset_index(drop=True)

In [14]:
data.describe()

Unnamed: 0,PATNO,NUPDR3OF,NUPDR3ON,AGE_AT_VISIT,SEX,SCORE,DURATION,CAT,LEDD
count,355.0,355.0,355.0,355.0,355.0,355.0,355.0,355.0,355.0
mean,28055.949296,28.261972,19.008451,64.479155,0.63662,0.32629,46.729577,0.515493,645.392445
std,27988.870479,12.308492,11.121437,9.142118,0.481652,0.245535,21.703448,0.500465,479.674336
min,3107.0,2.0,1.0,35.1,0.0,-0.272727,2.0,0.0,30.0
25%,3558.0,19.0,11.0,58.4,0.0,0.154701,30.0,0.0,331.2
50%,4082.0,27.0,18.0,65.5,1.0,0.3,51.0,1.0,535.0
75%,50028.0,36.0,25.0,71.5,1.0,0.492424,56.5,1.0,820.0
max,149511.0,62.0,64.0,86.3,1.0,0.956522,112.0,1.0,5140.45


In [15]:
# 以下为影像文件路径相关

In [16]:
# Add root
data['IMG_REL_PATH'] = '..' + os.sep + 't1raw' + os.sep + data['IMG_REL_PATH']

In [17]:
# Extract nii file by path
import shutil
def move_nii(rec):
    orig_path = rec.IMG_REL_PATH
    orig_path = os.path.join(orig_path, os.listdir(rec.IMG_REL_PATH)[0])
    dest_path = os.path.join('..', 't1', str(rec.PATNO)+str(rec.EVENT_ID)+str(rec.IMG_ID))
    #os.mkdir(dest_path)
    dest_path = os.path.join(dest_path, 't1.nii')
    #shutil.copyfile(orig_path, dest_path)
    return dest_path

In [18]:
data['NII_PATH'] = data.apply(move_nii, axis=1)

In [19]:
# Remove 't1.nii' to get IMG_ROOT
data['IMG_ROOT'] = data['NII_PATH'].apply(lambda x: x[:-6])

In [20]:
# Matlab code: M1Segmentation.m

In [21]:
# Preprocessed image path
data['T1_MNI_PATH'] = data['IMG_ROOT'] + 'mri' + os.sep + 'wmt1.nii'
data['T1_GM_PATH'] = data['IMG_ROOT'] + 'mri' + os.sep + 'mwp1t1.nii'

In [22]:
# Matlab code: M2Smooth.m

In [23]:
data['T1_SGM_PATH'] = data['IMG_ROOT'] + 'mri' + os.sep + 'smwp1t1.nii'

In [24]:
# IQR check
iqr_list = []
from xml.dom import minidom
report_list = list(data['IMG_ROOT']+ os.sep + 'report' + os.sep + 'cat_t1.xml')
for report in report_list:
    root = minidom.parse(report).documentElement
    iqr_str = root.getElementsByTagName('catlog')[0].getElementsByTagName('item')[-5].childNodes[0].data
    iqr_str = iqr_str.split(' ')[4][:-1]
    iqr_list.append({'IQR': float(iqr_str)})
iqr_list = pd.DataFrame(iqr_list)
data = pd.concat([data, iqr_list], axis=1)
data = data[data['IQR'] >= 70]

In [25]:
# Write to json file
data['INFODT'] = data['INFODT'].astype(str) # Datetime cant be stored in json
data_json = data.to_dict(orient='records')
with open('data.json', 'w+') as f:
    json.dump(data_json, f, ensure_ascii=False, indent=4)

In [26]:
data

Unnamed: 0,PATNO,EVENT_ID,INFODT,NUPDR3OF,NUPDR3ON,IMG_ID,IMG_REL_PATH,AGE_AT_VISIT,SEX,ORIG_ENTRY,...,DURATION,CAT,KEY,LEDD,NII_PATH,IMG_ROOT,T1_MNI_PATH,T1_GM_PATH,T1_SGM_PATH,IQR
0,3107,V04,2012-03-01,2.0,2.0,I296431,../t1raw/3107/MPRAGE_GRAPPA/2012-03-28_10_35_2...,70.6,1,03/2011,...,14,0,3107V04I296431,760.0,../t1/3107V04I296431/t1.nii,../t1/3107V04I296431/,../t1/3107V04I296431/mri/wmt1.nii,../t1/3107V04I296431/mri/mwp1t1.nii,../t1/3107V04I296431/mri/smwp1t1.nii,78.43
1,3107,V06,2013-05-01,18.0,6.0,I378218,../t1raw/3107/MPRAGE_GRAPPA/2013-05-15_10_04_1...,71.7,1,03/2011,...,28,1,3107V06I378218,760.0,../t1/3107V06I378218/t1.nii,../t1/3107V06I378218/,../t1/3107V06I378218/mri/wmt1.nii,../t1/3107V06I378218/mri/mwp1t1.nii,../t1/3107V06I378218/mri/smwp1t1.nii,78.22
2,3107,V10,2015-05-01,19.0,6.0,I498876,../t1raw/3107/Sag_MPRAGE_GRAPPA/2015-05-08_09_...,73.7,1,03/2011,...,52,1,3107V10I498876,920.0,../t1/3107V10I498876/t1.nii,../t1/3107V10I498876/,../t1/3107V10I498876/mri/wmt1.nii,../t1/3107V10I498876/mri/mwp1t1.nii,../t1/3107V10I498876/mri/smwp1t1.nii,79.42
3,3108,V06,2013-04-01,13.0,15.0,I378222,../t1raw/3108/MPRAGE_GRAPPA/2013-04-24_10_04_3...,51.8,0,04/2011,...,25,0,3108V06I378222,600.0,../t1/3108V06I378222/t1.nii,../t1/3108V06I378222/,../t1/3108V06I378222/mri/wmt1.nii,../t1/3108V06I378222/mri/mwp1t1.nii,../t1/3108V06I378222/mri/smwp1t1.nii,78.86
4,3108,V10,2015-05-01,22.0,12.0,I498885,../t1raw/3108/Sag_MPRAGE_GRAPPA/2015-05-06_09_...,53.8,0,04/2011,...,50,1,3108V10I498885,600.0,../t1/3108V10I498885/t1.nii,../t1/3108V10I498885/,../t1/3108V10I498885/mri/wmt1.nii,../t1/3108V10I498885/mri/mwp1t1.nii,../t1/3108V10I498885/mri/smwp1t1.nii,80.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,101038,V04,2022-05-01,37.0,34.0,I1616556,../t1raw/101038/3D_T1/2022-06-09_16_58_22.0/S1...,65.5,1,04/2021,...,19,0,101038V04I1616556,50.0,../t1/101038V04I1616556/t1.nii,../t1/101038V04I1616556/,../t1/101038V04I1616556/mri/wmt1.nii,../t1/101038V04I1616556/mri/mwp1t1.nii,../t1/101038V04I1616556/mri/smwp1t1.nii,81.63
349,101175,V04,2022-05-01,53.0,40.0,I1582565,../t1raw/101175/3D_T1-weighted/2022-05-10_10_1...,72.0,1,04/2021,...,15,0,101175V04I1582565,510.0,../t1/101175V04I1582565/t1.nii,../t1/101175V04I1582565/,../t1/101175V04I1582565/mri/wmt1.nii,../t1/101175V04I1582565/mri/mwp1t1.nii,../t1/101175V04I1582565/mri/smwp1t1.nii,84.09
350,101179,V04,2022-04-01,45.0,25.0,I1571515,../t1raw/101179/3D_T1-weighted/2022-04-04_10_4...,45.0,0,03/2021,...,34,1,101179V04I1571515,1000.0,../t1/101179V04I1571515/t1.nii,../t1/101179V04I1571515/,../t1/101179V04I1571515/mri/wmt1.nii,../t1/101179V04I1571515/mri/mwp1t1.nii,../t1/101179V04I1571515/mri/smwp1t1.nii,84.97
352,101295,V04,2022-08-01,31.0,23.0,I1616170,"../t1raw/101295/T1-weighted,_3D_VOLUMETRIC/202...",66.3,1,04/2021,...,25,0,101295V04I1616170,225.0,../t1/101295V04I1616170/t1.nii,../t1/101295V04I1616170/,../t1/101295V04I1616170/mri/wmt1.nii,../t1/101295V04I1616170/mri/mwp1t1.nii,../t1/101295V04I1616170/mri/smwp1t1.nii,81.80
