In [1]:
#imports 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import urllib.parse
from glob import glob
import os
import ntpath 
import zipfile

#paths
#PPMI_zippath='/Users/alicemartin/02_DSR_Project/parkinson-disease-project/data/PPMI-final-dataset-382018.zip'
PPMI_path='/Users/alicemartin/02_DSR_Project/parkinson-disease-project/data/'

In [2]:
def reading_csv_files(folder_path):
    # saving all the csv files in a dataframe
    import os
    import glob
    # glob.glob('*.csv') #find all the csv files in a pathname. 
    os.chdir(folder_path)
    csv_files = [i for i in glob.glob('*.csv')]

    # Reading each csv file and storing them in a dictionnary containing the file name and the dataframe
    dict_files={}
    files_names=[]
    for files in csv_files:
    #for i, files in enumerate(csv_files): 
        df=pd.read_csv(files,engine='python')
        basepath, filename=ntpath.split(files)
        files_names.append(filename)
        dict_files[filename]=df

    # Calculating the # of rows and features for each dataframe and storing them in a list of tuples
    shape_df=[]
    for i in range(0,len(dict_files.keys())):
        nrows, ncols=(len(dict_files[files_names[i]]),len(dict_files[files_names[i]].columns))
        shape_df.append((nrows,ncols))

    return (shape_df,files_names,dict_files)

In [3]:
def number_of_patients(df):
    '''return the number of patients in each csv files'''
    if 'PATNO' in df.columns:
        patients=len(list(set(df['PATNO'])))
        if 'EVENT_ID' in df.columns:
            events=len(list(set(df['EVENT_ID'])))
        else:
            events='no TS data'
        return (patients,events)
    else: 
        print('no PATNO features in df')

In [4]:
def features_selection(df, SEL):
    '''cleanier way to do features selection in each csv file'''
    return df_sel

In [34]:
def patients_selection(df, threshold=6):
    '''return a dataframe containing only the patients with number of visits > threshold'''
    if 'PATNO' in df.columns: 
        visits_number_by_pat=df.groupby('PATNO').size().sort_values(ascending=False)
        mask_sel=visits_number_by_pat>=threshold
        patients_sel=list(mask_sel[mask_sel.values==True].index)
        df=df.loc[df['PATNO'].isin(patients_sel),:]
        df=df.sort_values('PATNO')
        return (patients_sel, df)
    else:
        print("There is no 'PATNO' column in the dataframe")

In [None]:
def table_analysis(list_df,df_names,**args): 
    '''return a dataframe with: 
    -as rows: list of df
    -as columns: 
        - total number of observations=rows
        - total number of features=columns after features_selection - features_selection function 
        - number of events
        - number of PD patients by using PRODROMA info - number_of_patients function
        - number of patients with # of visits > threshold - number_of_patients function
        - shape of final df after selections of rows and columns
        -number of NaN values in the final df
    '''
    
# Creating a nested dictonary - initialisation with names of keys. 
keys_col=['observations_tot','features','events_num','PD_num','patients_sel','shape_df_sel','NaN_values']
keys_rows=df_names
dict_analysis=dict(zip(keys_rows,dict(zip(keys_col,list(np.zeros(7))))))

# remplissage dictionnary for each df: 
for i,df in enumerate(list_df): 
    # total number of observations


## 1-Subject-Characteristics

In [9]:
folder_path='/Users/alicemartin/02_DSR_Project/parkinson-disease-project/data/PPMI-final-dataset-382018/1-Subject-Characteristics/'
files_names=reading_csv_files(folder_path)[1]
dict_files=reading_csv_files(folder_path)[2]

### Patient_Status - actually useful for selecting only the PD patients in each table

PRODROMA=dict_files[files_names[0]]
print(number_of_patients(PRODROMA))
#print(list(set(PRODROMA['RECRUITMENT_CAT'])))
print(list(set(PRODROMA['ENROLL_CAT'])))
print(PRODROMA['ENROLL_CAT'].isnull().sum())
PD_cat=['PD','REGPD','GENPD']
mask=PRODROMA['ENROLL_CAT'].isin(PD_cat)
PD_patients=PRODROMA.loc[mask,'PATNO']
len(PD_patients)

### Screening__Demographics - useful
#### NOT TS DATA
SCREEN=dict_files[files_names[3]]
SEL=['REC_ID', 'PATNO', 'EVENT_ID', 'PAG_NAME',
       'CONSNTDT', 'APPRDX', 'CURRENT_APPRDX', 'P3GRP', 'BIRTHDT', 'GENDER',
       'HISPLAT', 'RAINDALS', 'RAASIAN', 'RABLACK', 'RAHAWOPI', 'RAWHITE',
       'RANOS', 'PRJENRDT']
SCREEN_sel=SCREEN[SEL]
print(number_of_patients(SCREEN))

(2135, 'no TS data')
[nan, 'REGPD', 'PRODROMA', 'PD', 'GENUN', 'REGUN', 'SWEDD', 'HC', 'GENPD']
331
(2144, 1)


## 2-Biospecimen

In [10]:
folder_path='/Users/alicemartin/02_DSR_Project/parkinson-disease-project/data/PPMI-final-dataset-382018/2-Biospecimen/'
files_names=reading_csv_files(folder_path)[1]
dict_files=reading_csv_files(folder_path)[2]

### Genetic_Results_Tests - useful: Feat-bio - 1116 patients
#### NOT TS DATA

MUTRSLT=dict_files[files_names[3]]
SEL=['REC_ID','PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT',
       'GENECAT', 'LRRKCD', 'MUTRSLT']
MUTRSLT_sel=MUTRSLT[SEL]
print(number_of_patients(MUTRSLT))
MUTRSLT_sel.head()

### Pilot_Biospecimen_Analysis_Results - useful: Feat-bio
#### Understand if the cohort of patient IDs are different from the classic PPMI cohort.
#### NOT TS DATA.

### Pilot_Biospecimen_Analysis_Results - useful: Feat-bio
Biospecimen_Analysis=dict_files[files_names[6]]

SEL=['PATNO', 'DIAGNOSIS', 'CLINICAL_EVENT', 'TYPE', 'TESTNAME',
       'TESTVALUE','RUNDATE', 'PROJECTID']
print(number_of_patients(Biospecimen_Analysis))
Biospecimen_Analysis_sel=Biospecimen_Analysis[SEL]

### Lumbar_Puncture_Sample_Collection - useful: Feat-bio - 1344 patients
LUMBAR=dict_files[files_names[9]]

### Only features useful I think: white cell blood count, red blood cell count, total glucose, total protein.

SEL=['REC_ID','PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT', 'FASTSTAT','PDMEDYN', 'PDMEDDT', 'PDMEDTM',
     'SMPDSCRD','RBCRSLT','WBCRSLT','TOPRRSLT','TGLCRSLT','SPFI','FLUORO']
print(number_of_patients(LUMBAR))
#print(len(patients_selection(LUMBAR,6)[0]))
LUMBAR_sel=LUMBAR[SEL]

(1116, 1)
(664, 'no TS data')
(1344, 20)


## 3-Enrollment - useless?

In [14]:
folder_path='/Users/alicemartin/02_DSR_Project/parkinson-disease-project/data/PPMI-final-dataset-382018/3-Enrollment/'
files_names=reading_csv_files(folder_path)[1]
dict_files=reading_csv_files(folder_path)[2]

### 'Primary_Diagnosis.csv' - useful: Feat-Gen - 789 patients... (only PD patients?)
PRIMDXPD=dict_files[files_names[8]]


SEL=['REC_ID', 'F_STATUS', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT',
       'PRIMDIAG', 'OTHNEURO']
print(number_of_patients(PRIMDXPD))
PRIMDXPD_sel=PRIMDXPD[SEL]

(789, 15)


## 5-Medical History

In [13]:
folder_path='/Users/alicemartin/02_DSR_Project/parkinson-disease-project/data/PPMI-final-dataset-382018/5-Medical-History/'
files_names=reading_csv_files(folder_path)[1]
dict_files=reading_csv_files(folder_path)[2]

### 'Diagnostic_Features.csv': useful: Feat-PD - 1670 patients.

DIAGFEAT=dict_files[files_names[1]]

SEL=['REC_ID', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT',
       'DFSTROKE', 'DFRSKFCT', 'DFPRESNT', 'DFRPROG', 'DFSTATIC', 'DFHEMPRK',
       'DFAGESX', 'DFOTHCRS', 'DFCRSCM', 'DFRTREMP', 'DFRTREMA', 'DFPATREM',
       'DFOTHTRM', 'DFTREMCM', 'DFRIGIDP', 'DFRIGIDA', 'DFAXRIG', 'DFUNIRIG',
       'DFTONE', 'DFOTHRIG', 'DFRIGCM', 'DFBRADYP', 'DFBRADYA', 'DFAKINES',
       'DFBRPLUS', 'DFOTHABR', 'DFABRCM', 'DFPGDIST', 'DFGAIT', 'DFFREEZ',
       'DFFALLS', 'DFOTHPG', 'DFPGCM', 'DFPSYCH', 'DFCOGNIT', 'DFDYSTON',
       'DFCHOREA', 'DFMYOCLO', 'DFOTHHYP', 'DFHYPCM', 'DFHEMTRO', 'DFPSHYPO',
       'DFSEXDYS', 'DFURDYS', 'DFBWLDYS', 'DFOCULO', 'DFEYELID', 'DFNEURAB',
       'DFDOPRSP', 'DFRAPSPE', 'DFBULBAR', 'DFCTSCAN', 'DFMRI', 'DFATYP']
print(number_of_patients(DIAGFEAT))
DIAGFEAT_sel=DIAGFEAT[SEL]

### 'General_Medical_History.csv': useful: Feat-Gen - 2067 patients
#### NOT TS DATA...

MHXGEN=dict_files[files_names[3]]
SEL=['REC_ID', 'F_STATUS', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT',
       'MHCAT', 'MHROW', 'MHHX', 'MHACTRES', 'MHDIAGYR', 
     'PT_CODE', 'PT_NAME', 'HLT_NAME', 'SOCABBR1']
# I need probably to select less features here
print(number_of_patients(MHXGEN))
MHXGEN_sel=MHXGEN[SEL]

### General_Physical_Exam: useful: Feat-Gen - 2047 patients

GENPHYEX=dict_files[files_names[5]]

SEL=['REC_ID', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT',
       'PESEQ', 'PECAT', 'ABNORM']
print(number_of_patients(GENPHYEX))
GENPHYEX_sel=GENPHYEX[SEL]

### 'General_Neurological_Exam.csv': useful: Feat-Gen - 2048 patients

PENEURO=dict_files[files_names[7]]

SEL=['REC_ID', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT',
       'MSRARSP', 'MSRACM', 'MSLARSP', 'MSLACM', 'MSRLRSP', 'MSRLCM',
       'MSLLRSP', 'MSLLCM', 'COFNRRSP', 'COFNRCM', 'COFNLRSP', 'COFNLCM',
       'COHSRRSP', 'COHSRCM', 'COHSLRSP', 'COHSLCM', 'SENRARSP', 'SENRACM',
       'SENLARSP', 'SENLACM', 'SENRLRSP', 'SENRLCM', 'SENLLRSP', 'SENLLCM',
       'RFLRARSP', 'RFLRACM', 'RFLLARSP', 'RFLLACM', 'RFLRLRSP', 'RFLRLCM',
       'RFLLLRSP', 'RFLLLCM', 'PLRRRSP', 'PLRRCM', 'PLRLRSP', 'PLRLCM']
print(number_of_patients(PENEURO))
PENEURO_sel=PENEURO[SEL]
PENEURO_sel.head()

### 'Neurological_Exam_-_Cranial_Nerves.csv': useful: Feat-PD - 2049 patients

PENEURO2=dict_files[files_names[13]]
PENEURO2.head()
PENEURO2.columns

SEL=['REC_ID', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT',
       'CN1RSP','CN2RSP', 'CN346RSP','CN5RSP',
     'CN7RSP', 'CN8RSP', 'CN910RSP',
       'CN11RSP', 'CN12RSP']
print(number_of_patients(PENEURO2))
PENEURO2_sel=PENEURO2[SEL]

### 'PD_Features.csv': useful: Feat-PD - caution: these are actually not timeseries but examination only done at screening (SC), BL & V04
### 1017 patients

PDFEAT=dict_files[files_names[14]]
#PDFEAT.head()
#PDFEAT.columns

SEL=['REC_ID', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT', 'SXMO',
       'SXYEAR', 'PDDXDT', 'DXTREMOR', 'DXRIGID', 'DXBRADY',
       'DXPOSINS', 'DXOTHSX', 'DXOTHCM', 'DOMSIDE']
print(number_of_patients(PDFEAT))
PDFEAT_sel=PDFEAT[SEL]
#list(set(PDFEAT_sel['DXOTHCM'])) - see if we remove the comment feature or not.

### 'Vital_Signs.csv': useful: Feat-Gen
### 2047 patients

VITAL=dict_files[files_names[15]]

SEL=['REC_ID', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT',
       'WGTKG', 'HTCM', 'TEMPC', 'SYSSUP', 'DIASUP', 'HRSUP',
       'SYSSTND', 'DIASTND', 'HRSTND']
print(number_of_patients(VITAL))
VITAL_sel=VITAL[SEL]

(1670, 19)
(2067, 2)
(2047, 8)
(2048, 15)
(2049, 15)
(1017, 3)
(2047, 21)


## 6-Motor-Assessments - all of them can be useful. 
#### But we might make the choice to not take all of them. 
#### For assessing the disease state (the labels), I think we should focus on the UPDRS1,3,&4 and take the other assessments as features

In [15]:
folder_path='/Users/alicemartin/02_DSR_Project/parkinson-disease-project/data/PPMI-final-dataset-382018/6-Motor-Assessments/'
files_names=reading_csv_files(folder_path)[1]
dict_files=reading_csv_files(folder_path)[2]

### 'MDS_UPDRS_Part_I__Patient_Questionnaire.csv' - 2011 patients

NUPDR1SP=dict_files[files_names[0]]
print(number_of_patients(NUPDR1SP))

### 'MDS_UPDRS_Part_I.csv' - 2010 patients

NUPDRS1=dict_files[files_names[2]]
print(number_of_patients(NUPDRS1))

### 'MDS_UPDRS_Part_II__Patient_Questionnaire.csv' - 2011 patients

NUPDRS2P=dict_files[files_names[11]]
print(number_of_patients(NUPDRS2P))

### 'MDS_UPDRS_Part_III.csv' - 2010 patients

NUPDRS3=dict_files[files_names[7]]
print(number_of_patients(NUPDRS3))

### 'MDS_UPDRS_Part_IV.csv' - 860 patients

NUPDRS4=dict_files[files_names[5]]
print(number_of_patients(NUPDRS4))

### 'PASE_-_Household_Activity.csv' - 1248 patients

PASE=dict_files[files_names[3]]
print(number_of_patients(PASE))

### 'PASE_-_Leisure_Time_Activity.csv' - 1248 patients

PASELT=dict_files[files_names[10]]
print(number_of_patients(PASELT))

### 'Modified_Schwab_+_England_ADL.csv' - 1837 patients

MODSEADL=dict_files[files_names[6]]
print(number_of_patients(MODSEADL))

(2011, 20)
(2010, 20)
(2011, 20)
(2010, 20)
(860, 19)
(1248, 13)
(1248, 13)
(1837, 20)


## 7-Non-motor-assessments - all of them can be useful but we might need to choose. 
### Do a correlation analysis and ask doctors which ones are the more relevant. 

In [17]:
folder_path='/Users/alicemartin/02_DSR_Project/parkinson-disease-project/data/PPMI-final-dataset-382018/7-Non-motor-assessments/'
files_names=reading_csv_files(folder_path)[1]
dict_files=reading_csv_files(folder_path)[2]

### 'University_of_Pennsylvania_Smell_ID_Test.csv' - 1799 patients

UPSIT=dict_files[files_names[0]]
#print(number_of_patients(UPSIT))
UPSIT.head()

### 'Features_of_REM_Behavior_Disorder.csv' - only 80 patients...
#### NOT TS data.

REMBHVDS=dict_files[files_names[1]]
#print(number_of_patients(REMBHVDS))
REMBHVDS.head()

### 'Epworth_Sleepiness_Scale.csv - 1343 patients.

EPWORTH=dict_files[files_names[2]]
#print(number_of_patients(EPWORTH))
EPWORTH.head()

### 'Letter_-_Number_Sequencing__PD_.csv' - 1341 patients.

LNSPD=dict_files[files_names[3]]
#print(number_of_patients(LNSPD))
LNSPD.head()

### 'Olfactory_UPSIT.csv' - 4468 patients
#### NB: only take the 2 last columns as features (TOTAL_CORRECT, UPSIIT_PRCNTGE)

OLFACT=dict_files[files_names[4]]
#print(number_of_patients(OLFACT))
#print(len(list(set(OLFACT['SUBJECT_ID']))))
OLFACT.head()

### 'REM_Sleep_Disorder_Questionnaire.csv' - 1810 patients

REMSLEEP=dict_files[files_names[5]]
#print(number_of_patients(REMSLEEP))
REMSLEEP.head()

### 'Cognitive_Assessments.csv' - 1172 patients

COGTIME=dict_files[files_names[6]]
#print(number_of_patients(COGTIME))
COGTIME.head()

### 'SCOPA-AUT.csv' - 1345 patients

SCOPAAUT=dict_files[files_names[7]]
#print(number_of_patients(SCOPAAUT))
SCOPAAUT.head()

### Cognitive_Categorization.csv' - 1302 patients

COGCATG=dict_files[files_names[8]]
#print(number_of_patients(COGCATG))
COGCATG.head()

### 'Hopkins_Verbal_Learning_Test.csv' - 1341 patients

HVLT=dict_files[files_names[9]]
#print(number_of_patients(HVLT))
HVLT.head()

### 'Montreal_Cognitive_Assessment__MoCA_.csv' - 2054 patients

MOCA=dict_files[files_names[10]]
#print(number_of_patients(MOCA))
MOCA.head()

### 'Semantic_Fluency.csv' - 1342 patients

SFT=dict_files[files_names[11]]
#print(number_of_patients(SFT))
SFT.head()

### 'Geriatric_Depression_Scale__Short_.csv' - 1478 patients

GDSSHORT=dict_files[files_names[12]]
#print(number_of_patients(GDSSHORT))
GDSSHORT.head()

### 'Symbol_Digit_Modalities.csv' - 1340 patients

SDM=dict_files[files_names[13]]
#print(number_of_patients(SDM))
SDM.head()

### 'State-Trait_Anxiety_Inventory.csv' - 1474 patients

STAI=dict_files[files_names[14]]
#print(number_of_patients(STAI))
STAI.head()

### 'Benton_Judgment_of_Line_Orientation.csv' - 1341 patients

BENTONOD=dict_files[files_names[15]]
#print(number_of_patients(BENTONOD))
BENTONOD.head()

### 'QUIP_Current_Short.csv' - 1346 patients

QUIPCS=dict_files[files_names[16]]
#print(number_of_patients(QUIPCS))



In [35]:
list_df=[LUMBAR,Biospecimen_Analysis,
PRIMDXPD,DIAGFEAT,MHXGEN,GENPHYEX,PENEURO,PENEURO2,VITAL,
NUPDRS1,NUPDRS2P,NUPDRS3,NUPDRS4,PASE,PASELT,MODSEADL,
UPSIT,REMBHVDS,EPWORTH,LNSPD,OLFACT,REMSLEEP,COGTIME,SCOPAAUT,COGCATG,HVLT,
MOCA,SFT,GDSSHORT,SDM,STAI,BENTONOD,QUIPCS]
df_names=['LUMBAR','Biospecimen_Analysis',
'PRIMDXPD','DIAGFEAT','MHXGEN','GENPHYEX','PENEURO','PENEURO2','VITAL',
'NUPDRS1','NUPDRS2P','NUPDRS3','NUPDRS4','PASE','PASELT','MODSEADL',
'UPSIT','REMBHVDS','EPWORTH','LNSPD','OLFACT','REMSLEEP','COGTIME','SCOPAAUT','COGCATG','HVLT',
'MOCA','SFT','GDSSHORT','SDM','STAI','BENTONOD','QUIPCS']

patients_sel_num=[]
dict_df_patientSel={}

for i,df in enumerate(list_df): 
    #print(number_of_patients(df))
    if 'PATNO' in df.columns:
        dict_df[df_names[i]]=len(patients_selection(df)[0])
dict_df

{'PRODROMA': 0,
 'SCREEN': 0,
 'MUTRSLT': 0,
 'LUMBAR': 531,
 'Biospecimen_Analysis': 107,
 'PRIMDXPD': 447,
 'DIAGFEAT': 320,
 'MHXGEN': 2067,
 'GENPHYEX': 2047,
 'PENEURO': 440,
 'PENEURO2': 440,
 'PDFEAT': 0,
 'VITAL': 910,
 'NUPDRS1': 856,
 'NUPDRS2P': 857,
 'NUPDRS3': 896,
 'NUPDRS4': 405,
 'PASE': 44,
 'PASELT': 1248,
 'MODSEADL': 717,
 'UPSIT': 0,
 'REMBHVDS': 0,
 'EPWORTH': 529,
 'LNSPD': 433,
 'REMSLEEP': 532,
 'COGTIME': 0,
 'SCOPAAUT': 530,
 'COGCATG': 265,
 'HVLT': 436,
 'MOCA': 433,
 'SFT': 433,
 'GDSSHORT': 563,
 'SDM': 436,
 'STAI': 561,
 'BENTONOD': 433,
 'QUIPCS': 531}

# Datasets to investigate

### Blood_Chemistry__Hematology - Contains information about patient blood. 
### Ask doctors if really useful. 

In [None]:
COVANCE=dict_files[files_names[12]]
#COVANCE.head()

### IUSM_Catalog - DNK - most likely useless
#### seems ti be only metadata on the lab collections.
Look @ the PPMI doc to understand what is this table

In [None]:
IUSMCAT=dict_files[files_names[14]]
#IUSMCAT.head()

# Additionnal data that will be used later on (imaging data and medication data)

## 4-Imaging - dataset not to be used for the time being.

In [None]:
folder_path='/Users/alicemartin/02_DSR_Project/parkinson-disease-project/data/PPMI-final-dataset-382018/4-Imaging/'
files_names=reading_csv_files(folder_path)[1]
dict_files=reading_csv_files(folder_path)[2]

### FBB_Analysis_Data.csv

In [None]:
FBBANAL=dict_files[files_names[0]]
#FBBANAL.head()

### AV-133_Image_Metadata.csv

In [None]:
AV133META=dict_files[files_names[1]]
#AV133META.head()

### AV-133_Imaging.csv

In [None]:
AVIMAG=dict_files[files_names[2]]
#AVIMAG.head()

### DaTscan_Imaging.csv

In [None]:
DATSCAN=dict_files[files_names[3]]
#DATSCAN.head()

### SPECT_Scan_Information_Source_Document.csv

In [None]:
SPECTSCANINFO=dict_files[files_names[4]]
#SPECTSCANINFO.head()

### AV-133_SBR_Results.csv

In [None]:
AV133SBRRSLT=dict_files[files_names[5]]
#AV133SBRRSLT.head()

### Magnetic_Resonance_Imaging.csv

In [None]:
MRI=dict_files[files_names[6]]
#MRI.head()

### DTI_Regions_of_Interest

In [None]:
DTIROI=dict_files[files_names[7]]
#DTIROI.head()

### DaTSCAN_SPECT_Visual_Interpretation_Assessment.csv

In [None]:
FBBMETA=dict_files[files_names[8]]
#FBBMETA.head()

### 'FBB_Metadata.csv'

In [None]:
FBBMETA=dict_files[files_names[9]]
#FBBMETA.head()

### 'MRI_Imaging_Data_Transfer_Information_Source_Document.csv'

In [None]:
MRIINFO=dict_files[files_names[10]]
#MRIINFO.head()

## Medications data & misc data - to be used later on.

### Family_History_PD - most likely useful but not to be used for now.
#### Can be useful for detecting early PD

In [None]:
FAMHXPD=dict_files[files_names[2]]
#FAMHXPD.head()

### 'Prodromal_Diagnostic_Questionnaire.csv': useful: Feat-Gen (PRIMDIAG & PSLVL2)
#### NB: PSLV2 evaluates the proba of the patient being PD: might be useful for early PD. 
#### But not to be used in the main analysis as a feature (only for Prodromal - i.e 800 subjects)

In [None]:
PRODIAG=dict_files[files_names[0]]
#PRODIAG.head()
#PRODIAG.columns

SEL=['REC_ID', 'F_STATUS', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'PRIMDIAG',
       'OTHNEURO', 'PSLVL2']
print(number_of_patients(PRODIAG))
PRODIAG_sel=PRODIAG[SEL]
PRODIAG_sel.head()

### 'Signature_Form.csv' - most likely useful: Meds - but not to be used for now.
#### Can be useful for the meds log

In [None]:
SIG=dict_files[files_names[10]]
#SIG.head()

### Use of PD Medication: useful: Meds - but not to be used for now.

In [None]:
PDMEDUSE=dict_files[files_names[4]]
#PDMEDUSE.head()

### 'Concomitant_Medications.csv': useful: Meds - But not to be used for now.

In [None]:
CMED=dict_files[files_names[6]]
#CMED.head()

### 'Current_Medical_Conditions_Log.csv': useful: Meds - but not to be used for now. 

In [None]:
### 'Current_Medical_Conditions_Log.csv': Meds
CURRCOND=dict_files[files_names[8]]
#CURRCOND.head()

### 'Adverse_Event_Log.csv': useful: Meds - but not to be used for now.
#### Measures the side effects of the meds taken for PD.

In [None]:
LOG=dict_files[files_names[9]]
#LOG.head()

### 'Surgery_for_Parkinson_Disease.csv': useful: Meds - but not to be used for now.

In [None]:
SURGPD=dict_files[files_names[12]]
#SURGPD.head()

## Datasets with two few data points (not enough patients)

### Skin_Biopsy - useful: Feat-bio (wound closure) - only 26 patients...

In [None]:
SKBIO=dict_files[files_names[1]]
#SKBIO.head()
#SKBIO.columns

SEL=['REC_ID','PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT','SKBIOCMP','WOUNDCLS']
SKBIO_sel=SKBIO[SEL]
print(number_of_patients(SKBIO))
SKBIO_sel.head()
#list(set(SKBIO_sel['EVENT_ID']))

### IPSC_Labs: useful - Feat-bio: blood, liver disease, heparin(?), myeloma(?) stuff (linked to Med dose history)
### ONLY 137 patients...

In [None]:
### ISPC_Labs: useful - Feat-bio
#### Select only history of liver disease and multiple myeloma (can be useful for treatments strategies)
IPSCLAB=dict_files[files_names[5]]
#IPSCLAB.head()
IPSCLAB.columns

SEL=['REC_ID','PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT',
       'PDMEDYN', 'PDMEDDT', 'PDMEDTM', 'WARFBEF', 'HEPBEF', 'HXLIVER',
       'HXMMYLMA', 'BLDLTH', 'LTHTM', 'LTHINVN', 'BLDSST', 'BLDCPT']
# we might be able to remove some features. 
print(number_of_patients(IPSCLAB))
IPSCLAB_sel=IPSCLAB[SEL]
IPSCLAB_sel.head()
#list(set(IPSCLAB_sel['EVENT_ID']))

### 'Clinical_Diagnosis_and_Management.csv': Feat-PD - only 73 patients...

In [None]:
### 'Clinical_Diagnosis_and_Management.csv': Feat-PD
CLINDX=dict_files[files_names[2]]
CLINDX.head()
#CLINDX.columns

SEL=['REC_ID', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT',
       'PSLVL', 'DCNOMTR', 'DCRTREM', 'DCRIGID', 'DCBRADY', 'DFPGDIST',
       'DFOTH', 'PRIMDIAG', 'OTHNEURO', 'CLDXCHNG', 'DXFDTIMG',
       'DXFCLSGN', 'DXFRSPMD', 'DXFNATHX', 'DXFOTH', 'DXFOTHCM', 'MNGCHNG',
       'MNGTRTSX', 'MNGTRIAL', 'MNGTRTOT', 'MNGDXTST', 'MNGNOTRT',
       'NEURWKDX']
print(number_of_patients(CLINDX))
CLINDX_sel=CLINDX[SEL]
CLINDX_sel.head()
#list(set(CLINDX_sel['EVENT_ID']))

### 'TAP-PD_OPDM_Assessment.csv' - only 173 patients...


In [None]:
TAPOPDM=dict_files[files_names[8]]
print(number_of_patients(TAPOPDM))
TAPOPDM.head()

### 'TAP-PD_OPDM_Use_Questionnaire.csv' - only 32 patients

In [None]:
TAPUSE=dict_files[files_names[1]]
print(number_of_patients(TAPUSE))
TAPUSE.head()

### 'Gait_Data___Arm_swing.csv' - not sure if we should consider this dataset. - only 62 patients...

In [None]:
GAITDATA=dict_files[files_names[4]]
print(number_of_patients(GAITDATA))
GAITDATA.head()

### 'TAP-PD_Kinetics_Device_Testing.csv' - Idem for this one- not sure it is essential to consider it.
#### NO TS data. 
### Only 32 patients...

In [None]:
KINETICS=dict_files[files_names[9]]
print(number_of_patients(KINETICS))
KINETICS.head()