# Import requirements

In [1]:
from __future__ import division
import glob
import pandas as pd
import numpy as np
import itertools
from collections import Counter
import time
import matplotlib.pyplot as plt
%matplotlib inline

# Load REDCap data

In [2]:
def import_redcap_data(redcap_path):
    """Read a tab-separated REDCap export into a DataFrame.

    Parameters
    ----------
    redcap_path : str or file-like
        Location of the export; it is parsed with a tab delimiter
        regardless of the file extension.

    Returns
    -------
    pandas.DataFrame
        The table exactly as parsed by ``pd.read_csv``.
    """
    return pd.read_csv(redcap_path, sep='\t')

def select_clinical_parameters(df, cols):
    """Pick the columns named in ``cols`` and rename them to their short names.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it must contain every key of ``cols``.
    cols : dict
        Maps source column label -> new column name. One of the new names
        must be ``'DateOfVisit'``, because that column is parsed as dates.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the selected columns under their new names,
        with ``DateOfVisit`` converted by ``pd.to_datetime``.
    """
    source_labels = list(cols)
    clinical = df[source_labels].rename(columns=cols)
    visit_dates = pd.to_datetime(clinical['DateOfVisit'])
    clinical['DateOfVisit'] = visit_dates
    return clinical

redcap_path = '/Users/greg/Desktop/Cohort/DrexelMedCARESCohort_DATA_LABELS_2015-10-06_1434.csv'
cols = {'PatientID':'Patient',
        'Patient visit number':'Visit',
        'Date of visit':'DateOfVisit',
        'Initial CD4 count (cells/uL)':'iCD4',
        'Nadir CD4 count (cells/uL)':'nCD4',
        'Latest CD4 count (cells/uL)':'CD4',
        'Initial CD8 count (cells/uL)': 'iCD8',
        'Nadir CD8 count (cells/uL)': 'nCD8',
        'Latest CD8 count (cells/uL)': 'CD8',
        'Initial viral load (copies/mL)': 'iVL',
        'Peak viral load (copies/mL)': 'pVL',
        'Latest viral load': 'VL',
        'Total Modified Hopkins Dementia Score': 'TMHDS',
        'Gender': 'Gender',
        'Age': 'Age',
        'Current ART status': 'ART'}

col_order = ['Patient', 'Visit', 'DateOfVisit', 'Age', 'Gender', 'ART',
             'VL', 'iVL', 'pVL', 'CD4', 'iCD4', 'nCD4', 'CD8', 'iCD8', 'nCD8',
             'TMHDS']


redcap_df = import_redcap_data(redcap_path)
redcap_df['PatientID'] = redcap_df['PatientID'].astype(str)
print 'REDCAP:', redcap_df.shape

clin_df = select_clinical_parameters(redcap_df, cols)
clin_df = clin_df[col_order]
print 'CLIN:', clin_df.shape
clin_df.head(10)

REDCAP: (3055, 433)
CLIN: (3055, 16)


  if self.run_code(code, result):


Unnamed: 0,Patient,Visit,DateOfVisit,Age,Gender,ART,VL,iVL,pVL,CD4,iCD4,nCD4,CD8,iCD8,nCD8,TMHDS
0,A0001,R00,2006-09-12,51,Male,on,1515,3892,65000,384,611,301,1192,1421,,
1,A0001,R01,2007-08-15,52,Male,on,80,3892,65000,724,611,301,1423,1421,,
2,A0001,R02,2008-06-04,53,Male,on,80,3892,65000,573,611,301,1273,1421,1002.0,1.0
3,A0001,R03,2008-11-11,53,Male,on,<48,3892,65000,858,611,301,1736,1421,1002.0,5.0
4,A0001,R04,2009-11-10,54,Male,on,<48,3892,65000,689,611,301,1378,1421,1002.0,8.5
5,A0001,R05,2010-05-05,55,Male,on,<48,3892,65000,699,611,301,1060,1421,1002.0,6.0
6,A0001,R06,2010-12-14,55,Male,on,<48,3892,65000,681,611,301,1060,1421,1002.0,6.0
7,A0001,R07,2014-02-10,58,Male,on,33,987,987,742,400,400,1130,1168,1060.0,8.5
8,A0001,R08,2014-07-29,59,Male,on,< 20,987,987,987,400,400,1131,1164,1060.0,2.0
9,A0001,R09,2014-11-10,59,Male,on,< 20,987,987,797,400,400,1131,1164,1060.0,4.0


# Load GDS data

In [3]:
gds_path = '/Users/greg/Desktop/Cohort/Drexel HIV NP Data select standardized 072815 - fixed.xlsx'
GDS_df = pd.read_excel(gds_path, headline=1)
GDS_df['PatientID'] = GDS_df['PatientID'].astype(str)
print 'GDS:', GDS_df.shape
GDS_df.head()

GDS: (197, 20)


Unnamed: 0,PatientID,Visit,VisitDate,MMSE_CNNS_T,WRAT4Reading_Manual_T,WAIS4Information_Manual_T,WAIS3DigSymCoding_manual_T,GroovedPegDom_Heaton_T,GroovedPegNonDom_Heaton_T,TrailA_Heaton_T,TrailB_Heaton_T,LetterFluencyFL_SENAS_T,CategoryFluency_SENAS_T,ROCF_CNNS_T,WorkingMemory_SENAS_T,WordListLearning_SENAS_T,BVMTimmed_CNNS_T,BVMTdelay_CNNS_T,BVMTrecog_CNNS_T,GDS
0,A0001,1,2014-11-10,62,30.666667,33.333333,36.666667,46,37,45,47.0,42.576073,45.236022,50,51.183651,53.126821,41,19,21,0.583333
1,A0002,1,2013-10-22,24,38.0,30.0,30.0,27,28,37,39.0,43.489099,42.094484,26,27.864799,18.091191,22,19,19,2.5
2,A0004,1,2014-11-10,41,26.666667,46.666667,40.0,50,51,45,,49.947669,68.200091,37,50.12684,34.34356,44,42,32,0.272727
3,A0005,1,2013-12-12,52,45.333333,46.666667,30.0,33,41,46,48.0,51.693154,72.062357,63,51.201786,17.079819,44,47,19,0.75
4,A0008,1,2014-08-05,72,40.666667,43.333333,33.333333,44,44,48,53.0,42.026762,48.579639,39,40.980445,41.293033,35,33,30,0.5


# GDS visit corrections

In [4]:
visits = []
for idx, row in GDS_df.iterrows():
    NEURO_PATIENT = row['PatientID']
    NEURO_DATE = row['VisitDate']    
    
    clin_info = clin_df[clin_df.Patient == NEURO_PATIENT][['Patient', 'Visit', 'DateOfVisit']]
    clin_info2 = clin_info[clin_info.DateOfVisit == NEURO_DATE][['Patient', 'Visit', 'DateOfVisit']]
    if clin_info2.empty:
        #print NEURO_PATIENT, NEURO_DATE
        for a, b in clin_info.dropna().iterrows():
            days_difference = b.DateOfVisit - NEURO_DATE
            if abs(days_difference.days) < 7:
                visits.append(b.Visit)
                #print ''
    else:
        #print NEURO_PATIENT, NEURO_DATE
        visits.append(list(clin_info2['Visit'])[0])
        #print ''
        
GDS_df['Visit'] = visits
print GDS_df.shape
GDS_df[['PatientID', 'Visit', 'VisitDate', 'GDS']].head()

(197, 20)


Unnamed: 0,PatientID,Visit,VisitDate,GDS
0,A0001,R09,2014-11-10,0.583333
1,A0002,R11,2013-10-22,2.5
2,A0004,R10,2014-11-10,0.272727
3,A0005,R05,2013-12-12,0.75
4,A0008,R07,2014-08-05,0.5


# Merge clinical information with full neuro data

In [5]:
fullclinical_df = pd.merge(clin_df,GDS_df,
                  left_on = ['Patient','Visit'],
                  right_on = ['PatientID','Visit'],
                  how = 'outer')

fullclinical_df.drop(['PatientID'], axis=1, inplace=True)
print fullclinical_df.shape
fullclinical_df.head()

(3055, 34)


Unnamed: 0,Patient,Visit,DateOfVisit,Age,Gender,ART,VL,iVL,pVL,CD4,...,TrailB_Heaton_T,LetterFluencyFL_SENAS_T,CategoryFluency_SENAS_T,ROCF_CNNS_T,WorkingMemory_SENAS_T,WordListLearning_SENAS_T,BVMTimmed_CNNS_T,BVMTdelay_CNNS_T,BVMTrecog_CNNS_T,GDS
0,A0001,R00,2006-09-12,51,Male,on,1515,3892,65000,384,...,,,,,,,,,,
1,A0001,R01,2007-08-15,52,Male,on,80,3892,65000,724,...,,,,,,,,,,
2,A0001,R02,2008-06-04,53,Male,on,80,3892,65000,573,...,,,,,,,,,,
3,A0001,R03,2008-11-11,53,Male,on,<48,3892,65000,858,...,,,,,,,,,,
4,A0001,R04,2009-11-10,54,Male,on,<48,3892,65000,689,...,,,,,,,,,,


# Numerical value corrections

In [6]:
incorrect_columns = ['iCD4','nCD4','CD4','iCD8','nCD8','CD8','iVL','pVL','VL']

for colname in incorrect_columns:
    #print colname
    corrected_values = []
    for item in list(fullclinical_df[colname]):
        l = []
        if isinstance(item, float):
            corrected_values.append(item)
        elif isinstance(item, str):
            item = item.replace('<','')
            item = item.replace('>','')
            item = item.replace(',','')
            item = item.replace('?','')
            for t in item.split():
                try:
                    l.append(float(t))
                except ValueError:
                    #pass
                    l.append(np.nan)
                except IndexError:
                    l.append(np.nan)
            #print l, item
            corrected_values.append(l[0])
        else:
            print item, type(item)
    #print len(corrected_values)
    fullclinical_df[colname] = corrected_values

print 'Full Clinical:', fullclinical_df.shape
fullclinical_df.head()

Full Clinical: (3055, 34)


Unnamed: 0,Patient,Visit,DateOfVisit,Age,Gender,ART,VL,iVL,pVL,CD4,...,TrailB_Heaton_T,LetterFluencyFL_SENAS_T,CategoryFluency_SENAS_T,ROCF_CNNS_T,WorkingMemory_SENAS_T,WordListLearning_SENAS_T,BVMTimmed_CNNS_T,BVMTdelay_CNNS_T,BVMTrecog_CNNS_T,GDS
0,A0001,R00,2006-09-12,51,Male,on,1515,3892,65000,384,...,,,,,,,,,,
1,A0001,R01,2007-08-15,52,Male,on,80,3892,65000,724,...,,,,,,,,,,
2,A0001,R02,2008-06-04,53,Male,on,80,3892,65000,573,...,,,,,,,,,,
3,A0001,R03,2008-11-11,53,Male,on,48,3892,65000,858,...,,,,,,,,,,
4,A0001,R04,2009-11-10,54,Male,on,48,3892,65000,689,...,,,,,,,,,,


In [7]:
# Write the merged, cleaned table to disk without the index column.
# NOTE(review): absolute, machine-specific output path.
full_clinical_csv = '/Users/greg/Desktop/FullNeuroIllumina/full_clinical.csv'
fullclinical_df.to_csv(full_clinical_csv, index=False)