# Abstract

<p>This notebook reads in the NGS data (all 101 positions of Tat) as well as the full clinical information containing all of the neuropsych scoring metrics</p>

# Import requirements

In [1]:
from __future__ import division
import glob
import pandas as pd
import numpy as np
import itertools
from collections import Counter
import time
import matplotlib.pyplot as plt
%matplotlib inline

# Import sequence data

In [2]:
# NGS proportional abundance
seq_abundance_df = pd.read_csv('/Users/greg/Desktop/FullNeuroIllumina/NGSProfiles/NGSprofile.csv')
print seq_abundance_df.shape
seq_abundance_df.head()

(5809, 25)


Unnamed: 0,Patient,Visit,Prot,AAPos,Coverage,A,R,N,D,C,...,L,K,M,F,P,S,T,W,Y,V
0,A0001,R09,Tat1,1,38938,5.1e-05,0.002157,5.1e-05,7.7e-05,0.00208,...,0.006934,0.001695,0.565052,2.6e-05,0,0.000848,0.000128,0.416637,0,0.000205
1,A0010,R08,Tat1,1,171,0.0,0.0,0.0,0.0,0.0,...,0.005848,0.005848,0.584795,0.0,0,0.0,0.0,0.403509,0,0.0
2,A0013,R09,Tat1,1,602,0.0,0.001661,0.0,0.0,0.0,...,0.001661,0.004983,0.282392,0.0,0,0.001661,0.0,0.209302,0,0.0
3,A0019,R12,Tat1,1,1485,0.0,0.001347,0.000673,0.0,0.000673,...,0.008081,0.003367,0.526599,0.0,0,0.001347,0.000673,0.452525,0,0.0
4,A0026,R09,Tat1,1,334,0.0,0.002994,0.0,0.0,0.0,...,0.008982,0.0,0.58982,0.0,0,0.002994,0.0,0.389222,0,0.0


In [3]:
# NGS raw counts
seq_counts_df = pd.read_csv('/Users/greg/Desktop/FullNeuroIllumina/NGSProfiles/NGScounts.csv')
print seq_counts_df.shape
seq_counts_df.head()

(5809, 25)


Unnamed: 0,Patient,Visit,Prot,AAPos,Coverage,A,R,N,D,C,...,L,K,M,F,P,S,T,W,Y,V
0,A0001,R09,Tat1,1,38938,2,84,2,3,81,...,270,66,22002,1,0,33,5,16223,0,8
1,A0010,R08,Tat1,1,171,0,0,0,0,0,...,1,1,100,0,0,0,0,69,0,0
2,A0013,R09,Tat1,1,602,0,1,0,0,0,...,1,3,170,0,0,1,0,126,0,0
3,A0019,R12,Tat1,1,1485,0,2,1,0,1,...,12,5,782,0,0,2,1,672,0,0
4,A0026,R09,Tat1,1,334,0,1,0,0,0,...,3,0,197,0,0,1,0,130,0,0


# Import clinical data

In [4]:
clinical_df = pd.read_csv('/Users/greg/Desktop/FullNeuroIllumina/Clinical/full_clinical.csv')
print clinical_df.shape
clinical_df.head()

(3055, 34)


Unnamed: 0,Patient,Visit,DateOfVisit,Age,Gender,ART,VL,iVL,pVL,CD4,...,TrailB_Heaton_T,LetterFluencyFL_SENAS_T,CategoryFluency_SENAS_T,ROCF_CNNS_T,WorkingMemory_SENAS_T,WordListLearning_SENAS_T,BVMTimmed_CNNS_T,BVMTdelay_CNNS_T,BVMTrecog_CNNS_T,GDS
0,A0001,R00,2006-09-12,51,Male,on,1515,3892,65000,384,...,,,,,,,,,,
1,A0001,R01,2007-08-15,52,Male,on,80,3892,65000,724,...,,,,,,,,,,
2,A0001,R02,2008-06-04,53,Male,on,80,3892,65000,573,...,,,,,,,,,,
3,A0001,R03,2008-11-11,53,Male,on,48,3892,65000,858,...,,,,,,,,,,
4,A0001,R04,2009-11-10,54,Male,on,48,3892,65000,689,...,,,,,,,,,,


# Merge NGS and Clinical data

In [5]:
# relative proportion data
merged_df1 = pd.merge(clinical_df, seq_abundance_df,
                  left_on = ['Patient','Visit'],
                  right_on = ['Patient','Visit'],
                  how = 'inner')
merged_df1.sort_values(['AAPos','Patient','Visit'],  inplace=True)
print merged_df1.shape

# sequence counts data
merged_df2 = pd.merge(clinical_df, seq_counts_df,
                  left_on = ['Patient','Visit'],
                  right_on = ['Patient','Visit'],
                  how = 'inner')
merged_df2.sort_values(['AAPos','Patient','Visit'],  inplace=True)
print merged_df2.shape

(5809, 57)
(5809, 57)


In [6]:
GDSmerged_df = merged_df1.dropna(axis=0, subset=['GDS'])
TMHDSmerged_df = merged_df1.dropna(axis=0, subset=['TMHDS'])
TMHDSGDSmerged_df = TMHDSmerged_df.dropna(axis=0, subset=['GDS'])
print GDSmerged_df.shape
print TMHDSmerged_df.shape
print TMHDSGDSmerged_df.shape

(5608, 57)
(5307, 57)
(5307, 57)


In [7]:
GDSmerged_counts_df = merged_df2.dropna(axis=0, subset=['GDS'])
TMHDSmerged_counts_df = merged_df2.dropna(axis=0, subset=['TMHDS'])
TMHDSGDSmerged_counts_df = TMHDSmerged_counts_df.dropna(axis=0, subset=['GDS'])
print GDSmerged_counts_df.shape
print TMHDSmerged_counts_df.shape
print TMHDSGDSmerged_counts_df.shape

(5608, 57)
(5307, 57)
(5307, 57)


# Merge dataframes

In [8]:
# merged proportional data
GDSmerged_df.to_csv('/Users/greg/Desktop/FullNeuroIllumina/MergedData/NGS_GDS_abundance.csv', index=False)
TMHDSmerged_df.to_csv('/Users/greg/Desktop/FullNeuroIllumina/MergedData/NGS_TMHDS_abundance.csv', index=False)
TMHDSGDSmerged_df.to_csv('/Users/greg/Desktop/FullNeuroIllumina/MergedData/NGS_BOTH_abundance.csv', index=False)

# merged raw counts data
GDSmerged_counts_df.to_csv('/Users/greg/Desktop/FullNeuroIllumina/MergedData/NGS_GDS_counts.csv', index=False)
TMHDSmerged_counts_df.to_csv('/Users/greg/Desktop/FullNeuroIllumina/MergedData/NGS_TMHDS_counts.csv', index=False)
TMHDSGDSmerged_counts_df.to_csv('/Users/greg/Desktop/FullNeuroIllumina/MergedData/NGS_BOTH_counts.csv', index=False)

In [None]:
#GDS_merged_df.to_csv('/Users/greg/Desktop/FullNeuroIllumina/GDS_only_merged.csv', index=False)
#merged_df.to_csv('/Users/greg/Desktop/FullNeuroIllumina/merged_data.csv', index=False)

# Survey data

In [11]:
def slice_position(df, pos):
    m = df['AAPos'] == pos
    sliced_df = df[m]
    return sliced_df

def tally_groups(dataframe, parameter, threshold):
    df = slice_position(dataframe, 1)
    a = sum(df[parameter] < threshold)
    b = sum(df[parameter] >= threshold)
    return (a,b)

In [12]:
DFs = [GDSmerged_df, TMHDSmerged_df, TMHDSGDSmerged_df,
       GDSmerged_counts_df, TMHDSmerged_counts_df,
       TMHDSGDSmerged_counts_df]

for i in range(1,72+1):
    for g in DFs:
        (a,b) = tally_groups(g, 'GDS', 0.5)
        print((a,b)),
    print '\n'

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25, 28) 

(27, 29) (25, 28) (25, 28) (27, 29) (25, 28) (25