# Import requirements

In [200]:
from __future__ import division
import glob
import math
import pandas as pd
import numpy as np
from itertools import combinations
from collections import Counter
import time
import matplotlib.pyplot as plt
%matplotlib inline

# Import Data

In [25]:
DF = pd.read_csv('/Users/greg/Desktop/FullNeuroIllumina/merged_data.csv')
print DF.shape
DF.head()

(3900, 57)


Unnamed: 0,Patient,Visit,DateOfVisit,Age,Gender,ART,VL,iVL,pVL,CD4,...,L,K,M,F,P,S,T,W,Y,V
0,A0001,R09,2014-11-10,59,Male,on,20,987,987,797,...,0.006934,0.001695,0.565052,2.6e-05,0,0.000848,0.000128,0.416637,0,0.000205
1,A0010,R08,2014-11-12,59,Male,on,20,50,470,1167,...,0.005848,0.005848,0.584795,0.0,0,0.0,0.0,0.403509,0,0.0
2,A0013,R09,2014-11-10,68,Male,on,20,144,39373,771,...,0.001661,0.004983,0.282392,0.0,0,0.001661,0.0,0.209302,0,0.0
3,A0019,R12,2015-02-02,46,Female,on,20,99,164020,1130,...,0.008081,0.003367,0.526599,0.0,0,0.001347,0.000673,0.452525,0,0.0
4,A0026,R09,2015-04-30,55,Male,on,67859,100000,100000,137,...,0.008982,0.0,0.58982,0.0,0,0.002994,0.0,0.389222,0,0.0


# Functions

In [56]:
def GDS_split(df, parameter, threshold):
    mask1 = df[parameter] < threshold
    mask2 = df[parameter] >= threshold
    df1 = df[mask1]
    df2 = df[mask2]
    return df1, df2

def reformat_section(df):
    '''Must be a dataframe containing only 1 position'''
    pos = str(int(list(df.AAPos)[0]))
    AAs = ['A','R','N','D','C','Q','E','G','H','I',
           'L','K','M','F','P','S','T','W','Y','V']
    newnames_dict = {}
    for item in AAs:
        newnames_dict[item]= pos+item
    df_renamed = df.rename(columns = newnames_dict)
    df_renamed.drop(['AAPos','Coverage'], axis=1, inplace=True)
    return df_renamed

def consensusCallsDF(df):
    consensus_dict = {'Position':[], 'Calls':[], 'Richness':[]}
    for i,group in df.groupby('AAPos'):
        calls = []
        for j, sample in group.iterrows():
            if max(sample[AAs]) > 0:
                sample_call = str(sample[AAs].idxmax())
                #sample_GDS = sample['GDS']
                calls.append(sample_call)
        #print int(i), dict(Counter(calls))
        consensus_dict['Position'].append(int(i))
        consensus_dict['Calls'].append(dict(Counter(calls)))
        consensus_dict['Richness'].append(len(Counter(calls)))
    consensusDF = pd.DataFrame(consensus_dict)
    return consensusDF

# Split into groups by GDS

In [29]:
DF1, DF2 = GDS_split(DF, 'GDS', 0.5)
print DF1.shape
print DF2.shape

(1908, 57)
(1992, 57)


In [57]:
conDF = consensusCallsDF(DF)
conDF1 = consensusCallsDF(DF1)
conDF2 = consensusCallsDF(DF2)
print conDF.shape
print conDF1.shape
print conDF2.shape

(72, 3)
(72, 3)
(72, 3)


In [40]:
pd.DataFrame(zip(conDF.Position, conDF.Richness, conDF1.Richness, conDF2.Richness)).head()

Unnamed: 0,0,1,2,3
0,1,5,4,4
1,2,7,4,7
2,3,1,1,1
3,4,2,1,2
4,5,2,2,1


# Consensus "Reconstruction"

In [61]:
DF.sort_values(by=['Patient','Visit']).head()

Unnamed: 0,Patient,Visit,DateOfVisit,Age,Gender,ART,VL,iVL,pVL,CD4,...,L,K,M,F,P,S,T,W,Y,V
0,A0001,R09,2014-11-10,59,Male,on,20,987,987,797,...,0.006934,0.001695,0.565052,2.6e-05,0.0,0.000848,0.000128,0.416637,0.0,0.000205
54,A0001,R09,2014-11-10,59,Male,on,20,987,987,797,...,5.1e-05,0.084858,7.6e-05,2.5e-05,0.000102,0.412977,0.000382,0.0,0.0,0.000662
108,A0001,R09,2014-11-10,59,Male,on,20,987,987,797,...,0.001792,0.001262,2.5e-05,0.0,0.58938,0.000833,0.001413,0.0,0.0,0.000101
163,A0001,R09,2014-11-10,59,Male,on,20,987,987,797,...,0.003712,0.003119,3.9e-05,0.0,0.0,0.000908,0.0,0.000197,0.001343,0.982468
218,A0001,R09,2014-11-10,59,Male,on,20,987,987,797,...,0.000271,0.000773,3.9e-05,7.7e-05,7.7e-05,0.001391,0.000618,0.0,0.001894,0.001546


In [209]:
d = {'Patient':[], 'Visit':[], 'GDS Sequence':[]}
for i, group in DF.groupby(['Patient','Visit']):
    s = ''
    for pos in range(1,73):  
        g = group[group['AAPos']==pos][AAs]
        if g.shape[0]:
            aa = g.transpose().idxmax().as_matrix()[0]
        else:
            aa = '-'
        s+=aa
    d['Patient'].append(i[0])
    d['Visit'].append(i[1])
    d['GDS Sequence'].append(s)
reconstructionDF = pd.DataFrame(d)
print reconstructionDF.shape
reconstructionDF.head()

(55, 3)


Unnamed: 0,GDS Sequence,Patient,Visit
0,MEPVDPSLEPWKHPGSQPRTPCTACYCKKCCFHCQVCFTRKGLGIS...,A0001,R09
1,MEPVDPRLEPWEHPGSQPKTACNNCYCKGCCYHCQVCFITKGLGIS...,A0010,R08
2,IEPVNPKLKPWKHPGSQPKTACTNCYYKKCCFHCQVCFITKGLGIS...,A0013,R09
3,MEPVDPSLEPWKHPGSQPKTACNSCYCKKCCFHCQVCFTTKGLGIS...,A0019,R12
4,MEPVDPRLEPWKHPGSQPRTACNNCYCKKCCFHCQVCFTTKGLGIS...,A0026,R09


# Sanger data

In [49]:
sanger_df = pd.read_csv('/Users/greg/Desktop/TMHDSNeuroTat/all_Tat_data_df_2015_12_10.csv')
sanger_df.head()

Unnamed: 0,PatientID,VisitNum,DateOfVisit,Age,YS,Gender,Tissue,Protocol,iCD4,nCD4,...,Pos92,Pos93,Pos94,Pos95,Pos96,Pos97,Pos98,Pos99,Pos100,Pos101
0,A0001,R00,2006-09-12,51,10.767123,Male,,,611,301,...,,,,,,,,,,
1,A0001,R01,2007-08-15,52,11.690411,Male,PBMC,Genomic,611,301,...,-,-,-,-,-,-,-,-,-,-
2,A0001,R02,2008-06-04,53,12.49589,Male,PBMC,Genomic,611,301,...,-,-,-,-,-,-,-,-,-,-
3,A0001,R03,2008-11-11,53,12.934247,Male,PBMC,Genomic,611,301,...,E,R,E,T,E,T,D,P,V,D
4,A0001,R04,2009-11-10,54,13.931507,Male,PBMC,Genomic,611,301,...,E,R,E,T,E,T,D,P,F,D


# Merge Illumina and Sanger sequence data

In [128]:
seqcompare_df = pd.merge(reconstructionDF, sanger_df,
                left_on=['Patient','Visit'],
                right_on=['PatientID','VisitNum'],
                how='inner')

In [137]:
seqcompare_df.head()

Unnamed: 0,GDS Sequence,Patient,Visit,PatientID,VisitNum,DateOfVisit,Age,YS,Gender,Tissue,...,Pos92,Pos93,Pos94,Pos95,Pos96,Pos97,Pos98,Pos99,Pos100,Pos101
0,MEPVDPSLEPWKHPGSQPRTPCTACYCKKCCFHCQVCFTRKGLGIS...,A0001,R09,A0001,R09,2014-11-10,59,18.934247,Male,PBMC,...,-,-,-,-,-,-,-,-,-,-
1,MEPVDPRLEPWEHPGSQPKTACNNCYCKGCCYHCQVCFITKGLGIS...,A0010,R08,A0010,R08,2014-11-12,59,22.942466,Male,PBMC,...,-,-,-,-,-,-,-,-,-,-
2,IEPVNPKLKPWKHPGSQPKTACTNCYYKKCCFHCQVCFITKGLGIS...,A0013,R09,A0013,R09,2014-11-10,68,16.931507,Male,PBMC,...,-,-,-,-,-,-,-,-,-,-
3,MEPVDPSLEPWKHPGSQPKTACNSCYCKKCCFHCQVCFTTKGLGIS...,A0019,R12,A0019,R12,2015-02-02,46,15.161644,Female,,...,,,,,,,,,,
4,MEPVDPRLEPWKHPGSQPRTACNNCYCKKCCFHCQVCFTTKGLGIS...,A0026,R09,A0026,R09,2015-04-30,55,32.410959,Male,,...,,,,,,,,,,


# Compare Illumina and Sanger sequences for matching samples

# make dataframe

In [182]:
d = {'Patient':[], 'Visit':[], 'Illumina':[], 'Sanger':[]}
for i, group in seqcompare_df.groupby(['Patient','Visit']):
    #print (i, group.shape)
    s_sanger = ''
    for j in np.arange(1,73):
        position_col = 'Pos%02i' % j
        aa = list(group[position_col])[0]
        if isinstance(aa, float):
            if math.isnan(aa):
                aa = '-'
        s_sanger+=aa
    s_illumina = list(group['GDS Sequence'])[0]
    
    denom = 0
    num = 0
    for pair in zip(s_sanger, s_illumina):
        if '-' not in pair:
            denom+=1
            if pair[0]==pair[1]:
                num+=1
    if denom>0:
        agreement = num/denom
        d['Patient'].append(i[0])
        d['Visit'].append(i[1])
        d['Sanger'].append(s_sanger)
        d['Illumina'].append(s_illumina)
colorder = ['Patient','Visit','Sanger','Illumina']
comp_df = pd.DataFrame(d)[colorder]
comp_df.head()

Unnamed: 0,Patient,Visit,Sanger,Illumina
0,A0001,R09,MESVDPRLEPWKHPGSQPQAPCTCYYHKKCCFNCQDCLFAKALGIS...,MEPVDPSLEPWKHPGSQPRTPCTACYCKKCCFHCQVCFTRKGLGIS...
1,A0010,R08,MEPVDPRLEPWKHPGSQPKAACTTCYCKKCCFHCQVCFITKALGIS...,MEPVDPRLEPWEHPGSQPKTACNNCYCKGCCYHCQVCFITKGLGIS...
2,A0013,R09,MESVDPRLEPWKHPGSQTKAACTICYCKKCCIHCQVCYIIKALGIS...,IEPVNPKLKPWKHPGSQPKTACTNCYYKKCCFHCQVCFITKGLGIS...
3,A0034,R04,MESVDPRLEPWTHPGSQPKAACTYGYCKKCCLDCQECFSSKALGIS...,MEPVDPRLEPWKHPGSQPMTPCTNCYCKKCCFHCQVCFITKGLGIS...
4,A0044,R10,MESVDPRLEPWKHPGSQPQAACTTGYCKKCRLDCQVCLTSKALGIS...,MDPVDPKLEPWKHPGSQPRTACTKCYCKKCCFHCQVCFTTKALGIS...


# Agreement function

In [184]:
def calcSeqAgreement(seq1, seq2):
    denom = 0
    num = 0
    for pair in zip(seq1, seq2):
        if '-' not in pair:
            denom+=1
        if pair[0]==pair[1]:
            num+=1
    if denom>0:
        agreement = num/denom
    else:
        agreement = 0
    return (denom, num, agreement)

# Calculate agreement

In [210]:
sanger_seqs = list(comp_df['Sanger'])
illumina_seqs = list(comp_df['Illumina'])
z = zip(sanger_seqs, illumina_seqs)
scores = []
for pair in z:
    a = calcSeqAgreement(pair[0], pair[1])
    scores.append(a[2])
print np.mean(scores)
print np.std(scores)

0.78160233543
0.0664315716082


In [211]:
scores = []
for combo in combinations(sanger_seqs, 2):
    a = calcSeqAgreement(combo[0],combo[1])
    scores.append(a[2])
print np.mean(scores)
print np.std(scores)

0.855090036172
0.0537953455043


In [212]:
scores = []
for combo in combinations(illumina_seqs, 2):
    a = calcSeqAgreement(combo[0],combo[1])
    scores.append(a[2])
print np.mean(scores)
print np.std(scores)

0.818731013532
0.0510831265215
