# Import requirements

In [158]:
from __future__ import division
import glob
import math
import pandas as pd
import numpy as np
from itertools import combinations
from collections import Counter
from scipy import stats
import time
import matplotlib.pyplot as plt
%matplotlib inline

# Import Sanger data

In [105]:
sanger_df = pd.read_csv('/Users/greg/Desktop/TMHDSNeuroTat/all_Tat_data_df_2015_12_10.csv')
print sanger_df.shape
sanger_df.head()

(2278, 121)


Unnamed: 0,PatientID,VisitNum,DateOfVisit,Age,YS,Gender,Tissue,Protocol,iCD4,nCD4,...,Pos92,Pos93,Pos94,Pos95,Pos96,Pos97,Pos98,Pos99,Pos100,Pos101
0,A0001,R00,2006-09-12,51,10.767123,Male,,,611,301,...,,,,,,,,,,
1,A0001,R01,2007-08-15,52,11.690411,Male,PBMC,Genomic,611,301,...,-,-,-,-,-,-,-,-,-,-
2,A0001,R02,2008-06-04,53,12.49589,Male,PBMC,Genomic,611,301,...,-,-,-,-,-,-,-,-,-,-
3,A0001,R03,2008-11-11,53,12.934247,Male,PBMC,Genomic,611,301,...,E,R,E,T,E,T,D,P,V,D
4,A0001,R04,2009-11-10,54,13.931507,Male,PBMC,Genomic,611,301,...,E,R,E,T,E,T,D,P,F,D


# Filter dataframe rubbish out

In [106]:
# include only genomic sequences
m2 = sanger_df['Protocol'] == 'Genomic'
sanger_df = sanger_df[m2]
print sanger_df.shape

# include only PBMC sequences
m1 = sanger_df['Tissue'] == 'PBMC'
sanger_df = sanger_df[m1]
print sanger_df.shape

# include only sequences with a TMHD score
m3 = ~sanger_df['TMHDS'].isnull()
sanger_df = sanger_df[m3]
print sanger_df.shape

(918, 121)
(754, 121)
(591, 121)


# Diversity function

In [306]:
# calculate diversity using a dictionary object as input
def diversity_column(counts, hill):
    vals = np.array(counts.values(), dtype=np.float64)
    vals /= vals.sum()
    if hill == 1:
        return np.exp(-np.sum(vals*np.log(vals)))
    else:
        return (vals**hill).sum()**(1/(1-hill))

# Statistical test functions

In [364]:
def perform_Welch_Ttest(variant_frame1, variant_frame2, attribute):
    a1 = np.array(variant_frame1[attribute])
    a2 = np.array(variant_frame2[attribute])
    a1 = a1[~np.isnan(a1)]
    a2 = a2[~np.isnan(a2)]
    num_a1 = sum(np.isfinite(a1))
    num_a2 = sum(np.isfinite(a2))
    mean1 = np.mean(a1)
    mean2 = np.mean(a2)
    t_stat, p_val = stats.ttest_ind(a1, a2, equal_var = False)
    return t_stat, p_val, (num_a1, num_a2), (mean1, mean2)

def perform_ks_test(variant_frame1, variant_frame2, attribute):
    a1 = np.array(variant_frame1[attribute])
    a2 = np.array(variant_frame2[attribute])
    a1 = a1[~np.isnan(a1)]
    a2 = a2[~np.isnan(a2)]
    num_a1 = sum(np.isfinite(a1))
    num_a2 = sum(np.isfinite(a2))
    ks_stat, p_val = stats.ks_2samp(a1, a2)
    return ks_stat, p_val, (num_a1, num_a2)

def perform_mannwhitneyU_test(variant_frame1, variant_frame2, attribute):
    a1 = np.array(variant_frame1[attribute])
    a2 = np.array(variant_frame2[attribute])
    a1 = a1[~np.isnan(a1)]
    a2 = a2[~np.isnan(a2)]
    num_a1 = sum(np.isfinite(a1))
    num_a2 = sum(np.isfinite(a2))
    u_stat, p_val = stats.mannwhitneyu(a1, a2)
    return u_stat, p_val, (num_a1, num_a2)

#chi square test code
def perform_chi2_test(variant_frame1, variant_frame2, attribute, threshold):
    a1 = np.array(variant_frame1[attribute])
    a2 = np.array(variant_frame2[attribute])
    a1 = a1[~np.isnan(a1)]
    a2 = a2[~np.isnan(a2)]
    num_a1 = sum(np.isfinite(a1))
    num_a2 = sum(np.isfinite(a2))
    group1count1, group1count2 = sum(a1 < threshold), sum(a1 >= threshold)
    group2count1, group2count2 = sum(a2 < threshold), sum(a2 >= threshold)
    obs = np.array([[group1count1, group1count2], [group2count1, group2count2]])
    chi2, p_val, dof, expect = stats.chi2_contingency(obs, correction=False)
    return p_val, (num_a1, num_a2), (chi2, p_val, dof, expect)

# Execution of code

In [373]:
aa_list = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V']
clin_cols = ['PatientID', 'VisitNum', 'DateOfVisit', 'Age', 'YS', 'Gender',
             'Tissue', 'Protocol', 'iCD4', 'nCD4', 'CD4','iCD8', 'nCD8', 'CD8',
             'iVL', 'pVL', 'VL', 'ART', 'TMHDS', 'GDS']

position_dict = {'Position':[], 'Richness':[], 'Diversity':[], 'Variants':[]}

total_tests_performed = 0

for i in np.arange(1,102):
    position_col = 'Pos%02i' % i
    p = sanger_df[position_col]
    position_mask = p.isin(aa_list)
    position_variants = p[position_mask]
    variants_dict = Counter(position_variants)
    richness = diversity_column(variants_dict, 0)
    diversity = diversity_column(variants_dict, 1)
    cols = clin_cols + [position_col]
    position_frame = sanger_df[position_mask][cols]
    # append to position_dict
    position_dict['Position'].append(i)
    position_dict['Richness'].append(richness)
    position_dict['Diversity'].append(diversity)
    position_dict['Variants'].append(variants_dict)
    
    # now process the variants dictionary
    L1 = variants_dict.items()
    for combo in combinations(L1, 2):
        variant1, variant2 = combo[0], combo[1]
        variant_mask1 = position_frame[position_col] == variant1[0]
        variant_mask2 = position_frame[position_col] == variant2[0]
        variant_frame1 = position_frame[variant_mask1]
        variant_frame2 = position_frame[variant_mask2]

        if variant_frame1.shape[0] > 10 and variant_frame2.shape[0] > 10:            
            #t, p = perform_Welch_Ttest(variant_frame1, variant_frame2, '')
            #ks, p_Val, n = perform_ks_test(variant_frame1, variant_frame2, 'TMHDS')
            u, p_val, n = perform_mannwhitneyU_test(variant_frame1, variant_frame2, 'GDS')

            
            if min(n) > 10:
                total_tests_performed += 1
                if p_val <= 0.1:                     
                    print (position_col, p_val, ks, n,
                           variant1[0], variant_frame1.shape[0],
                           variant2[0], variant_frame2.shape[0])
                
print total_tests_performed

('Pos61', 0.04288348633568418, 0.11176470588235293, (47, 11), 'S', 394, 'R', 51)
9


In [321]:
pd.DataFrame(position_dict).head()

Unnamed: 0,Diversity,Position,Richness,Variants
0,1.0,1,1,{u'M': 432}
1,1.028752,2,2,"{u'K': 2, u'E': 451}"
2,1.547856,3,3,"{u'P': 414, u'S': 50, u'T': 10}"
3,1.208927,4,4,"{u'Q': 1, u'E': 12, u'L': 5, u'V': 461}"
4,1.0,5,1,{u'D': 481}


In [374]:
attribute = 'TMHDS'
position_col = 'Pos61'
threshold = 10

#####################################

aa_list = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V']
clin_cols = ['PatientID', 'VisitNum', 'DateOfVisit', 'Age', 'YS', 'Gender',
             'Tissue', 'Protocol', 'iCD4', 'nCD4', 'CD4','iCD8', 'nCD8', 'CD8',
             'iVL', 'pVL', 'VL', 'ART', 'TMHDS', 'GDS']

#####################################

p = sanger_df[position_col]
position_mask = p.isin(aa_list)
position_variants = p[position_mask]
variants_dict = Counter(position_variants)
richness = diversity_column(variants_dict, 0)
diversity = diversity_column(variants_dict, 1)
cols = clin_cols + [position_col]
position_frame = sanger_df[position_mask][cols]
print 'Dataframe shape:', position_frame.shape

#####################################

print variants_dict, '\n'

#####################################

L1 = variants_dict.items()
for combo in combinations(L1, 2):
    variant1, variant2 = combo[0], combo[1]
    
    variant_mask1 = position_frame[position_col] == variant1[0]
    variant_mask2 = position_frame[position_col] == variant2[0]
    variant_frame1 = position_frame[variant_mask1]
    variant_frame2 = position_frame[variant_mask2]
    if variant_frame1.shape[0] > 10 and variant_frame2.shape[0] > 10:
        print variant1, variant2
        
        #perform_chi2_test(variant_frame1, variant_frame2, 'TMHDS', 10)
        
        a1 = np.array(variant_frame1[attribute])
        a2 = np.array(variant_frame2[attribute])
        a1 = a1[~np.isnan(a1)]
        a2 = a2[~np.isnan(a2)]
        group1count1, group1count2 = sum(a1 < threshold), sum(a1 >= threshold)
        group2count1, group2count2 = sum(a2 < threshold), sum(a2 >= threshold)
        obs = np.array([[group1count1, group1count2], [group2count1, group2count2]])
        chi2, p_val, dof, expect = stats.chi2_contingency(obs, correction=False)
        print p_val, dof
        print expect
        print obs
        print ''


Dataframe shape: (481, 21)
Counter({'S': 394, 'R': 51, 'N': 19, 'C': 6, 'K': 6, 'H': 3, 'I': 1, 'Q': 1}) 

('N', 19) ('S', 394)
0.859969949933 1
[[   8.37288136   10.62711864]
 [ 173.62711864  220.37288136]]
[[  8  11]
 [174 220]]

('N', 19) ('R', 51)
0.938155510901 1
[[  8.14285714  10.85714286]
 [ 21.85714286  29.14285714]]
[[ 8 11]
 [22 29]]

('S', 394) ('R', 51)
0.889634241956 1
[[ 173.53707865  220.46292135]
 [  22.46292135   28.53707865]]
[[174 220]
 [ 22  29]]

