## Import libraries

In [1]:
#importing things
import pandas as pd
import numpy as np
import csv
import pylab as pl
import matplotlib.pyplot as plt
import copy
import scipy
from scipy.stats import linregress
from scipy.stats import t
from scipy.stats import sem
from scipy.stats import chi2
from scipy.stats import ttest_ind_from_stats
from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import ttest_ind_from_stats
from collections import OrderedDict
import seaborn as sns

## fetch the data

In [2]:
#load the per-sample cell counts (read_csv already returns a DataFrame) and display them
cells = pd.read_csv('ch65i_cell_counts.csv', sep=',')
cells

Unnamed: 0,CH65_i_MI_neg_a,CH65_i_MI_pos_a,CH65_i_MI_neg_b,CH65_i_MI_pos_b,CH65_i_MI_neg_c,CH65_i_MI_pos_c,CH65_i_SI_G189D_neg_a,CH65_i_SI_G189D_pos_a,CH65_i_SI_G189D_neg_b,CH65_i_SI_G189D_pos_b,CH65_i_SI_G189D_neg_c,CH65_i_SI_G189D_pos_c
0,2942707,67100,2931639,61508,2916019,40503,2513389,451945,2454688,486049,2372675,553796


In [3]:
#read in the per-genotype read counts; 'geno' is forced to str so the leading
#zeros of the binary genotype strings (e.g. '0000000000000001') are preserved
counts = pd.read_csv('count_table.tsv', sep='\t', dtype={'geno': str})
counts

Unnamed: 0,geno,CH65_i_MI_neg_a,CH65_i_MI_pos_a,CH65_i_MI_neg_b,CH65_i_MI_pos_b,CH65_i_MI_neg_c,CH65_i_MI_pos_c,CH65_i_SI_G189D_neg_a,CH65_i_SI_G189D_pos_a,CH65_i_SI_G189D_neg_b,CH65_i_SI_G189D_pos_b,CH65_i_SI_G189D_neg_c,CH65_i_SI_G189D_pos_c,CH65_i_unsorted
0,0000000000000000,697,4,547,4,256,12,65,2,146,0,91,1,63
1,0000000000000001,575,2,438,3,232,13,71,0,113,1,106,1,45
2,0000000000000010,650,7,563,2,374,6,76,0,124,1,97,0,69
3,0000000000000011,742,6,562,18,337,19,72,1,136,0,116,2,62
4,0000000000000100,653,6,507,10,479,14,153,1,112,0,89,5,94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65531,1111111111111011,393,3,279,19,257,24,4,48,8,66,2,78,42
65532,1111111111111100,357,4,232,3,198,10,9,34,7,51,7,29,19
65533,1111111111111101,327,7,240,2,155,5,9,35,3,40,3,39,33
65534,1111111111111110,377,7,281,5,315,11,31,36,13,40,8,74,61


## Calculate fraction bound for each replicate and antigen

In [4]:
#for each variant infer number of cells in each sorted bin: the variant's share
#of that bin's reads, scaled by the number of cells counted for that bin
for antigen in ('MI', 'SI_G189D'):
    for gate in ('neg', 'pos'):
        for rep in ('a', 'b', 'c'):
            sample = f'CH65_i_{antigen}_{gate}_{rep}'
            read_share = counts[sample] / counts[sample].sum()
            counts[f'{sample}_norm'] = read_share * cells[sample].mean()

#for each antigen and replicate compute the percent bound:
#100 * pos / (pos + neg), using the inferred cell numbers from above
#NOTE: a variant with zero reads in both gates gives 0/0 = NaN here
for label, antigen in (('MI', 'MI'), ('SI', 'SI_G189D')):
    for rep in ('a', 'b', 'c'):
        bound = counts[f'CH65_i_{antigen}_pos_{rep}_norm']
        unbound = counts[f'CH65_i_{antigen}_neg_{rep}_norm']
        counts[f'{label}{rep}_frac'] = 100 * bound / (bound + unbound)

#for each antigen compute the mean and sem across the three replicates
mi_fracs = counts[['MIa_frac', 'MIb_frac', 'MIc_frac']]
si_fracs = counts[['SIa_frac', 'SIb_frac', 'SIc_frac']]
counts['MI_frac_mean'] = mi_fracs.mean(axis=1)
counts['SI_frac_mean'] = si_fracs.mean(axis=1)
counts['MI_frac_sem'] = mi_fracs.sem(axis=1)
counts['SI_frac_sem'] = si_fracs.sem(axis=1)

counts

Unnamed: 0,geno,CH65_i_MI_neg_a,CH65_i_MI_pos_a,CH65_i_MI_neg_b,CH65_i_MI_pos_b,CH65_i_MI_neg_c,CH65_i_MI_pos_c,CH65_i_SI_G189D_neg_a,CH65_i_SI_G189D_pos_a,CH65_i_SI_G189D_neg_b,...,MIa_frac,MIb_frac,MIc_frac,SIa_frac,SIb_frac,SIc_frac,MI_frac_mean,SI_frac_mean,MI_frac_sem,SI_frac_sem
0,0000000000000000,697,4,547,4,256,12,65,2,146,...,1.023281,0.674447,1.765755,3.969210,0.000000,0.952529,1.154494,1.640579,0.321792,1.196344
1,0000000000000001,575,2,438,3,232,13,71,0,113,...,0.622707,0.631986,2.103529,0.000000,1.165460,0.818840,1.119408,0.661434,0.492068,0.345522
2,0000000000000010,650,7,563,2,374,6,76,0,124,...,1.903155,0.328780,0.611424,0.000000,1.063172,0.000000,0.947786,0.354391,0.484603,0.354391
3,0000000000000011,742,6,562,18,337,19,72,1,136,...,1.435823,2.888166,2.116219,1.831540,0.000000,1.486429,2.146736,1.105990,0.419533,0.561897
4,0000000000000100,653,6,507,10,479,14,153,1,112,...,1.628331,1.798555,1.108353,0.870340,0.000000,4.686107,1.511746,1.852149,0.207596,1.439081
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65531,1111111111111011,393,3,279,19,257,24,4,48,8,...,1.356537,5.947489,3.457197,94.158791,91.661888,97.153458,3.587074,94.324712,1.326884,1.587449
65532,1111111111111100,357,4,232,3,198,10,9,34,7,...,1.978552,1.186488,1.899899,83.538375,90.661374,78.381029,1.688313,84.193592,0.251937,3.560136
65533,1111111111111101,327,7,240,2,155,5,9,35,3,...,3.713226,0.767865,1.221871,83.933139,94.671402,91.920354,1.900987,90.174965,0.915549,3.220371
65534,1111111111111110,377,7,281,5,315,11,31,36,13,...,3.236696,1.625401,1.321391,60.937122,80.392154,89.004967,2.061163,76.778081,0.594282,8.301549


## functions for annotating dataframe

In [5]:
#define categories based on mutation position
def _site_is_mutated(variant, site):
    """Return 1 if the 1-based ``site`` of ``variant`` is the character "1", else 0.

    Parameters
    ----------
    variant : str
        Genotype string with one character per site, e.g. "0000000000000101".
    site : int
        1-based position to inspect.

    Raises
    ------
    IndexError
        If ``variant`` has fewer than ``site`` characters.
    """
    # Index the string directly; there is no need to copy it into a list first.
    return int(variant[site - 1] == "1")


# The sixteen public names are kept as thin wrappers so existing callers
# (e.g. ``Series.apply(find_pos_3)``) keep working unchanged.
def find_pos_1(variant):
    """Return 1 if site 1 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 1)


def find_pos_2(variant):
    """Return 1 if site 2 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 2)


def find_pos_3(variant):
    """Return 1 if site 3 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 3)


def find_pos_4(variant):
    """Return 1 if site 4 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 4)


def find_pos_5(variant):
    """Return 1 if site 5 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 5)


def find_pos_6(variant):
    """Return 1 if site 6 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 6)


def find_pos_7(variant):
    """Return 1 if site 7 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 7)


def find_pos_8(variant):
    """Return 1 if site 8 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 8)


def find_pos_9(variant):
    """Return 1 if site 9 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 9)


def find_pos_10(variant):
    """Return 1 if site 10 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 10)


def find_pos_11(variant):
    """Return 1 if site 11 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 11)


def find_pos_12(variant):
    """Return 1 if site 12 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 12)


def find_pos_13(variant):
    """Return 1 if site 13 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 13)


def find_pos_14(variant):
    """Return 1 if site 14 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 14)


def find_pos_15(variant):
    """Return 1 if site 15 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 15)


def find_pos_16(variant):
    """Return 1 if site 16 of ``variant`` is "1", else 0."""
    return _site_is_mutated(variant, 16)
    
#count the mutations in a genotype string by summing its digit characters
def sum_digits(digit):
    """Return the sum of the decimal digit characters in ``digit``.

    For a binary genotype string such as "0101" this equals the number of "1"
    characters. Characters that are not decimal digits are ignored; an empty
    string gives 0.
    """
    # str.isdecimal() only accepts characters that int() can convert, whereas
    # str.isdigit() also accepts e.g. superscript digits, on which int() raises
    # ValueError.
    return sum(int(ch) for ch in digit if ch.isdecimal())

## annotate the dataframe and output the data

In [6]:
#flag, for each of the 16 sites, whether the genotype carries a "1" at that position
site_finders = (
    find_pos_1, find_pos_2, find_pos_3, find_pos_4,
    find_pos_5, find_pos_6, find_pos_7, find_pos_8,
    find_pos_9, find_pos_10, find_pos_11, find_pos_12,
    find_pos_13, find_pos_14, find_pos_15, find_pos_16,
)
for site, finder in enumerate(site_finders, start=1):
    counts[f'pos{site}'] = counts['geno'].apply(finder)

#add a column with the number of mutations, then write the annotated table to disk
counts['som_mut'] = counts['geno'].apply(sum_digits)
counts.to_csv('20220725_fracbound.csv', index=False)