# Analysis of the libraries compared to the clone sequence

Here we analyse the libraries with respect to the clone sequence in which mutations will be introduced.
We print the positions where a variant that differs from the clone rises in frequency at every successive time point, from the first to the last.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
from array import array
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import linregress
%matplotlib inline

# Obtaining the sequence annotation

In [2]:
# Collect, for every "mat_peptide" feature of the GenBank file, its start
# coordinate, its end coordinate and the raw text found after the "=" on the
# line that immediately follows the feature line.
begins, ends, names = [], [], []
expecting_name = False
with open("sequence.gb") as genbank:
    for line in genbank:
        if "mat_peptide" in line:
            # Second whitespace-separated field is expected to look like "108..473"
            span = line.split()[1].split("..")
            begins.append(int(span[0]))
            ends.append(int(span[1]))
            expecting_name = True
        elif expecting_name:
            # Kept verbatim: the value still carries its quotes and line ending
            names.append(line.split("=")[1])
            expecting_name = False

print(begins)
print(ends)
print(names)

[108, 474, 753, 978, 2490, 3546, 4224, 4614, 6465, 6846, 6915, 7668]
[473, 752, 977, 2489, 3545, 4223, 4613, 6464, 6845, 6914, 7667, 10376]
['"capsid"\n', '"propeptide"\n', '"membrane"\n', '"envelope"\n', '"NS1"\n', '"NS2A"\n', '"NS2B"\n', '"NS3"\n', '"NS4A"\n', '"2K"\n', '"NS4B"\n', '"NS5"\n']


# Obtaining the clone sequence

In [3]:
# Read the clone sequence: every line that does not contain ">" is stripped
# of surrounding whitespace and appended to one continuous string.
file = "cloneSequence/SP6-ZIKV_seq_only.txt"
with open(file) as fasta:
    clone = "".join(line.strip() for line in fasta if ">" not in line)

# Functions to plot interesting positions and gene boundaries

In [4]:
# Positions of interest that are highlighted on the plots
positions = [316, 1670, 1785, 2340, 5935, 7172, 8449, 9165]

def plot_positions():
    """Draw a thin dotted vertical line at each entry of the module-level ``positions`` list."""
    for site in positions:
        plt.axvline(site, linewidth=1, linestyle=':')
        
def plot_genes():
    """Mark gene boundaries and write short gene labels near the top of the plot.

    Uses the module-level ``begins``, ``ends`` and ``names`` lists. A short
    vertical tick (y from 0.99 to 1.0) is drawn at the start of each gene and
    at the end of the last one. Each gene is labelled with the first three
    characters of its name, placed one tenth of the way into the gene; labels
    alternate between two heights so that neighbouring labels do not overlap.
    """
    for i, begin in enumerate(begins):
        plt.plot([begin, begin], [0.99, 1.0], linewidth=2, linestyle='-', color="black")
        # Names are stored with their quotes and trailing newline; strip both
        # BEFORE slicing so that a short name such as "2K" does not keep its
        # newline and turn into a two-line label.
        label = names[i].replace('"', '').strip()[0:3]
        label_height = 1.005 if i % 2 == 0 else 1.015
        plt.text(begin + ((ends[i] - begin) / 10), label_height, label, size='xx-small')
    # Closing tick at the end of the last gene
    plt.plot([ends[-1], ends[-1]], [0.99, 1.0], linewidth=2, linestyle='-', color="black")

# Functions to detect variants that differ from the clone and increase in frequency
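Here we take the clone base as the reference state at each position. The candidate variant is the major variant at the last time point or, when that major variant is the clone base, the second variant at the last time point. We keep the positions where the frequency of this candidate strictly increases from each time point to the next.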

In [5]:
def is_increasing(minor_frequencies):
    """Return True when the sequence is strictly increasing, False otherwise.

    Each value must be strictly greater than the one before it; equal
    neighbours, a decrease, or a comparison that is not true (for example one
    involving NaN) all give False. A single-element sequence gives True.
    """
    last_seen = minor_frequencies[0]
    for current in minor_frequencies[1:]:
        if not (last_seen < current):
            return False
        last_seen = current
    return True

def get_variant_frequency(variant, table, i):
    """Return the share of the quality-corrected counts at row ``i`` held by ``variant``.

    The denominator is the sum of the A, C, G, T and N quality-corrected
    columns of ``table`` at label ``i``. ``variant`` must be one of "A", "C",
    "G" or "T"; anything else yields ``np.nan``.
    """
    a_count = table['As_quality_corrected'][i]
    c_count = table['Cs_quality_corrected'][i]
    g_count = table['Gs_quality_corrected'][i]
    t_count = table['Ts_quality_corrected'][i]
    n_count = table['Ns_quality_corrected'][i]
    total = a_count + c_count + g_count + t_count + n_count
    # Checked in the same order as the bases are listed
    for base, count in (("A", a_count), ("C", c_count), ("G", g_count), ("T", t_count)):
        if variant == base:
            return count / total
    return np.nan
        

def get_increasing_variants(tables, clone):
    """Find positions where a non-clone variant strictly rises across ``tables``.

    ``tables`` is an ordered list of per-library tables and ``clone`` is the
    clone sequence. For every value of the "Position" column of the first
    table, the clone base is compared with the "Major_variant" of the last
    table: if they match, the candidate is the last table's "Second_variant",
    otherwise it is the last table's "Major_variant". The candidate is kept
    when its frequency is strictly increasing over all tables.

    Returns a dict mapping position to
    ``[clone base, candidate, clone-base frequencies, candidate frequencies]``,
    the last two being lists with one float per table.

    NOTE(review): the Position value is used both as a row label in each table
    and as an index into ``clone`` -- confirm both use the same numbering.
    """
    first_table = tables[0]
    last_table = tables[len(tables) - 1]
    rising = dict()
    for pos in first_table["Position"]:
        clone_base = clone[pos]
        # array('d') coerces every stored value to a plain float
        clone_freqs = array('d')
        candidate_freqs = array('d')
        clone_freqs.append(get_variant_frequency(clone_base, first_table, pos))
        last_major = last_table["Major_variant"][pos]
        if clone_base == last_major:
            candidate = last_table["Second_variant"][pos]
        else:
            candidate = last_table["Major_variant"][pos]
        candidate_freqs.append(get_variant_frequency(candidate, first_table, pos))
        for later_table in tables[1:]:
            clone_freqs.append(get_variant_frequency(clone_base, later_table, pos))
            candidate_freqs.append(get_variant_frequency(candidate, later_table, pos))
        if is_increasing(candidate_freqs):
            rising[pos] = [clone_base, candidate, clone_freqs.tolist(), candidate_freqs.tolist()]
    return rising

def print_variants(dict_variants):
    """Print a tab-separated table of the variants held in ``dict_variants``.

    Each value is expected to be laid out as
    ``[clone base, variant, clone-base frequencies, variant frequencies]``;
    one row is printed per key, ending with the last variant frequency.
    """
    print("Position\tclone base\tincreasing variant\tFinal frequency")
    for position, record in dict_variants.items():
        final_frequency = record[3][-1]
        print("\t".join([str(position), record[0], record[1], str(final_frequency)]))

# Reading all data

In [6]:
# CirSeq initial sample.
# Cells that contain " -nan" (with a leading space) are read as missing values.
cirseq = pd.read_csv(
    "HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1CirseqD3_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan",
)

In [7]:
# Control runs, replicate A.
# Cells that contain " -nan" (with a leading space) are read as missing values.
DD3_A = pd.read_csv(
    "HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD3A_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan",
)
DD6_A = pd.read_csv(
    "HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6A_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan",
)
DD9_A = pd.read_csv(
    "HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9A_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan",
)
DD12_A = pd.read_csv(
    "HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD12A_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan",
)
DD24_A = pd.read_csv(
    "HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD24A_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan",
)
DD51_A = pd.read_csv(
    "HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD51A_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan",
)
DD51_A_no_reamp = pd.read_csv(
    "HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD51Anoreamplification_1_sequence.txt.assembled.fastq_mapped_AA.csv",
    na_values=" -nan",
)

In [8]:
# Control runs, replicate D.
# na_values=" -nan" makes cells that contain " -nan" (with a leading space)
# load as missing values, as is done for the other libraries of this run;
# without it such a cell would turn its whole column into text.
DD3_D = pd.read_csv("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD3D_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
DD6_D = pd.read_csv("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6D_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
DD9_D = pd.read_csv("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9D_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
DD12_D = pd.read_csv("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD12D_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
DD24_D = pd.read_csv("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD24D_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")

In [9]:
# Control runs, replicate E.
# na_values=" -nan" makes cells that contain " -nan" (with a leading space)
# load as missing values, as is done for the other libraries of this run;
# without it such a cell would turn its whole column into text.
DD6_E = pd.read_csv("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD6E_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
DD9_E = pd.read_csv("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1DD9E_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")


In [10]:
# TLR3 activation runs, replicate A.
# na_values=" -nan" makes cells that contain " -nan" (with a leading space)
# load as missing values, as is done for the other libraries of this run;
# without it such a cell would turn its whole column into text.
TD9_A = pd.read_csv("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD9A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
TD12_A = pd.read_csv("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD12A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
TD24_A = pd.read_csv("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD24A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")
TD51_A = pd.read_csv("HV5GLBCXY_ZIKV_17s006139-1-1_DREUX_lane1TD51A_1_sequence.txt.assembled.fastq_mapped_AA.csv", na_values=" -nan")

In [11]:
#DD3_A.describe(include='all')

# Positions that increase in frequency

### Control, replicate A

In [12]:
# Control replicate A libraries, in the order in which they are compared
tables_A = [DD3_A, DD6_A, DD9_A, DD12_A, DD24_A, DD51_A]
increasing_A = get_increasing_variants(tables_A, clone)
print("There are ", len(increasing_A), " positions that rise in frequency.", sep="")
print("Those are:")
print_variants(increasing_A)

  from ipykernel import kernelapp as app


There are 92 positions that rise in frequency.
Those are:
Position	clone base	increasing variant	Final frequency
53	T	C	0.0014687147643982735
55	G	T	0.007534085284725918
138	T	C	0.001922840821068939
165	G	T	0.006671351295565951
173	C	T	0.003006519784370717
316	T	C	0.9257185433962264
332	G	T	0.0065362685648320765
357	T	C	0.002416925215029425
491	G	T	0.006092446714592004
824	T	C	0.0035788258135997075
901	T	C	0.0016021462336337357
1124	G	T	0.005141357751182855
1471	G	T	0.00665038967995197
1500	G	T	0.006259729268570128
1552	G	T	0.00582413787633224
1670	G	A	0.9422872506633152
1883	G	T	0.005708083729311799
1951	A	T	0.0010857796411378254
2039	G	T	0.006254807609563537
2101	G	T	0.004561467672983693
2102	C	T	0.0036324873615132206
2235	G	T	0.00525068769414528
2340	C	T	0.9517702913964549
2541	G	T	0.006747314640043678
2702	G	T	0.005630456156895833
2792	C	T	0.003261293410844263
2804	G	T	0.004986371013878872
2811	T	C	0.0015264351528296522
3087	A	C	0.0017289634956359449
3118	G	T	0.006253053863947506
3



### Control, replicate D

In [13]:
# Control replicate D libraries, in the order in which they are compared
tables_D = [DD3_D, DD6_D, DD9_D, DD12_D, DD24_D]
increasing_D = get_increasing_variants(tables_D, clone)
print("There are ", len(increasing_D), " positions that rise in frequency.", sep="")
print("Those are:")
print_variants(increasing_D)

  from ipykernel import kernelapp as app


There are 318 positions that rise in frequency.
Those are:
Position	clone base	increasing variant	Final frequency
40	T	G	0.0019018145805032665
138	T	G	0.0017308345476746512
164	G	T	0.0028747315274068423
176	G	T	0.003746452972916831
188	G	T	0.032248881943154374
207	C	T	0.005254698946753452
243	T	G	0.0025127258345428154
254	A	C	0.0012103183990453327
263	T	C	0.0014386049439938202
295	G	T	0.006107744558710584
298	T	C	0.001168757010650079
433	T	G	0.0029516184598376957
466	T	C	0.0024415496100519923
531	G	T	0.005232150497286363
533	G	A	0.08506827845455946
535	G	T	0.0047124831467124624
548	C	A	0.00493994243614136
570	A	C	0.0023983467340025367
599	C	A	0.0038547210141864495
664	C	A	0.003402638792391993
703	G	T	0.0051156132164690785
731	G	T	0.004592770088631973
740	A	C	0.0015919764628898891
796	G	T	0.003995424416014257
860	T	G	0.0018815363575994026
957	T	G	0.00163585699634353
997	C	A	0.004407071762377561
1001	A	G	0.0013013758799981594
1110	T	G	0.0018698757108824436
1118	A	C	0.002007284680139468
1

### Control, replicate E

In [14]:
# Control replicate E libraries: only two are available, so a position is
# reported as soon as the candidate frequency is higher in the second one.
tables_E = [DD6_E, DD9_E]
increasing_E = get_increasing_variants(tables_E, clone)
print("There are ", len(increasing_E), " positions that rise in frequency.", sep="")
print("There are too many of them, we choose not to print them.")

  from ipykernel import kernelapp as app


There are 6381 positions that rise in frequency.
There are too many of them, we choose not to print them.


### TLR3 activation, replicate A

In [15]:
# TLR3 activation replicate A libraries, in the order in which they are compared
tables_TA = [TD9_A, TD12_A, TD24_A, TD51_A]
increasing_TA = get_increasing_variants(tables_TA, clone)
print("There are ", len(increasing_TA), " positions that rise in frequency.", sep="")
print("Those are:")
print_variants(increasing_TA)

  from ipykernel import kernelapp as app


There are 241 positions that rise in frequency.
Those are:
Position	clone base	increasing variant	Final frequency
84	T	G	0.0027281173987600575
137	T	G	0.002666240806892203
146	G	T	0.005028061570557455
158	A	C	0.0017398343722252853
168	T	G	0.002136607688947138
226	T	G	0.003627324416051108
233	C	A	0.00577179962442612
269	A	C	0.0023387078857982193
285	A	C	0.0008862968959656499
305	A	C	0.0023289246747486697
322	T	G	0.9927920378671307
415	A	G	0.0017496143646926541
461	A	T	0.0019677944312796207
486	G	T	0.00390509248634693
487	A	C	0.001792997585087918
520	A	C	0.0013100014923567136
580	A	C	0.0020633351206112817
628	A	C	0.0016085417864187095
629	T	G	0.002002077761463745
647	G	T	0.006202388375634518
649	G	T	0.006710822568105718
719	C	A	0.00516901130372524
752	G	T	0.0038576888604680906
832	G	T	0.004925429231517509
856	G	T	0.004877761081021332
877	C	A	0.003958647604943897
916	C	A	0.004438554339703213
917	T	C	0.0014235287145781706
919	A	C	0.0024663131570563874
941	T	G	0.0025523109200046175
958	G	T	