In [1]:
import primer3
from Bio import SeqIO
from Bio.Seq import Seq
import re
from matplotlib import pyplot as plt
from typing import Dict, List
root_dir="/home/lshas17/"
import pandas as pd
from itertools import product
from data_classes import BlastResult, PrimerPair, Primer

primer3 is a package in primer3-py - a python wrapper for primer3 engine

https://libnano.github.io/primer3-py

https://github.com/primer3-org/primer3

In [2]:
ref_seq=""
for seq in SeqIO.parse(f'{root_dir}/HandyAmpliconTool/test_data/inputs/GCA_000195995.1_for_VCFs.fasta', "fasta"):
    if seq.id=="AL513382_143N_pHCM1_120N_pHCM2":
        ref_seq=seq

In [3]:
existing_primers=pd.read_csv(f'{root_dir}/HandyAmpliconTool/test_data/inputs/primer_Tms.tsv', sep="\t")
existing_primers_txt=""
for index in existing_primers.index:
    existing_primers_txt=existing_primers_txt+f'>{existing_primers.loc[index, "Primer name"]}_L'+"\n"+f'{existing_primers.loc[index, "Forward Sequence"]}'+"\n"
    existing_primers_txt=existing_primers_txt+f'>{existing_primers.loc[index, "Primer name"]}_R'+"\n"+f'{existing_primers.loc[index, "Reverse Sequence"]}'+"\n"

In [6]:

vcf_file_name=f'{root_dir}/HandyAmpliconTool/test_data/outputs/snps.vcf'
header_rows=0
for line in open(vcf_file_name):
    if line[0]=="#":
        header_rows+=1
vcf_data=pd.read_csv(vcf_file_name, sep="\t", header=0, skiprows=header_rows-1) # -1 because header row also starts with #CHROM



In [8]:
def add_fixed_primer(for_seq: str, rev_seq:str, template: str):
    global_args={
        'PRIMER_OPT_SIZE': 20,
        'PRIMER_OPT_TM': 60.0,
        'PRIMER_MIN_TM': 50.0,
        'PRIMER_MAX_TM': 70.0,
        'PRIMER_PRODUCT_SIZE_RANGE': f'100-{len(template)}'}
    if for_seq!=None:
        global_args['SEQUENCE_PRIMER']=for_seq
    if rev_seq!=None:
        global_args['SEQUENCE_PRIMER_REVCOMP']=rev_seq
    return global_args

def process_p3_output(output: Dict, id_prefix: str) -> List[PrimerPair]:
    primer_pairs=len(output["PRIMER_PAIR"])
    result: List[PrimerPair]=[]
    for i in range(0,primer_pairs):
        forward=Primer(seq=output[f'PRIMER_LEFT_{str(i)}_SEQUENCE'],
                       g_c=output[f'PRIMER_LEFT_{str(i)}_GC_PERCENT'],
                       t_m=output[f'PRIMER_LEFT_{str(i)}_TM'])
        reverse=Primer(seq=output[f'PRIMER_RIGHT_{str(i)}_SEQUENCE'],
                       g_c=output[f'PRIMER_RIGHT_{str(i)}_GC_PERCENT'],
                       t_m=output[f'PRIMER_RIGHT_{str(i)}_TM'])
        new_pair=PrimerPair(name=id_prefix, forward=forward, reverse=reverse)
        new_pair.penalty=output[f'PRIMER_PAIR_{str(i)}_PENALTY']
        result.append(new_pair)
        #output[f'PRIMER_PAIR_{str(i)}_PRODUCT_SIZE']
    print(f'LEFT: {output["PRIMER_LEFT_EXPLAIN"]}')
    print(f'RIGHT: {output["PRIMER_RIGHT_EXPLAIN"]}')
    print(f'PAIR: {output["PRIMER_PAIR_EXPLAIN"]}')
    print("\n")
    return result

In [8]:
target_lineage="4.3.1.2"
interval_len=1500
target_lineage_index=vcf_data.index[ [f for f in vcf_data.index if vcf_data.loc[f, "ID"].find(target_lineage)>-1]  ]

new_primer_pairs: List[PrimerPair] = []

for index in target_lineage_index:
    snp_position=vcf_data.loc[index, "POS"]
    valid_range=range(snp_position-interval_len, snp_position+interval_len)
    interval_indices=[ f for f in vcf_data.index if vcf_data.loc[f, "POS"] in valid_range ]

    interval_left_serovar_snps=[f for f in interval_indices if vcf_data.loc[f,"ID"].find("Serovar")>-1 and f<index ]
    interval_right_serovar_snps=[f for f in interval_indices if vcf_data.loc[f,"ID"].find("Serovar")>-1 and f>index]
    if len(interval_left_serovar_snps)>0 and len(interval_right_serovar_snps)>0:
        print(f'SNP at {snp_position} has serovar SNPs left and right of genotype SNP')
    elif len(interval_right_serovar_snps)>0:
        print(f'SNP at {snp_position} has serovar SNPs right of genotype SNP')
    elif len(interval_left_serovar_snps):
        print(f'SNP at {snp_position} has serovar SNPs left of genotype SNP')
    else:
        print(f'SNP at {snp_position} has no serovar SNPs within {str(interval_len)} region')


#four three cases: 
# 1 - left and right serovar SNPs anchored primers
# 2 - left serovar anchored primer
# 3 - right serovar anchored primer
# 4 - neither side has serovar primer, ignore this kind of SNP

    for left_snp_index, right_snp_index in product(interval_left_serovar_snps, interval_right_serovar_snps):
        print("fixed")
        #case 1
        right_snp_pos=vcf_data.loc[right_snp_index,"POS"]
        left_snp_pos=vcf_data.loc[left_snp_index,"POS"]
        if right_snp_pos-left_snp_pos>interval_len: #distance between SNPs is too long:
            print(f'SNPs too far apart: {left_snp_pos} to {right_snp_pos} vs max len {str(interval_len)}')
            continue
        forward=ref_seq[left_snp_pos-1:left_snp_pos+20]
        reverse=ref_seq[right_snp_pos-20:right_snp_pos]
        seq_args={"SEQUENCE_ID":f'{target_lineage}_{str(snp_position)}', 
                "SEQUENCE_TEMPLATE": str(ref_seq[left_snp_pos-1:right_snp_pos].seq),
                "SEQUENCE_INCLUDED_REGION": [0, right_snp_pos-left_snp_pos+1 ]}
        #add "fixed" to name to show that this pair has both sides fixed
        global_args=add_fixed_primer(for_seq=str(forward.seq),rev_seq=str(reverse.seq.reverse_complement()), template=seq_args["SEQUENCE_TEMPLATE"])
        primers=primer3.bindings.design_primers(seq_args=seq_args, global_args=global_args)
        new_primer_pairs += process_p3_output(primers, id_prefix=f'{target_lineage}_{snp_position}_both_fixed')

    for right_snp_index in interval_right_serovar_snps:
        print("right")
        right_snp_pos=vcf_data.loc[right_snp_index,"POS"]
        reverse=ref_seq[right_snp_pos-20:right_snp_pos]
        seq_args={"SEQUENCE_ID":f'{target_lineage}_{str(snp_position)}', 
                "SEQUENCE_TEMPLATE": str(ref_seq[right_snp_pos-interval_len:right_snp_pos].seq),
                "SEQUENCE_INCLUDED_REGION": [0, interval_len ]}
        global_args=add_fixed_primer(for_seq=None,rev_seq=str(reverse.seq.reverse_complement()),  template=seq_args["SEQUENCE_TEMPLATE"])
        primers=primer3.bindings.design_primers(seq_args=seq_args, global_args=global_args)
        new_primer_pairs += process_p3_output(primers, id_prefix=f'{target_lineage}_{snp_position}_right_fixed')

    for left_snp_index in interval_left_serovar_snps:
        print("left")
        left_snp_pos=vcf_data.loc[left_snp_index,"POS"]
        forward=ref_seq[left_snp_pos-1:left_snp_pos+20] #-1 due to VCF being 1 indexed
        seq_args={"SEQUENCE_ID":f'{target_lineage}_{str(snp_position)}', 
                "SEQUENCE_TEMPLATE": str(ref_seq[left_snp_pos-20:left_snp_pos+interval_len].seq),
                "SEQUENCE_INCLUDED_REGION": [0, interval_len ]}
        global_args=add_fixed_primer(for_seq=str(forward.seq), rev_seq=None, template=seq_args["SEQUENCE_TEMPLATE"])
        primers=primer3.bindings.design_primers(seq_args=seq_args, global_args=global_args)
        new_primer_pairs += process_p3_output(primers, id_prefix=f'{target_lineage}_{snp_position}_both_fixed')



SNP at 3694947 has serovar SNPs left of genotype SNP
left
LEFT: considered 1, ok 1
RIGHT: considered 13283, GC content failed 1017, low tm 3697, high tm 2385, high hairpin stability 2, long poly-x seq 108, ok 6074
PAIR: considered 5, ok 5


left
LEFT: considered 1, ok 1
RIGHT: considered 13283, GC content failed 1017, low tm 3697, high tm 2385, high hairpin stability 2, long poly-x seq 108, ok 6074
PAIR: considered 5, ok 5


left
LEFT: considered 1, ok 1
RIGHT: considered 13283, GC content failed 1017, low tm 3697, high tm 2385, high hairpin stability 2, long poly-x seq 108, ok 6074
PAIR: considered 5, ok 5


SNP at 4379937 has serovar SNPs right of genotype SNP
right
LEFT: considered 13704, GC content failed 14, low tm 2082, high tm 4061, long poly-x seq 44, ok 7503
RIGHT: considered 1, high tm 1, ok 0
PAIR: considered 0, ok 0




In [9]:
interval_len=1000
new_primer_pairs: List[PrimerPair] = []
rev_straight="TGAGTCCGGTAAAACGAGCTC"
rev_seq=Seq(rev_straight[-20:]).reverse_complement()
#rev_straight=rev_straight[-20:]
for_straight="TGCAAGCTGCTTAGTGATCGA"
#for_straight=for_straight[0:20]
rev_pos=str(ref_seq.seq).find(str(rev_seq))+len(rev_straight)
for_pos=str(ref_seq.seq).find(for_straight)

seq_args={"SEQUENCE_ID":f'parC', 
        "SEQUENCE_TEMPLATE": str(ref_seq[for_pos-1:rev_pos+200].seq),
        "SEQUENCE_INCLUDED_REGION": [0, rev_pos-for_pos+200 ]}

global_args=add_fixed_primer(for_seq=for_straight[0:20],rev_seq=str(rev_seq.reverse_complement()),  template=seq_args["SEQUENCE_TEMPLATE"])
primers=primer3.bindings.design_primers(seq_args=seq_args, global_args=global_args)
new_primer_pairs += process_p3_output(primers, id_prefix=f'parC_left_fixed')
print(primers["PRIMER_PAIR_0_PRODUCT_SIZE"])

LEFT: considered 1, ok 1
RIGHT: considered 1, ok 1
PAIR: considered 1, ok 1


841


In [10]:
primers

{'PRIMER_LEFT_EXPLAIN': 'considered 1, ok 1',
 'PRIMER_RIGHT_EXPLAIN': 'considered 1, ok 1',
 'PRIMER_PAIR_EXPLAIN': 'considered 1, ok 1',
 'PRIMER_LEFT_NUM_RETURNED': 1,
 'PRIMER_RIGHT_NUM_RETURNED': 1,
 'PRIMER_INTERNAL_NUM_RETURNED': 0,
 'PRIMER_PAIR_NUM_RETURNED': 1,
 'PRIMER_PAIR': [{'PENALTY': 2.989514473345082,
   'COMPL_ANY_TH': 0.0,
   'COMPL_END_TH': 0.0,
   'PRODUCT_SIZE': 841,
   'PRODUCT_TM': 85.24780601518887}],
 'PRIMER_LEFT': [{'PENALTY': 1.3627677455973526,
   'SEQUENCE': 'TGCAAGCTGCTTAGTGATCG',
   'COORDS': [1, 20],
   'TM': 58.63723225440265,
   'GC_PERCENT': 50.0,
   'SELF_ANY_TH': 15.509353920223248,
   'SELF_END_TH': 1.8412187446441521,
   'HAIRPIN_TH': 45.30619035986496,
   'END_STABILITY': 3.69}],
 'PRIMER_RIGHT': [{'PENALTY': 1.6267467277477294,
   'SEQUENCE': 'GAGTCCGGTAAAACGAGCTC',
   'COORDS': [841, 20],
   'TM': 58.37325327225227,
   'GC_PERCENT': 55.0,
   'SELF_ANY_TH': 2.7267450862772193,
   'SELF_END_TH': 2.7267450862772193,
   'HAIRPIN_TH': 35.471835414

In [84]:
global_args

{'PRIMER_OPT_SIZE': 20,
 'PRIMER_OPT_TM': 60.0,
 'PRIMER_MIN_TM': 50.0,
 'PRIMER_MAX_TM': 70.0,
 'PRIMER_PRODUCT_SIZE_RANGE': '100-1037',
 'SEQUENCE_PRIMER': 'TGGCACAATCACTAAACGCG',
 'SEQUENCE_PRIMER_REVCOMP': 'TCAATCTGCATGCTGACCGT',
 'SEQUENCE_ID': 'parC',
 'SEQUENCE_TEMPLATE': 'GTGGCACAATCACTAAACGCGTCGGGTTTTCGTGATCCGATTCATCGCGCAGGTCGTCCACCATCGGCAGTTTTTTATTGCGCATCTGCGCAGCAATCTGCTCCAGCACTTTTGCGCCAGATACCTGATGCGGCAACGCGGTGATCACCACGGCGCCGTCTTCTTTGGTCCATACCGCGCGCATACGCACGGAGCCACGCCCATTTTCGTAAATTTTACGAATTTCCGCACGAGGGATAATGATCTCCGCTTCGGTCGGGTAATCCGGCCCCTGGACGATATCCAGCAACTGATCCAGCGTCGTTTTCGGCTGCTCAATCAGCGTAATCGCCGCTTTCGCGACTTCGCGCAGGTTATGCGGCGGAATATCTGTTGCCATGCCCACCGCAATACCGGTGGTGCCGTTCAGCAGGATGTTCGGCAGACGCGCCGGTAACATTTTCGGTTCCTGCATGGTGCCGTCGAAGTTTGGCACCCAGTCCGCAGTCCCCTGACCGAGTTCGCTTAACAGCAGCTCGGCGTATTTGGACAGGCGGGATTCGGTATAACGCATCGCCGCGAATGACTTCGGATCATCCGGCGCGCCCCAATTCCCCTGGCCATCGACCAGCGGGTAACGGTAAGAGAACGGCTGCGCCATCAGCACCATGGCTTCATAGCAGGCGCTGTCGCCATGCGGGTGATACTTACCCAGTACGTCGCCAACGGTACGGGCGGAT

In [67]:
primers

{'PRIMER_LEFT_EXPLAIN': 'considered 1, ok 1',
 'PRIMER_RIGHT_EXPLAIN': 'considered 1, ok 1',
 'PRIMER_PAIR_EXPLAIN': 'considered 1, ok 1',
 'PRIMER_LEFT_NUM_RETURNED': 1,
 'PRIMER_RIGHT_NUM_RETURNED': 1,
 'PRIMER_INTERNAL_NUM_RETURNED': 0,
 'PRIMER_PAIR_NUM_RETURNED': 1,
 'PRIMER_PAIR': [{'PENALTY': 7.154039854460677,
   'COMPL_ANY_TH': 12.145096897148676,
   'COMPL_END_TH': 12.145096897148676,
   'PRODUCT_SIZE': 218,
   'PRODUCT_TM': 90.34477991039881}],
 'PRIMER_LEFT': [{'PENALTY': 4.010441453302121,
   'SEQUENCE': 'AGTCCCCTGACCGAGTTCGC',
   'COORDS': [1, 20],
   'TM': 64.01044145330212,
   'GC_PERCENT': 65.0,
   'SELF_ANY_TH': 0.0,
   'SELF_END_TH': 0.0,
   'HAIRPIN_TH': 42.81102110239806,
   'END_STABILITY': 4.7}],
 'PRIMER_RIGHT': [{'PENALTY': 3.143598401158556,
   'SEQUENCE': 'GCGACGTACTGGGTAAGTAT',
   'COORDS': [218, 20],
   'TM': 56.856401598841444,
   'GC_PERCENT': 50.0,
   'SELF_ANY_TH': 0.0,
   'SELF_END_TH': 0.0,
   'HAIRPIN_TH': 43.30209902415834,
   'END_STABILITY': 2.12}

In [6]:
import scipy
print(scipy.stats.binom_test(4,11,p=1/8))
print(scipy.stats.binom_test(1,8,p=4/11))
print(scipy.stats.ttest_ind( [0]*4+[1]*7, [0]*1+[1]*7) ) 

0.03896881360560656
0.27239329076363294
TtestResult(statistic=-1.144942580591663, pvalue=0.26809708157763323, df=17.0)


  print(scipy.stats.binom_test(4,11,p=1/8))
  print(scipy.stats.binom_test(1,8,p=4/11))


In [20]:
from data_classes import BlastResult

#check primers vs reference with existing primers
pairs_to_take=10

with open(f'{root_dir}/HandyAmpliconTool/test_data/inputs/temp.fasta', "w") as temp_fasta:
    temp_fasta.write(existing_primers_txt)
    #prioritise the pairs with both ends on Serovar SNPs
    serovar_both_sides=sorted( [f for f in new_primer_pairs if f.name.find("both_fixed")>-1], key=lambda x: x.penalty)
    min_penalty_pairs=sorted(new_primer_pairs, key=lambda x: x.penalty)[0: min(len(new_primer_pairs), pairs_to_take)] + \
                        serovar_both_sides[0: min(len(new_primer_pairs), pairs_to_take)]
    for pair in min_penalty_pairs:
        temp_fasta.write(f'>{pair.uuid}_L'+"\n")
        temp_fasta.write(pair.forward.seq +"\n")
        temp_fasta.write(f'>{pair.uuid}_R'+"\n")
        temp_fasta.write(pair.reverse.seq+"\n")
    
sensitive_search=!/home/lshas17/utilities/quickblast.sh /home/lshas17//HandyAmpliconTool/test_data/inputs/temp.fasta ~/HandyAmpliconTool/test_data/inputs/GCA_000195995.1_for_VCFs.fasta 5
blast_results: List[BlastResult]=[]
for i, item in enumerate(sensitive_search):
    values=item.split("\t")
    new_result=BlastResult()
    new_result.qseqid=values[0]
    new_result.qstart=int(values[1])
    new_result.qend=int(values[2])
    new_result.sseqid=values[3]
    new_result.sstart=int(values[4])
    new_result.send=int(values[5])
    new_result.pident=float(values[6])
    new_result.evalue=float(values[7])
    new_result.qseq=values[8]
    blast_results.append(new_result)

for i, result in enumerate(blast_results):
    print([f.penalty for f in min_penalty_pairs if f.forward.seq==result.qseq or f.reverse.seq==result.qseq ])   
    for j, result_inner in enumerate(blast_results[i+1:]):
        if abs(result_inner.sstart-result.sstart)<interval_len and result.qseqid[-1]!=result_inner.qseqid[-1]:
            if result.qseqid[0:-2]==result_inner.qseqid[0:-2]:
                print(result.value)
                print(result_inner.value)
                print("Fine, same pair")                
            print("\n")
            for value in [result, result_inner]:
                for primer in [f.forward for f in min_penalty_pairs if f.forward.seq==value.qseq]:
                    primer.ref_start=min( [value.sstart, value.send] )-1 #converting from blast output (1 based), to BED (0 indexed)

                for primer in [f.reverse for f in min_penalty_pairs if f.reverse.seq==value.qseq]:
                    primer.ref_start=min( [value.sstart, value.send] )-1 #converting from blast output (1 based), to BED (0 indexed)
            

[]
[]
[]
[]
[]
[]
[]
[]
[]
3.3.1_3.3_L 1 21 AL513382_143N_pHCM1_120N_pHCM2 1113715 1113735
3.3.1_3.3_R 1 21 AL513382_143N_pHCM1_120N_pHCM2 1114526 1114506
Fine, same pair


[]
[]
4.3.1.1_L 1 21 AL513382_143N_pHCM1_120N_pHCM2 1192763 1192783
4.3.1.1_R 1 20 AL513382_143N_pHCM1_120N_pHCM2 1193464 1193445
Fine, same pair


[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
tviD_L 1 22 AL513382_143N_pHCM1_120N_pHCM2 4520323 4520344
tviD_R 1 21 AL513382_143N_pHCM1_120N_pHCM2 4521046 4521026
Fine, same pair


[]
[]
[]
[]
2.3.2_L 1 21 AL513382_143N_pHCM1_120N_pHCM2 1717827 1717847
2.3.2_R 1 20 AL513382_143N_pHCM1_120N_pHCM2 1718537 1718518
Fine, same pair


[]
[]
[]
[]
acrB_v2_L 1 22 AL513382_143N_pHCM1_120N_pHCM2 522598 522619
acrB_v2_R 1 21 AL513382_143N_pHCM1_120N_pHCM2 523494 523474
Fine, same pair


[]
[]
[]
[]
[]
[0.5512316707744276, 0.5534699146223829, 0.5569654140309126, 0.6253644260896749, 0.6257255372350414]
20ad5ac3-636c-4b43-8159-596ba5e7ff9b_L 1 20 AL513382_143N_pHCM1_120N_pHCM2 3195806 3195825
20ad5a

In [21]:
with open(f'{root_dir}HandyAmpliconTool/test_data/outputs/new_primer.tsv', "w") as output:
    for pair in min_penalty_pairs:
        output.write(pair.value+"\n")
        print(pair.penalty)


0.5512316707744276
0.5534699146223829
0.5569654140309126
0.6253644260896749
0.6257255372350414


In [60]:
import pandas as pd
from collections import Counter

df=pd.read_csv(f'{root_dir}/HandyAmpliconTool/test_data/inputs/TGC_data.csv', sep=",", index_col=0)
Counter(df.index.duplicated())[True]
df.index[df.index.duplicated()].values

array([], dtype=object)

In [4]:
import unittest

class TestStringMethods(unittest.TestCase):

    def test_upper(self):
        self.assertEqual('foo'.upper(), 'FOO')

    def test_isupper(self):
        self.assertTrue('FOO'.isupper())
        self.assertFalse('Foo'.isupper())

    def test_split(self):
        s = 'hello world'
        self.assertEqual(s.split(), ['hello', 'world'])
        # check that s.split fails when the separator is not a string
        with self.assertRaises(TypeError):
            s.split(2)

if __name__ == '__main__':
    unittest.main()

usage: ipykernel_launcher.py [-h] [-v] [-q] [--locals] [-f] [-c] [-b]
                             [-k TESTNAMEPATTERNS]
                             [tests ...]
ipykernel_launcher.py: error: argument -f/--failfast: ignored explicit argument '/home/ubuntu/.local/share/jupyter/runtime/kernel-v2-3183398xxBOUeJFtVO.json'


AttributeError: 'tuple' object has no attribute 'tb_frame'