In [1]:
import pandas as pd
from Bio import SeqIO
from Bio.Seq import translate
from Bio.Seq import translate, Seq
import argparse
import re
from concurrent.futures import ProcessPoolExecutor, as_completed
from functools import partial
import numpy as np
import json
import logging
from logging.handlers import RotatingFileHandler
import traceback

In [2]:
# Baseline tables from the old script's output directory (tab-separated text).
old_resmarker_table = pd.read_table('real_example/old_script_output/resmarker_table.txt')
old_mhap_table = pd.read_table('real_example/old_script_output/resmarker_microhap_table.txt')
old_new_mutations = pd.read_table('real_example/old_script_output/resmarker_new_mutations.txt')

In [3]:
# Tables to compare against the baseline (tab-separated text), read from
# paths relative to the notebook's working directory.
new_resmarker_table = pd.read_table('resmarker_table_by_locus.txt')
new_collapsed_table = pd.read_table('resmarker_table.txt')
new_mhap_table = pd.read_table('resmarker_microhap_table.txt')
new_new_mutations = pd.read_table('discovery_table.txt')

# Resmarker table comparison 

In [4]:
# Column names of the old resmarker table, for comparison with the new one.
old_resmarker_table.columns

Index(['SampleID', 'Locus', 'GeneID', 'Gene', 'CodonID', 'RefCodon', 'Codon',
       'CodonStart', 'CodonRefAlt', 'RefAA', 'AA', 'AARefAlt', 'Reads'],
      dtype='object')

In [5]:
# Column names of the new by-locus resmarker table, for comparison with the old one.
new_resmarker_table.columns

Index(['SampleID', 'Locus', 'GeneID', 'Gene', 'CodonID', 'RefCodon', 'Codon',
       'CodonStart', 'CodonRefAlt', 'RefAA', 'AA', 'AARefAlt', 'Reads'],
      dtype='object')

In [6]:
# Outer-join the old and new resmarker tables. With no `on=` the join key is
# every column the two frames share; `indicator=True` adds a `_merge` column
# recording whether each row was found in both tables or only one of them.
resmarker_table_comparison = pd.merge(
    old_resmarker_table,
    new_resmarker_table,
    how='outer',
    indicator=True,
)
resmarker_table_comparison['_merge'].value_counts()

_merge
both          8745
left_only        0
right_only       0
Name: count, dtype: int64

In [7]:
# Display the merged old-vs-new resmarker comparison, including its `_merge` column.
resmarker_table_comparison

Unnamed: 0,SampleID,Locus,GeneID,Gene,CodonID,RefCodon,Codon,CodonStart,CodonRefAlt,RefAA,AA,AARefAlt,Reads,_merge
0,1K_Control_8070941526_1_S33_L001,Pf3D7_04_v3-748105-748359-1B,417200,dhfr,16,GCA,GCA,3,REF,A,A,REF,697,both
1,1K_Control_8070941526_1_S33_L001,Pf3D7_04_v3-748105-748359-1B,417200,dhfr,51,AAT,ATT,108,ALT,N,I,ALT,697,both
2,1K_Control_8070941526_1_S33_L001,Pf3D7_04_v3-748105-748359-1B,417200,dhfr,59,TGT,CGT,132,ALT,C,R,ALT,697,both
3,1K_Control_8070941526_1_S33_L001,Pf3D7_04_v3-748374-748611-1B,417200,dhfr,108,AGC,AAC,1,ALT,S,N,ALT,1175,both
4,1K_Control_8070941526_1_S33_L001,Pf3D7_04_v3-748374-748611-1B,417200,dhfr,164,ATA,ATA,169,REF,I,I,REF,1175,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8740,TS2-TC1-1k_S83_L001,Pf3D7_13_v3-1724895-1725166-1B,1343700,k13,662,TTT,TTT,122,REF,F,F,REF,519,both
8741,TS2-TC1-1k_S83_L001,Pf3D7_13_v3-1724895-1725166-1B,1343700,k13,675,GCT,GCT,161,REF,A,A,REF,519,both
8742,TS2-TC1-1k_S83_L001,Pf3D7_14_v3-2481033-2481276-1B,1460900,arps10,127,GTG,GTG,4,REF,V,V,REF,915,both
8743,TS2-TC1-1k_S83_L001,Pf3D7_14_v3-1956056-1956330-1B,1447900,mdr2,484,ACA,ACA,74,REF,T,T,REF,386,both


# Microhaplotype (mhap) table comparison

In [8]:
# Outer-join the old and new microhaplotype tables on all shared columns,
# tagging each row's origin in `_merge`.
merged_mhap = pd.merge(
    old_mhap_table,
    new_mhap_table,
    how='outer',
    indicator=True,
)

In [9]:
# Count rows present in both tables versus only the old (left) or new (right).
merged_mhap['_merge'].value_counts()

_merge
both          2353
left_only        0
right_only       0
Name: count, dtype: int64

# New mutations (discovery table) comparison

In [10]:
# Collapse rows that share the same sample/locus/gene/position/allele key by
# summing their Reads, then store Reads as an integer column.
old_new_mutations = (
    old_new_mutations
    .groupby(['SampleID', 'Locus', 'GeneID', 'Gene', 'Position', 'Alt', 'Ref'])['Reads']
    .sum()
    .reset_index()
    .astype({'Reads': int})
)

In [11]:
# Outer-join old and new mutation calls on the mutation key only. Ref and
# Reads are not part of the key, so pandas keeps both sides' values under
# its default _x/_y suffixes instead of requiring them to match.
merged = pd.merge(
    old_new_mutations,
    new_new_mutations,
    on=['SampleID', 'Locus', 'GeneID', 'Gene', 'Position', 'Alt'],
    how='outer',
    indicator=True,
)
merged['_merge'].value_counts()

_merge
both          1114
right_only     163
left_only        0
Name: count, dtype: int64

In [12]:
# Loci that have mutation calls only in the new (right-hand) table.
merged.loc[merged['_merge'] == 'right_only', 'Locus'].unique()

array(['Pf3D7_05_v3-958079-958350-1B', 'Pf3D7_13_v3-748254-748510-1B',
       'Pf3D7_14_v3-1956056-1956330-1B', 'Pf3D7_05_v3-958372-958608-1B',
       'Pf3D7_05_v3-961511-961758-1B'], dtype=object)

In [13]:
# Old-script mutation calls for one sample at one of the right-only loci.
old_new_mutations.loc[
    old_new_mutations['SampleID'].eq('1K_Control_8070941526_1_S33_L001')
    & old_new_mutations['Locus'].eq('Pf3D7_05_v3-958079-958350-1B')
]

Unnamed: 0,SampleID,Locus,GeneID,Gene,Position,Alt,Ref,Reads


In [14]:
# New-script mutation calls for the same sample and locus, for side-by-side comparison.
new_new_mutations.loc[
    new_new_mutations['SampleID'].eq('1K_Control_8070941526_1_S33_L001')
    & new_new_mutations['Locus'].eq('Pf3D7_05_v3-958079-958350-1B')
]

Unnamed: 0,SampleID,Locus,GeneID,Gene,Position,Alt,Ref,Reads
3,1K_Control_8070941526_1_S33_L001,Pf3D7_05_v3-958079-958350-1B,523000,mdr1,38,T,A,539
4,1K_Control_8070941526_1_S33_L001,Pf3D7_05_v3-958079-958350-1B,523000,mdr1,39,T,A,519


In [15]:
# Load the allele table (tab-separated) and show the rows for the same sample
# and locus, so the mutation calls can be checked against the PseudoCIGAR strings.
allele_data = pd.read_table('real_example/allele_data.txt')
allele_data.loc[
    allele_data['SampleID'].eq('1K_Control_8070941526_1_S33_L001')
    & allele_data['Locus'].eq('Pf3D7_05_v3-958079-958350-1B')
]

Unnamed: 0,SampleID,Locus,ASV,Reads,Allele,PseudoCIGAR
8421,1K_Control_8070941526_1_S33_L001,Pf3D7_05_v3-958079-958350-1B,TTTTTATATCTGTGTTTGGTGTAATATTAAAGAACATGTTTTTAGG...,519,Pf3D7_05_v3-958079-958350-1B.2,39T40T186+8N
8422,1K_Control_8070941526_1_S33_L001,Pf3D7_05_v3-958079-958350-1B,TTTTTATATCTGTGTTTGGTGTAATATTAAAGAACATGTATTTAGG...,20,Pf3D7_05_v3-958079-958350-1B.3,39T186+8N


In [16]:
# Scratch check of a read-count ratio. 697 matches the Reads value on the three
# dhfr codon rows at Pf3D7_04_v3-748105-748359-1B in the resmarker table.
# NOTE(review): presumably testing whether 2091 is those 697 reads counted
# three times (once per codon row) — confirm where 2091 came from.
2091/697

3.0

In [703]:
# Collapse rows that share the same sample/locus/gene/position/allele key by
# summing their Reads, then store Reads as an integer column.
# NOTE(review): this repeats the aggregation from cell In [10] and carries an
# out-of-sequence execution count (703); it looks like a leftover from an
# earlier session — confirm whether it is still needed.
old_new_mutations = (
    old_new_mutations
    .groupby(['SampleID', 'Locus', 'GeneID', 'Gene', 'Position', 'Alt', 'Ref'])['Reads']
    .sum()
    .reset_index()
    .astype({'Reads': int})
)

In [704]:
# Old-script mutation calls for one sample at the dhfr locus, to inspect the
# summed Reads values.
old_new_mutations.loc[
    old_new_mutations['SampleID'].eq('1K_Control_8070941526_1_S33_L001')
    & old_new_mutations['Locus'].eq('Pf3D7_04_v3-748105-748359-1B')
]

Unnamed: 0,SampleID,Locus,GeneID,Gene,Position,Alt,Ref,Reads
0,1K_Control_8070941526_1_S33_L001,Pf3D7_04_v3-748105-748359-1B,417200,dhfr,109,T,A,1394
1,1K_Control_8070941526_1_S33_L001,Pf3D7_04_v3-748105-748359-1B,417200,dhfr,132,C,T,1394


In [705]:
# Outer-join old and new mutation calls with no `on=`, so the join key is every
# shared column — including Ref and Reads — and rows only match when all agree.
# NOTE(review): this reassigns `merged` (also set by the keyed merge in cell
# In [11]) and has an out-of-sequence execution count (705); it looks like a
# leftover from an earlier session — confirm whether it is still needed.
merged = pd.merge(
    old_new_mutations,
    new_new_mutations,
    how='outer',
    indicator=True,
)
merged['_merge'].value_counts()

_merge
right_only    1256
left_only     1093
both            21
Name: count, dtype: int64