In [31]:
import argparse
import logging
import os
import re
import subprocess
import sys

import numpy as np
import plotly.graph_objects as go
import pysam
from tqdm import tqdm

In [12]:
# Root logger: INFO and above, each record prefixed with timestamp,
# logger name and level. `logger` is the notebook-wide logger.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


`work_dir` is the working directory: it holds the input BAM files, and all generated SAM and edit-distance files are written to it.

In [51]:
# Run configuration. Every path below is derived from `work_dir`, so pointing
# the notebook at another data set only requires changing `work_dir`.
threads = 60      # passed to `samtools view -@` by bam_to_sam
percentile = 90   # embedded in the "above<percentile>" file names
work_dir = "../Saccharomyces_cerevisiae/work/"

# NGMLR alignments above the percentile cut-off.
ngmlr_above_bam = os.path.join(work_dir, f"ngmlr_above{percentile}.bam")
ngmlr_above_sam = os.path.join(work_dir, f"ngmlr_above{percentile}.sam")
ngmlr_above_edit_distance = os.path.join(work_dir, f"ngmlr_above{percentile}_distance.txt")

# minimap2 alignments above the percentile cut-off.
minimap2_above_bam = os.path.join(work_dir, f"minimap2_above{percentile}.bam")
minimap2_above_sam = os.path.join(work_dir, f"minimap2_above{percentile}.sam")
minimap2_above_edit_distance = os.path.join(work_dir, f"minimap2_above{percentile}_distance.txt")

# NOTE(review): no cell visible in this notebook writes this file; presumably
# it is produced elsewhere -- confirm before running from a fresh checkout.
minimap2_full_edit_distance = os.path.join(work_dir, "minimap2_full_distance.txt")

# Final alignment set. Built from `work_dir` like the paths above (these were
# previously hard-coded copies of the directory name).
final_bam = os.path.join(work_dir, "final.bam")
final_sam = os.path.join(work_dir, "final.sam")
final_edit_distance = os.path.join(work_dir, "final_edit_distance.txt")

# False: distances are written as NM / query_alignment_length and binned in
# steps of 0.005. True: raw NM values, binned in steps of 100.
raw_edit_distance = False

In [28]:
def get_distance_from_file(distance_file):
    """Load the edit distances stored in a distance file.

    Each non-blank line is expected to look like ``NM:i:<value>`` (the format
    written by ``generate_distance_file``); the third ``:``-separated field is
    parsed as a float.

    Args:
        distance_file: Path of the text file to read.

    Returns:
        list[float]: One distance per non-blank line, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than three ``:``-separated
            fields.
        ValueError: If the third field cannot be parsed as a float.
    """
    distance_list = []
    with open(distance_file, "r") as distance_f:
        # Iterate the file directly rather than materialising every line first.
        for line in distance_f:
            # strip() rather than line[:-1]: a last line without a trailing
            # newline must not lose its final digit.
            record = line.strip()
            if not record:
                continue  # tolerate blank lines instead of raising IndexError
            distance_list.append(float(record.split(":")[2]))
    return distance_list

In [46]:
def _bin_distances(distances, bin_width):
    """Histogram ``distances`` into bins of ``bin_width``; return (labels, counts) ordered by bin."""
    counts_by_bin = {}
    for value in distances:
        # Lower edge of the bin that holds ``value`` (int() truncates).
        bin_start = int(value / bin_width) * bin_width
        # Every value counts, including the first one that lands in a bin.
        counts_by_bin[bin_start] = counts_by_bin.get(bin_start, 0) + 1
    ordered_bins = sorted(counts_by_bin)
    labels = [str(bin_start) for bin_start in ordered_bins]
    counts = [counts_by_bin[bin_start] for bin_start in ordered_bins]
    return labels, counts


def edit_distance_compare(ngmlr_distance_file, minimap2_distance_file, raw_edit_distance):
    """Plot the binned edit-distance distributions of two distance files.

    Both files are read with ``get_distance_from_file``, binned, and drawn as
    a bar trace plus a line trace each. The figure is displayed with
    ``fig.show()``.

    Args:
        ngmlr_distance_file: Distance file drawn under the trace name 'NGMLR'.
        minimap2_distance_file: Distance file drawn under the trace name
            'minimap2'.
        raw_edit_distance: If true, use bins of width 100 and the title
            "Raw edit distance distribution"; otherwise bins of width 0.005
            and the title "Normalized edit distance distribution".

    Returns:
        None.
    """
    raw_bin_width = 100
    normalized_bin_width = 0.005

    fig = go.Figure()

    ngmlr_distance = get_distance_from_file(ngmlr_distance_file)
    minimap2_distance = get_distance_from_file(minimap2_distance_file)

    if raw_edit_distance:
        bin_width = raw_bin_width
        title = "Raw edit distance distribution"
    else:
        bin_width = normalized_bin_width
        title = "Normalized edit distance distribution"

    fig.update_layout(
        title=title,
        xaxis_title="edit distances",
        yaxis_title="Numbers",
        )

    # Trace order: minimap2 bar, minimap2 line, NGMLR bar, NGMLR line.
    histograms = (
        ("minimap2", _bin_distances(minimap2_distance, bin_width)),
        ("NGMLR", _bin_distances(ngmlr_distance, bin_width)),
    )
    for trace_name, (labels, counts) in histograms:
        fig.add_trace(go.Bar(x=labels, y=counts, name=trace_name))
        fig.add_trace(go.Scatter(x=labels, y=counts, name=trace_name))

    fig.show()

In [13]:
def bam_to_sam(input_bam, output_sam):
    """Convert a BAM file to SAM by running ``samtools view -h``.

    The value given to ``-@`` is the module-level ``threads`` setting. The
    command line is logged at INFO level before it runs; a non-zero exit
    status is logged at ERROR level.

    Args:
        input_bam: Path of the BAM file to read.
        output_sam: Path handed to ``samtools view -o``.

    Returns:
        None.

    Raises:
        OSError: If the ``samtools`` executable cannot be started.
    """
    transform_cmd = [
        "samtools", "view", "-h", "-@", str(threads),
        str(input_bam), "-o", str(output_sam),
    ]
    logger.info(" ".join(transform_cmd))
    # Argument list, no shell: paths containing spaces or shell
    # metacharacters reach samtools unchanged.
    result = subprocess.run(transform_cmd, check=False)
    if result.returncode != 0:
        # The exit status used to be discarded; surface it without raising.
        logger.error("samtools view exited with status %d", result.returncode)

In [9]:
def generate_distance_file(input_sam, output_txt, raw_edit_distance):
    """Write one ``NM:i:<value>`` line per edit distance found in a SAM file.

    Args:
        input_sam: Path of the SAM file to read.
        output_txt: Path of the text file to create or overwrite.
        raw_edit_distance: If true, every ``NM:i:<digits>`` occurrence in the
            SAM text is copied to the output, one per line. Otherwise, for
            each alignment that carries an NM tag, NM divided by the read's
            ``query_alignment_length`` is written.

    Returns:
        ``output_txt``, unchanged.

    Raises:
        OSError: If ``input_sam`` cannot be opened or ``output_txt`` cannot be
            written.
    """
    if raw_edit_distance:
        # Pure-Python scan instead of a shell `grep`: no quoting problems with
        # unusual paths and no dependency on an external tool.
        nm_pattern = re.compile(r"NM:i:[0-9]+")
        with open(input_sam, "r", errors="replace") as sam_f, \
                open(output_txt, "w") as out_f:
            for line in sam_f:
                for nm_tag in nm_pattern.findall(line):
                    out_f.write(f"{nm_tag}\n")
    else:
        # Context managers close both files even if iteration fails.
        with pysam.AlignmentFile(input_sam, "r") as samfile, \
                open(output_txt, "w") as out_f:
            for read in tqdm(samfile.fetch()):
                if not read.has_tag("NM"):
                    continue
                aligned_length = read.query_alignment_length
                if not aligned_length:
                    # Nothing to normalise by; skipping avoids a
                    # ZeroDivisionError.
                    continue
                normalized_edit_distance = int(read.get_tag("NM")) / aligned_length
                out_f.write(f"NM:i:{normalized_edit_distance}\n")
    return output_txt

In [15]:
# Convert the NGMLR above-percentile BAM to SAM.
bam_to_sam(input_bam=ngmlr_above_bam, output_sam=ngmlr_above_sam)

2020-10-18 02:17:38,907 - __main__ - INFO - samtools view -h -@ 60 ../Saccharomyces_cerevisiae/work/ngmlr_above90.bam -o ../Saccharomyces_cerevisiae/work/ngmlr_above90.sam


In [19]:
# Convert the minimap2 above-percentile BAM to SAM.
bam_to_sam(input_bam=minimap2_above_bam, output_sam=minimap2_above_sam)

2020-10-18 02:21:06,088 - __main__ - INFO - samtools view -h -@ 60 ../Saccharomyces_cerevisiae/work/minimap2_above90.bam -o ../Saccharomyces_cerevisiae/work/minimap2_above90.sam


In [17]:
# Write the NGMLR distance file; the returned path is the cell output.
generate_distance_file(
    input_sam=ngmlr_above_sam,
    output_txt=ngmlr_above_edit_distance,
    raw_edit_distance=raw_edit_distance,
)

5987it [00:00, 47811.92it/s]


'../Saccharomyces_cerevisiae/work/ngmlr_above90_distance.txt'

In [20]:
# Write the minimap2 distance file; the returned path is the cell output.
generate_distance_file(
    input_sam=minimap2_above_sam,
    output_txt=minimap2_above_edit_distance,
    raw_edit_distance=raw_edit_distance,
)

6584it [00:00, 46628.16it/s]


'../Saccharomyces_cerevisiae/work/minimap2_above90_distance.txt'

In [47]:
# Compare NGMLR and minimap2 distances for the above-percentile alignments.
edit_distance_compare(
    ngmlr_distance_file=ngmlr_above_edit_distance,
    minimap2_distance_file=minimap2_above_edit_distance,
    raw_edit_distance=raw_edit_distance,
)

In [49]:
# Convert the final BAM to SAM.
bam_to_sam(input_bam=final_bam, output_sam=final_sam)

2020-10-18 02:38:53,114 - __main__ - INFO - samtools view -h -@ 60 ../Saccharomyces_cerevisiae/work/final.bam -o ../Saccharomyces_cerevisiae/work/final.sam


In [50]:
# Write the distance file for the final alignments; the returned path is the
# cell output.
generate_distance_file(
    input_sam=final_sam,
    output_txt=final_edit_distance,
    raw_edit_distance=raw_edit_distance,
)

61196it [00:01, 41150.31it/s]


'../Saccharomyces_cerevisiae/work/final_edit_distance.txt'

In [52]:
# Compare the final alignments against the full minimap2 distances.
# NOTE(review): the final.bam distances go in the first argument, so they are
# drawn under the trace name 'NGMLR' -- confirm that label is intended.
# NOTE(review): no cell visible here writes minimap2_full_edit_distance;
# presumably it already exists on disk -- confirm.
edit_distance_compare(
    ngmlr_distance_file=final_edit_distance,
    minimap2_distance_file=minimap2_full_edit_distance,
    raw_edit_distance=raw_edit_distance,
)