In [None]:
import os
import sys
import subprocess
import argparse
import logging
import plotly.graph_objects as go
import pysam
from tqdm import tqdm
import plotly.offline as pyo
import numpy as np
pyo.init_notebook_mode()


`work_dir` is the path to the working directory.

In [None]:
# Load the configuration file from the user's home directory, one list entry
# per line; each entry keeps its trailing newline.
with open(os.path.expanduser("~/.vulcan_config"), "r") as config_f:
    parameters = list(config_f)

In [None]:
# Echo the raw config lines so the cell output shows exactly what was loaded.
parameters

In [None]:
# The config file stores one value per line, in the fixed order read below.
# Numeric settings go through int(), which tolerates the trailing newline;
# path-like settings have their newline removed explicitly.

# Numeric settings.
threads = int(parameters[0])
percentile = int(parameters[1])

# Working directory.
work_dir = parameters[2].rstrip("\n")

# NGMLR outputs.
ngmlr_above_bam = parameters[3].rstrip("\n")
ngmlr_above_sam = parameters[4].rstrip("\n")
ngmlr_above_edit_distance = parameters[5].rstrip("\n")

# minimap2 outputs.
minimap2_above_bam = parameters[6].rstrip("\n")
minimap2_above_sam = parameters[7].rstrip("\n")
minimap2_above_edit_distance = parameters[8].rstrip("\n")
minimap2_full_bam = parameters[9].rstrip("\n")
minimap2_full_edit_distance = parameters[10].rstrip("\n")

# Final outputs.
final_bam = parameters[11].rstrip("\n")
final_sam = parameters[12].rstrip("\n")
final_edit_distance = parameters[13].rstrip("\n")

# Stored as 0/1 in the file; True selects raw (un-normalized) edit distances.
raw_edit_distance = bool(int(parameters[14]))

In [None]:
def get_distance_from_file(distance_file):
    """Read edit distances from a text file of ``NM:i:<value>`` lines.

    Args:
        distance_file: Path of a text file with one colon-separated record per
            line; the third field is parsed as the distance.

    Returns:
        A list of floats, one per non-blank line, in file order.
    """
    distance_list = []
    with open(distance_file, "r") as distance_f:
        for line in distance_f:
            record = line.strip()
            # Skip blank lines instead of failing on the missing third field.
            if not record:
                continue
            # strip() removes the newline only when it exists, so a final line
            # without one keeps its last digit.
            distance_list.append(float(record.split(":")[2]))
    return distance_list

In [None]:
def _count_per_bin(distances, bin_width):
    """Return ``{bin_index: count}`` where ``bin_index = int(d / bin_width)``."""
    counts = {}
    for dist in distances:
        bin_index = int(dist / bin_width)
        # The first value in a bin must count as 1, not 0.
        counts[bin_index] = counts.get(bin_index, 0) + 1
    return counts


def _bin_label(bin_index, bin_width):
    """Return the lower edge of a bin as the string used on the x axis."""
    # Rounding keeps binary floating-point noise out of the labels; integer
    # bin edges pass through unchanged.
    return str(round(bin_index * bin_width, 6))


def edit_distance_compare(ngmlr_distance_file, minimap2_distance_file, raw_edit_distance, hybrid):
    """Plot the binned edit-distance distributions of two distance files.

    Both files are read with ``get_distance_from_file`` and histogrammed into
    fixed-width bins (width 100 for raw distances, 0.005 for normalized ones).
    Each distribution is drawn as a bar trace plus a line trace, and the
    figure is displayed with ``fig.show()``.

    Args:
        ngmlr_distance_file: Path of the distance file for the NGMLR trace.
        minimap2_distance_file: Path of the distance file for the minimap2 trace.
        raw_edit_distance: If truthy, use the raw-distance bin width and title;
            otherwise the normalized ones.
        hybrid: If truthy, the NGMLR traces are named ``hybrid_NGMLR`` instead
            of ``NGMLR``.

    Returns:
        None.
    """
    if raw_edit_distance:
        bin_width, title = 100, "Raw edit distance distribution"
    else:
        bin_width, title = 0.005, "Normalized edit distance distribution"

    ngmlr_counts = _count_per_bin(get_distance_from_file(ngmlr_distance_file), bin_width)
    minimap2_counts = _count_per_bin(get_distance_from_file(minimap2_distance_file), bin_width)

    fig = go.Figure()
    fig.update_layout(
        title=title,
        xaxis_title="edit distances",
        yaxis_title="Numbers",
    )

    # The x values are strings, so the axis is categorical. Without an explicit
    # order, bins that only occur in the second distribution would be appended
    # after all bins of the first one instead of appearing in numeric order.
    all_bins = sorted(set(minimap2_counts) | set(ngmlr_counts))
    fig.update_xaxes(
        categoryorder="array",
        categoryarray=[_bin_label(b, bin_width) for b in all_bins],
    )

    ngmlr_name = "hybrid_NGMLR" if hybrid else "NGMLR"
    for trace_name, counts in (("minimap2", minimap2_counts), (ngmlr_name, ngmlr_counts)):
        bins = sorted(counts)
        labels = [_bin_label(b, bin_width) for b in bins]
        heights = [counts[b] for b in bins]
        fig.add_trace(go.Bar(x=labels, y=heights, name=trace_name))
        fig.add_trace(go.Scatter(x=labels, y=heights, name=trace_name))

    fig.show()

In [None]:
def bam_to_sam(input_bam, output_sam):
    """Convert a BAM file to SAM (header included) with ``samtools view``.

    The module-level ``threads`` value is passed to samtools' ``-@`` option.

    Args:
        input_bam: Path of the BAM file to read.
        output_sam: Path of the SAM file to write.

    Raises:
        subprocess.CalledProcessError: If samtools exits with a non-zero
            status, so a failed conversion is noticed at this step.
    """
    # An argument list bypasses the shell, so paths containing spaces or shell
    # metacharacters reach samtools unchanged.
    transform_cmd = [
        "samtools", "view", "-h",
        "-@", str(threads),
        input_bam,
        "-o", output_sam,
    ]
    subprocess.run(transform_cmd, check=True)

In [None]:
def generate_distance_file(input_sam, output_txt, raw_edit_distance):
    """Write the edit distances found in a SAM file to a text file.

    Every output line has the form ``NM:i:<value>``.

    Args:
        input_sam: Path of the SAM file to read.
        output_txt: Path of the text file to create or overwrite.
        raw_edit_distance: If truthy, every ``NM:i:<integer>`` token in the SAM
            file is copied verbatim using ``grep``. Otherwise each record's NM
            value is divided by its ``query_alignment_length`` before being
            written; records without an NM tag, or with an aligned length of
            zero, are skipped.

    Returns:
        ``output_txt``, unchanged.

    Raises:
        subprocess.CalledProcessError: In raw mode, if ``grep`` reports an
            error (exit status greater than 1).
    """
    if raw_edit_distance:
        # Argument list plus explicit stdout redirection: no shell is involved,
        # so paths with spaces or shell metacharacters are handled correctly.
        with open(output_txt, "w") as out_f:
            grep = subprocess.run(
                ["grep", "-o", "-E", "NM:i:[0-9]+", "--", input_sam],
                stdout=out_f,
            )
        # grep exits with 1 when nothing matched (not an error) and with a
        # higher status on real failures such as an unreadable input file.
        if grep.returncode > 1:
            raise subprocess.CalledProcessError(grep.returncode, grep.args)
        return output_txt

    # The context manager closes the alignment file even if iteration fails.
    with pysam.AlignmentFile(input_sam, "r") as samfile, open(output_txt, "w") as out_f:
        for read in tqdm(samfile.fetch()):
            if not read.has_tag("NM"):
                continue
            aligned_length = read.query_alignment_length
            # Guard against a ZeroDivisionError on records with no aligned bases.
            if not aligned_length:
                continue
            normalized_edit_distance = float(int(read.get_tag("NM"))) / aligned_length
            out_f.write(f"NM:i:{normalized_edit_distance}\n")
    return output_txt

In [None]:
# Convert the BAM at ngmlr_above_bam into a SAM file at ngmlr_above_sam.
bam_to_sam(ngmlr_above_bam, ngmlr_above_sam)

In [None]:
# Convert the BAM at minimap2_above_bam into a SAM file at minimap2_above_sam.
bam_to_sam(minimap2_above_bam, minimap2_above_sam)

In [None]:
# Extract the edit distances of the minimap2 SAM into its distance file.
# NOTE(review): no matching call for ngmlr_above_sam is visible in this
# notebook, yet the comparison cell reads ngmlr_above_edit_distance — confirm
# that file is produced elsewhere.
generate_distance_file(minimap2_above_sam, minimap2_above_edit_distance, raw_edit_distance)

In [None]:
# Plot both distributions; hybrid=False names the NGMLR traces "NGMLR".
edit_distance_compare(ngmlr_above_edit_distance, minimap2_above_edit_distance, raw_edit_distance, hybrid=False)