In [None]:
import re
import itertools
from pathlib import Path
from typing import Optional
from numpy.random import RandomState
import plotly.express as px

from hmmer import HMMER, read_domtbl, read_domtbl
import gff_io
import hmmer_reader
import pandas as pd
from iseq_prof import pfam
import iseq_prof_analysis as analysis
from fasta_reader import read_fasta
import sam_io
from dna_features_viewer import GraphicFeature, GraphicRecord

In [None]:
# random = RandomState(183)
# meta_filepath = Path("/Users/horta/db/pfam/Pfam-A.hmm.meta.pkl.gz")
# dombtbl_filepath = Path("/Users/horta/ebi/chlamydia/output/assembly/domtblout.txt")

In [None]:
# iSeq hits whose E-value exceeds this bound are skipped when building the plot.
# (Spelling kept as-is: a later cell refers to the constant by this exact name.)
EVALUE_THRSHOLD = 1e-10

# Only SAM records whose `rname` equals this value are kept further down.
RNAME = "2"

# Load the analysis configuration; `analysis.config` is read by the next cell.
analysis.load_config(verbose=True)

In [None]:
# Pull the chlamydia-specific settings out of the loaded configuration.
chlamydia_cfg = analysis.config.chlamydia
root = chlamydia_cfg.root_dir
hybrid_consensus = chlamydia_cfg.hybrid_consensus

# Directory from which later cells read the GFF, domtblout and SAM files.
output_dir = root / "output_depth49_200targets"

In [None]:
# Materialise every record of the hybrid consensus FASTA file.
hybrid_path = root / hybrid_consensus
hybrid = list(read_fasta(hybrid_path))

# Short summary: file name, number of records and the defline of the second
# record (index 1), which is the one whose sequence length is used for plotting.
print(f"# {hybrid_consensus}")
print(f"Number of targets: {len(hybrid)}")
print(f"Target 2: >{hybrid[1].defline}")

In [None]:
# Clan lookup object: later cells call `clans.get(<profile accession>)` and pass
# the result through `normalize_clan_name` to pick a colour per clan.
clans = pfam.Clans()
def normalize_clan_name(name: Optional[str]) -> str:
    """Map a possibly-missing clan name to a displayable label.

    Parameters
    ----------
    name
        Clan name, or ``None`` when no clan is available.

    Returns
    -------
    str
        ``"Unclassified"`` if ``name`` is ``None``; otherwise ``name`` unchanged.
    """
    return "Unclassified" if name is None else name

## Prokka+HMMER3

In [None]:
# Index the Prokka annotations by their GFF `ID` attribute; each domain hit is
# matched to its annotation through the hit's query name.
assembly_gffs = {}

for assembly_gff in gff_io.read_gff(output_dir / "prokka" / "assembly.gff"):
    ID = assembly_gff.attributes_asdict()["ID"]
    assembly_gffs[ID] = assembly_gff

features = []
# One colour per clan. `colors` and `clan_colors` are reused by the iSeq plot
# further down so that both figures share the same clan -> colour mapping.
colors = itertools.cycle(px.colors.qualitative.Plotly)
clan_colors = {}
for domtbl_row in read_domtbl(output_dir / "assembly" / "domtblout.txt"):
    assembly_gff = assembly_gffs[domtbl_row.query.name]
    strand = int(assembly_gff.strand + "1")

    # Alignment coordinates are treated as 1-based inclusive amino-acid
    # positions; convert them to a 0-based half-open nucleotide interval
    # (three nucleotides per residue) measured from the start of translation.
    alifrom = (domtbl_row.ali_coord.start - 1) * 3
    alito = domtbl_row.ali_coord.stop * 3

    # [start, end)
    if strand < 0:
        # On the reverse strand the first residue is encoded at the feature's
        # `end` coordinate, so the interval must be mirrored from `end` rather
        # than offset from `start`.
        # NOTE(review): assumes the domtblout queries are the translated
        # Prokka features — confirm.
        anchor = int(assembly_gff.end)
        start = anchor - alito
        end = anchor - alifrom
    else:
        offset = int(assembly_gff.start) - 1
        start = offset + alifrom
        end = offset + alito

    profile_name = domtbl_row.target.name
    profile_clan = normalize_clan_name(clans.get(domtbl_row.target.accession))
    if profile_clan not in clan_colors:
        clan_colors[profile_clan] = next(colors)

    feature = GraphicFeature(start=start, end=end, strand=strand,
                             color=clan_colors[profile_clan],
                             label=profile_name)
    features.append(feature)

# Draw all hits on a track as long as the second hybrid consensus record and
# save the figure next to the notebook.
record = GraphicRecord(sequence_length=len(hybrid[1].sequence), features=features)
ax = record.plot(figure_width=20)[0]
ax.figure.savefig('prokka_on_hybrid_depth49_consensus.png', bbox_inches='tight')

In [None]:
# sam_items = sam_io.read_sam(output_dir / "alignment.sam").read_items()
# sam_items = [i for i in sam_items if i.rname == RNAME]

In [None]:
# Keep the alignments against reference `RNAME` whose SAM flag value is 0,
# keyed by query name.
sam_items = {}

for aln in sam_io.read_sam(output_dir / "alignment.sam"):
    if aln.rname == RNAME and aln.flag.value == 0:
        # Each query name may contribute at most one retained alignment.
        assert aln.qname not in sam_items
        sam_items[aln.qname] = aln

In [None]:
# Soft-clip lengths at the two ends of a CIGAR string.
# Raw strings are required here: `\d` inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
start_clipping = re.compile(r"^([\d]+)S")
stop_clipping = re.compile(r".*[^\d]([\d]+)S$")

# Number of soft-clipped bases at the start / end of each retained alignment,
# keyed by query name. Alignments without a soft clip at that end map to 0.
start_clip = {}
stop_clip = {}
for sam_item in sam_items.values():
    leading = start_clipping.match(sam_item.cigar)
    trailing = stop_clipping.match(sam_item.cigar)

    start_clip[sam_item.qname] = 0 if leading is None else int(leading.group(1))
    stop_clip[sam_item.qname] = 0 if trailing is None else int(trailing.group(1))

In [None]:
# Build one graphic feature per iSeq hit and draw them on a track as long as
# the second hybrid consensus record.
features = []
hybrid_length = len(hybrid[1].sequence)
for item in gff_io.read_gff(output_dir / "output.gff"):

    atts = item.attributes_asdict()
    # Only sequences retained from the SAM file have a soft-clip entry. The
    # guard checks the dictionary that is indexed below, so the cell no longer
    # depends on a name that is not defined anywhere in the notebook.
    if item.seqid not in start_clip:
        continue
    if float(atts["E-value"]) > EVALUE_THRSHOLD:
        continue

    profile_name = atts["Profile_name"]
    profile_clan = normalize_clan_name(clans.get(atts["Profile_acc"]))
    # Reuse the clan -> colour mapping started by the Prokka plot.
    if profile_clan not in clan_colors:
        clan_colors[profile_clan] = next(colors)

    # Convert the 1-based inclusive GFF interval to 0-based half-open and
    # shift it left by the number of soft-clipped bases at the read start.
    # NOTE(review): only the leading soft clip is subtracted; the SAM
    # alignment position is not added here — confirm this is intended.
    start = int(item.start) - 1 - start_clip[item.seqid]
    end = int(item.end) - start_clip[item.seqid]
    # Skip hits that fall entirely outside the plotted sequence.
    if end <= 0 or start >= hybrid_length:
        continue

    strand = int(item.strand + "1")
    feature = GraphicFeature(start=start, end=end, strand=strand,
                             color=clan_colors[profile_clan],
                             label=profile_name)
    features.append(feature)

record = GraphicRecord(sequence_length=hybrid_length, features=features)
ax = record.plot(figure_width=20)[0]
ax.figure.savefig('iseq_on_200_sequences.png', bbox_inches='tight')

In [None]:
# NOTE(review): `starts` is not assigned in any visible cell, so this raises
# NameError on a fresh kernel — looks like a leftover scratch cell.
max(starts)

In [None]:
# Displays the last value bound to `item` by the `output.gff` loop above
# (inspection aid only; nothing below depends on it).
item

In [None]:
# NOTE(review): `alignment_offset` is not assigned in any visible cell, so this
# raises NameError on a fresh kernel — looks like a leftover scratch cell.
alignment_offset["851c3f75-66d5-41df-babd-f96959aa5855"]

In [None]:
# NOTE(review): `meta_filepath` is only assigned in a commented-out line near
# the top of the notebook, so this raises NameError on a fresh kernel.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
meta = pd.read_pickle(meta_filepath)

In [None]:
# NOTE(review): `dombtbl_filepath` is only assigned in a commented-out line near
# the top of the notebook, so this raises NameError on a fresh kernel.
rows = read_domtbl(dombtbl_filepath)

In [None]:
# Target accession of every row read from the domtblout file.
true_profiles = [
    hit.target.accession
    for hit in rows
]

In [None]:
# Accessions listed in the metadata table that were not hit in the domtblout.
all_false_profiles = set(meta["ACC"].tolist()) - set(true_profiles)
# Sort before sampling: the iteration order of a set of strings can differ
# between interpreter sessions (hash randomisation), which would make the draw
# irreproducible even with a seeded generator.
# NOTE(review): `random` must be a NumPy-style generator (`choice` with `size`
# and `replace`); its only definition is a commented-out `RandomState(183)`
# near the top of the notebook.
false_profiles = list(random.choice(sorted(all_false_profiles), size=100, replace=False))

In [None]:
# NOTE(review): `hmmer` is never bound in the visible notebook — the import cell
# only brings in `HMMER` and `read_domtbl` — so this raises NameError on a
# fresh kernel. Presumably an `HMMER` instance was intended; confirm.
hmmer.fetch(true_profiles + false_profiles)