In [None]:
import itertools
from pathlib import Path
from typing import Optional
from numpy.random import RandomState
import plotly.express as px

from hmmer import HMMER, read_domtbl, read_domtbl
import gff_io
import hmmer_reader
import pandas as pd
from iseq_prof import pfam
import iseq_prof_analysis as analysis
from fasta_reader import read_fasta
import sam_io
from dna_features_viewer import GraphicFeature, GraphicRecord

In [None]:
# NOTE(review): these three definitions are commented out, but cells near the
# end of the notebook still read `random`, `meta_filepath` and
# `dombtbl_filepath`. On a fresh kernel those cells raise NameError until the
# names are defined again (preferably without absolute, machine-specific paths).
# random = RandomState(183)
# meta_filepath = Path("/Users/horta/db/pfam/Pfam-A.hmm.meta.pkl.gz")
# dombtbl_filepath = Path("/Users/horta/ebi/chlamydia/output/assembly/domtblout.txt")

In [None]:
# Load the analysis configuration; `analysis.config` is read right after this.
analysis.load_config(verbose=True)
# Upper bound on the "E-value" attribute of output.gff records: records above
# it are skipped when the features are collected.
# NOTE(review): the name is misspelled (THRSHOLD); renaming it also requires
# updating the cell that reads it.
EVALUE_THRSHOLD = 1e-10
# SAM reference name (`rname`) used to select alignments from alignment.sam.
RNAME = "2"

In [None]:
# Resolve the input locations from the loaded analysis configuration.
chlamydia_cfg = analysis.config.chlamydia
root = chlamydia_cfg.root_dir
hybrid_consensus = chlamydia_cfg.hybrid_consensus
# Every file read by the cells below lives under this run-specific directory.
output_dir = root / "output_depth49_200targets"

In [None]:
# Read every record of the hybrid consensus FASTA into a list.
hybrid = list(read_fasta(root / hybrid_consensus))
print(f"# {hybrid_consensus}")
print(f"Number of targets: {len(hybrid)}")
# The record at index 1 is the one whose sequence length sizes both feature
# plots further down.
# NOTE(review): presumably this is the target named by RNAME ("2") — confirm.
print(f"Target 2: >{hybrid[1].defline}")

In [None]:
# Clan lookup: `clans.get(<profile accession>)` is called in the plotting cells
# to group profiles by colour. Its result is passed through
# normalize_clan_name, so presumably it is None for profiles without a clan —
# TODO confirm.
clans = pfam.Clans()
def normalize_clan_name(name: Optional[str]) -> str:
    """Return ``name`` unchanged, or ``"Unclassified"`` when it is ``None``."""
    return "Unclassified" if name is None else name

## Prokka+HMMER3

In [None]:
# Index the Prokka annotations by their GFF `ID` attribute; each HMMER hit is
# matched to its annotation through the hit's query name.
assembly_gffs = {}

for assembly_gff in gff_io.read_gff(output_dir / "prokka" / "assembly.gff"):
    ID = assembly_gff.attributes_asdict()["ID"]
    assembly_gffs[ID] = assembly_gff

features = []
# `colors` and `clan_colors` are reused by the later output.gff plot, so a clan
# keeps the same colour in both figures.
colors = itertools.cycle(px.colors.qualitative.Plotly)
clan_colors = {}
# NOTE(review): every hit is drawn on a record sized like hybrid[1], whatever
# sequence its annotation belongs to — confirm the assembly is a single target.
for domtbl_row in read_domtbl(output_dir / "assembly" / "domtblout.txt"):
    assembly_gff = assembly_gffs[domtbl_row.query.name]
    offset = int(assembly_gff.start) - 1
    strand = int(assembly_gff.strand + "1")  # "+" -> 1, "-" -> -1

    # The alignment coordinates are treated as 1-based, inclusive amino-acid
    # positions (hence the factor 3); these are 0-based, half-open nucleotide
    # distances from the first codon of the gene.
    alifrom = (domtbl_row.ali_coord.start - 1) * 3
    alito = domtbl_row.ali_coord.stop * 3

    # [start, end)
    if strand < 0:
        # Reverse-strand gene: the first codon sits at the GFF `end`, so the
        # span has to be measured backwards from there. Measuring from `start`
        # would draw the domain mirrored inside the gene.
        gene_end = int(assembly_gff.end)
        start = gene_end - alito
        end = gene_end - alifrom
    else:
        start = offset + alifrom
        end = offset + alito

    profile_name = domtbl_row.target.name
    profile_clan = normalize_clan_name(clans.get(domtbl_row.target.accession))
    if profile_clan not in clan_colors:
        clan_colors[profile_clan] = next(colors)

    feature = GraphicFeature(start=start, end=end, strand=strand,
                             color=clan_colors[profile_clan],
                             label=profile_name)
    features.append(feature)

record = GraphicRecord(sequence_length=len(hybrid[1].sequence), features=features)
# ax = record.plot(figure_width=20)[0]
# ax.figure.savefig('prokka_on_hybrid_depth49_consensus.png', bbox_inches='tight')
record.plot(figure_width=20);

In [None]:
# Keep only the alignments whose reference name equals RNAME.
sam_items = [
    aln
    for aln in sam_io.read_sam(output_dir / "alignment.sam").read_items()
    if aln.rname == RNAME
]

In [None]:
# Number of alignments kept by the RNAME filter.
len(sam_items)

In [None]:
# Map each read name to its SAM `pos` minus one. When a read name occurs in
# several alignments, the last one in `sam_items` wins.
alignment_offset = dict((aln.qname, int(aln.pos) - 1) for aln in sam_items)

In [None]:
# Re-read the SAM file and keep only the alignments of a single read.
# This rebinds `sam_items`, replacing the RNAME-filtered list built earlier.
sam_items = [
    aln
    for aln in sam_io.read_sam(output_dir / "alignment.sam").read_items()
    if aln.qname == "851c3f75-66d5-41df-babd-f96959aa5855"
]

In [None]:
# Number of alignments found for the single read selected by qname.
len(sam_items)

In [None]:
# Print the main SAM fields and the sequence length of up to seven alignments.
# Slicing (instead of indexing with range(7)) avoids an IndexError when fewer
# than seven alignments are available.
for sam_item in sam_items[:7]:
    print(
        str(sam_item.flag),
        sam_item.rname,
        sam_item.pos,
        sam_item.mapq,
        sam_item.rnext,
        sam_item.pnext,
        sam_item.tlen,
        len(sam_item.seq),
    )

In [None]:
# Sequence length of the alignment at index 1 of the current `sam_items`.
len(sam_items[1].seq)

In [None]:
# Materialise the records of output.gff; a later cell indexes into `gffs`.
gffs = gff_io.read_gff(output_dir / "output.gff").read_items()

In [None]:
# Build one GraphicFeature per output.gff record that passes the filters below
# and draw them on a record sized like hybrid[1]. `starts`/`ends` collect the
# plotted coordinates for inspection in later cells.
features = []
starts = []
ends = []
for item in gff_io.read_gff(output_dir / "output.gff"):

    atts = item.attributes_asdict()
    # Skip records whose seqid is not a read name present in alignment_offset.
    # NOTE(review): only membership is used; the offsets stored in
    # alignment_offset are never added to start/end below — confirm that
    # item.start/item.end are already in the plotted coordinate system.
    if item.seqid not in alignment_offset:
        continue
    # Skip hits above the E-value threshold.
    if float(atts["E-value"]) > EVALUE_THRSHOLD:
        continue
    # Stop at the first remaining record that starts at or beyond 39322.
    # NOTE(review): unexplained magic number; `break` (not `continue`) drops
    # every later record, which only acts as a range filter if the file is
    # ordered by start — confirm.
    if int(item.start) >= 39322:
        break

    profile_name = atts["Profile_name"]
    profile_clan = normalize_clan_name(clans.get(atts["Profile_acc"]))
    # `clan_colors` and `colors` come from the Prokka+HMMER3 cell, so clans
    # already seen there keep their colour.
    if profile_clan not in clan_colors:
        clan_colors[profile_clan] = next(colors)

    start = int(item.start)
    end = int(item.end)
    starts.append(start)
    ends.append(end)
    # "+" -> 1, "-" -> -1; any other strand value makes int() raise ValueError.
    strand = int(item.strand + "1")
    feature = GraphicFeature(start=start, end=end, strand=strand,
                             color=clan_colors[profile_clan],
                             label=profile_name)
    features.append(feature)

record = GraphicRecord(sequence_length=len(hybrid[1].sequence), features=features)
# ax = record.plot(figure_width=20)[0]
# ax.figure.savefig('iseq_on_hybrid_depth49_consensus.png', bbox_inches='tight')
record.plot(figure_width=20);

In [None]:
# Inspect the first record loaded from output.gff.
gffs[0]

In [None]:
# Largest start coordinate among the plotted features (ValueError if none were kept).
max(starts)

In [None]:
# Last output.gff record visited by the plotting loop (the one that hit `break`, if any did).
item

In [None]:
# Offset (SAM `pos` minus one) stored for the read that was inspected by qname.
alignment_offset["851c3f75-66d5-41df-babd-f96959aa5855"]

In [None]:
# Load the pickled profile metadata; its "ACC" column is read further down.
# NOTE(review): `meta_filepath` is only defined in a commented-out line near
# the top of the notebook, so this raises NameError on a fresh kernel.
# Unpickling can execute arbitrary code — only load files you trust.
meta = pd.read_pickle(meta_filepath)

In [None]:
# Read the domain table used as the source of "true" profiles.
# NOTE(review): `dombtbl_filepath` is only defined in a commented-out line near
# the top of the notebook, so this raises NameError on a fresh kernel.
rows = read_domtbl(dombtbl_filepath)

In [None]:
# Target accession of every row in the domain table (duplicates are kept).
true_profiles = [row.target.accession for row in rows]

In [None]:
# Accessions present in the metadata but absent from the domain table.
all_false_profiles = set(meta["ACC"].tolist()) - set(true_profiles)
# Sample 100 of them without replacement. The candidates are sorted first:
# iteration order of a set of strings is not stable between interpreter
# sessions (string hashing is randomised), so sampling from list(set) would
# give a different draw on every run even with a seeded generator.
# NOTE(review): `random` is only defined in a commented-out line near the top
# of the notebook (RandomState(183)).
false_profiles = list(random.choice(sorted(all_false_profiles), size=100, replace=False))

In [None]:
# Pass the combined true + false accession list to `hmmer.fetch`.
# NOTE(review): no name `hmmer` is bound in the visible notebook (only `HMMER`
# and `read_domtbl` are imported from the hmmer package), so this raises
# NameError unless an instance is created first.
hmmer.fetch(true_profiles + false_profiles)