In [None]:
import config
import itertools
import plotly.express as px
from fasta_reader import read_fasta
from pathlib import Path
from sam_io import read_sam
import iseq_prof
import gff_io
from tqdm.notebook import tqdm
import hmmer
from dna_features_viewer import GraphicFeature, GraphicRecord

In [None]:
# Root data directory, taken from the local `config` module.
root = Path(config.root_dir)
# Last expression of the cell: echoes the configured root path as cell output.
config.root_dir

In [None]:
# Directory holding the pipeline outputs read by the cells below.
# NOTE(review): absolute, machine-specific path -- consider moving it into
# `config` next to `root_dir` so the notebook runs on other machines.
output_dir = Path("/Users/horta/code/iseq-profmark-nf/chlamydia/output")

In [None]:
# Load every record of the hybrid consensus FASTA into a list so that the
# targets can be indexed by position in later cells.
hybrid_fasta = root / config.orig_subdir / config.hybrid_consensus
hybrid = list(read_fasta(hybrid_fasta))

# Short summary: file name, number of records, and the first two deflines.
print(f"# {config.hybrid_consensus}")
print(f"Number of targets: {len(hybrid)}")
print(f"Target 1: >{hybrid[0].defline}")
print(f"Target 2: >{hybrid[1].defline}")

In [None]:
# Index the Prokka annotation by the `ID` attribute of each GFF record so a
# record can be looked up by name; a repeated ID keeps the last record seen.
assembly_gffs = {
    record.attributes_asdict()["ID"]: record
    for record in gff_io.read_gff(output_dir / "prokka" / "assembly.gff")
}

In [None]:
# Draw every domain hit reported in the domtblout file at its position on the
# second hybrid target (hybrid[1]), one colour per profile.
features = []
colors = itertools.cycle(px.colors.qualitative.Plotly)
profile_colors = {}
for domtbl_row in hmmer.read_domtbl(output_dir / "assembly" / "domtblout.txt"):
    # The query name of a hit is used as the Prokka feature ID.
    assembly_gff = assembly_gffs[domtbl_row.query.name]
    offset = int(assembly_gff.start) - 1

    # Alignment coordinates are multiplied by 3, i.e. they are treated as
    # residue positions (1-based, inclusive) and converted to a 0-based,
    # half-open nucleotide span measured from the first codon of the gene.
    alifrom = (domtbl_row.ali_coord.start - 1) * 3
    alito = domtbl_row.ali_coord.stop * 3

    # [start, end)
    if assembly_gff.strand == "-":
        # On the reverse strand the first codon sits at the right-most
        # coordinate of the gene, so the span is measured backwards from the
        # gene end instead of forwards from its start.
        gene_end = int(assembly_gff.end)
        start = gene_end - alito
        end = gene_end - alifrom
    else:
        start = offset + alifrom
        end = offset + alito

    profile_name = domtbl_row.target.name
    if profile_name not in profile_colors:
        profile_colors[profile_name] = next(colors)

    # "+" -> 1, "-" -> -1
    strand = int(assembly_gff.strand + "1")
    feature = GraphicFeature(start=start, end=end, strand=strand,
                             color=profile_colors[profile_name],
                             label=profile_name)
    features.append(feature)

record = GraphicRecord(sequence_length=len(hybrid[1].sequence), features=features)
ax = record.plot(figure_width=20)[0]
ax.figure.savefig('prokka_on_hybrid_depth49_consensus.png', bbox_inches='tight')

In [None]:
# Draw the hits from the iseq GFF output that pass the E-value cut-off on the
# second hybrid target (hybrid[1]). `colors` and `profile_colors` come from the
# previous plotting cell, so a profile keeps the same colour in both figures.
features = []
e_value_thr = 1e-10
for item in gff_io.read_gff(output_dir / "assembly" / "output.gff"):

    atts = item.attributes_asdict()
    if float(atts["E-value"]) > e_value_thr:
        continue

    profile_name = atts["Profile_name"]
    if profile_name not in profile_colors:
        profile_colors[profile_name] = next(colors)

    # GFF coordinates are 1-based and inclusive; shift the start so the span is
    # 0-based and half-open, [start, end), like the Prokka plotting cell.
    start = int(item.start) - 1
    end = int(item.end)
    # "+" -> 1, "-" -> -1
    strand = int(item.strand + "1")
    feature = GraphicFeature(start=start, end=end, strand=strand,
                             color=profile_colors[profile_name],
                             label=profile_name)
    features.append(feature)

record = GraphicRecord(sequence_length=len(hybrid[1].sequence), features=features)
ax = record.plot(figure_width=20)[0]
ax.figure.savefig('iseq_on_hybrid_depth49_consensus.png', bbox_inches='tight')

In [None]:
# Print the E-value and profile name of every record in the iseq GFF output
# whose E-value does not exceed the cut-off.
e_value_thr = 1e-10
with gff_io.read_gff(output_dir / "assembly" / "output.gff") as file:
    for item in file.read_items():
        atts = item.attributes_asdict()
        # Written as a negated ">" so that a NaN E-value is still printed.
        passes = not (float(atts["E-value"]) > e_value_thr)
        if passes:
            print(atts["E-value"], atts["Profile_name"])

In [None]:
# FASTQ: 14-2711_R47_1k.fastq

In [None]:
# SAM alignment file, located inside `output_dir` instead of repeating the
# absolute path; kept as a plain string, as before.
align_fp = str(output_dir / "alignment.sam")

In [None]:
# Open the SAM alignment. The reader is kept in `file` (and not closed here)
# because a later cell calls `file.read_item()` on it.
file = read_sam(align_fp)
# Cell output: the header of the SAM file.
file.header

In [None]:
# Read the next record from the SAM reader opened above.
item = file.read_item()

In [None]:
# Display the SAM record held in `item`.
item

In [None]:
# Reference sequence name (RNAME field) of the record.
item.rname

In [None]:
# Convert the record position to a 0-based index for Python slicing
# (presumably POS is 1-based, as in the SAM format -- hence the -1).
start = int(item.pos) - 1

In [None]:
# First 10 bases of the read sequence.
item.seq[:10]

In [None]:
# 10 bases of the second hybrid target starting at the read's position, for
# comparison with the read prefix shown in the previous cell.
# NOTE(review): assumes the read was aligned to hybrid[1] -- confirm against
# item.rname.
hybrid[1].sequence[start:start+10]

In [None]:
import re
from io import StringIO
import urllib.request
import hmmer_reader

# Prokka annotation, located inside `output_dir` like the earlier cell that
# reads the same file; kept as a plain string, as before.
prokka_gff = str(output_dir / "prokka" / "assembly.gff")
# Captures the accession that follows the last "UniProtKB:" in a string.
pattern = re.compile(".*UniProtKB:([^:,]+)")

# Keep only the features whose raw attribute string mentions a UniProtKB
# similarity hit.
gffs = [
    gff
    for gff in gff_io.read_gff(prokka_gff)
    if "similar to AA sequence:UniProtKB" in gff.attributes
]

def fetch_pfam_accessions(hmm_file="/Users/horta/db/pfam/db.hmm"):
    """Map unversioned Pfam accessions to their full (versioned) form.

    Parameters
    ----------
    hmm_file
        Path of the HMM file whose metadata is read; defaults to the path
        that was previously hard-coded.

    Returns
    -------
    dict
        Keys are the values of the ``ACC`` metadata column with everything
        from the first dot onwards removed; values are the original ``ACC``
        values (e.g. ``{"PF00001": "PF00001.21"}``).
    """
    df = hmmer_reader.fetch_metadata(hmm_file)
    # regex=True must be explicit: since pandas 2.0 the pattern is otherwise
    # treated as a literal string and the version suffix is never removed.
    unversioned = df["ACC"].str.replace(r"\..*", "", regex=True)
    return dict(zip(unversioned, df["ACC"]))


# Lookup table: unversioned Pfam accession -> versioned accession.
pfam_accessions = fetch_pfam_accessions()

# Collect the UniProtKB accession referenced by the `inference` attribute of
# each retained Prokka feature.
uniprot_accs = set()
for gff in tqdm(gffs):
    inference = gff.attributes_asdict()["inference"]
    m = pattern.match(inference)
    if m is None:
        # Fail with the offending value instead of an AttributeError on None.
        raise ValueError(
            f"no UniProtKB accession found in inference attribute: {inference!r}"
        )
    uniprot_accs.add(m.group(1))

In [None]:
# NOTE(review): this redefines `fetch_pfam_accessions` from an earlier cell
# with a different default HMM file; `pfam_accessions` is not recomputed here.
def fetch_pfam_accessions(hmm_file="/Users/horta/db/pfam/P.hmm"):
    """Map unversioned Pfam accessions to their full (versioned) form.

    Parameters
    ----------
    hmm_file
        Path of the HMM file whose metadata is read; defaults to the path
        that was previously hard-coded.

    Returns
    -------
    dict
        Keys are the values of the ``ACC`` metadata column with everything
        from the first dot onwards removed; values are the original ``ACC``
        values (e.g. ``{"PF00001": "PF00001.21"}``).
    """
    df = hmmer_reader.fetch_metadata(hmm_file)
    # regex=True must be explicit: since pandas 2.0 the pattern is otherwise
    # treated as a literal string and the version suffix is never removed.
    unversioned = df["ACC"].str.replace(r"\..*", "", regex=True)
    return dict(zip(unversioned, df["ACC"]))

In [None]:
# Read the metadata table of the Pfam-A HMM file and store it as a pickle in
# the current working directory (relative path).
# NOTE(review): yet another hard-coded HMM path, different from the ones used
# by `fetch_pfam_accessions` -- confirm which database is intended.
df = hmmer_reader.fetch_metadata("/Users/horta/db/pfam/Pfam-A.hmm")
df.to_pickle("Pfam-A.meta.pkl.gz")

In [None]:
# Show the working directory, i.e. where files saved with relative paths end up.
!pwd

In [None]:
# Raw attribute string of the first retained Prokka feature.
gffs[0].attributes

In [None]:
# Number of Prokka features that mention a UniProtKB similarity hit.
len(gffs)

In [None]:
# Raw attribute string of the second retained Prokka feature.
gffs[1].attributes

In [None]:
# uniprot_accs

In [None]:
# For every UniProt accession, download its flat-text entry and collect the
# Pfam profiles it cross-references, keeping only profiles that are present in
# `pfam_accessions` (stored under their versioned accession).
uni_to_pfam = {}

for acc in tqdm(uniprot_accs):
    url = f"https://www.uniprot.org/uniprot/{acc}.txt"
    # Context manager closes the HTTP response once the body has been read.
    with urllib.request.urlopen(url) as response:
        contents = response.read().decode()

    uni_to_pfam[acc] = set()
    for row in StringIO(contents):
        # Cross-reference lines look like "DR   Pfam; <accession>; ...".
        if row.startswith("DR   Pfam"):
            profile = row.split(";")[1].strip()
            full_profile = pfam_accessions.get(profile, None)
            if full_profile is not None:
                uni_to_pfam[acc].add(full_profile)

In [None]:
# Number of UniProt accessions that were looked up.
len(uni_to_pfam)

In [None]:
# for uni, pfams in uni_to_pfam.items():
#     print(uni, pfams)

In [None]:
# Number of retained Prokka features (same list as filtered earlier).
len(gffs)

In [None]:
# Full record of the first retained Prokka feature.
gffs[0]

In [None]:
# Full record of the second retained Prokka feature.
gffs[1]

In [None]:
# Full record of the third retained Prokka feature.
gffs[2]