In [1]:
import jupyter_black

# Load the jupyter-black extension so code cells are formatted with Black
# when they are run in this session.
jupyter_black.load()

In [2]:
import re
from pathlib import Path

import pandas as pd
from pyfaidx import Fasta

# --- Inputs -----------------------------------------------------------------
# Everything is resolved relative to the KLK data directory.
base = Path("../../data/KLK")
raw = base.joinpath("raw")
sup_mat = raw.joinpath("SM")
fasta_path = sup_mat.joinpath("MOESM14_ESM.fasta")
nexus_path = sup_mat.joinpath("MOESM4_ESM.txt")
klk_group_file = raw.joinpath("KLK_groups.txt")

# --- Outputs ----------------------------------------------------------------
# Written next to the raw directory, directly under the KLK data directory.
fasta_out = base.joinpath("KLK.fasta")
csv_out = base.joinpath("KLK.csv")
itol_file = base.joinpath("iTOL.txt")

## Infer species

In [3]:
# Maps the species abbreviation embedded in a FASTA header to the species
# (or toxin) name stored in the "species" column.
#
# Keys are looked up as substrings of the header, so be aware that some keys
# overlap: "Vagi" is a prefix of "Vagig".
header_mapper = {
    "Alsi": "Alligator sinensis",
    "BLTx": "Blarina toxin",
    "Calu": "Canis lupus",
    "Cewa": "Celestus warreni",
    "Crvi": "Crotalus viridis",
    # Two spellings map to the same species; presumably both occur in the
    # source headers -- verify against the FASTA file before removing either.
    "Ereu": "Erinaceus europaeus",
    "Erueu": "Erinaceus europaeus",
    "Euma": "Eublepharis macularius",
    "Gein": "Gerrhonotus infernalis",
    "Hosa": "Homo sapiens",
    "H_susp": "Heloderma suspectum",
    "Laag": "Lacerta agilis",
    "Mumu": "Mus musculus",
    "Nana": "Naja naja",
    "Oran": "Ornithorhynchus anatinus",
    "Pesi": "Pelodiscus sinensis",
    "Phci": "Phascolarctos cinereus",
    "Prmu": "Protobothrops mucrosquamatus",
    "Sopa": "Solenodon paradoxus",
    "Thel": "Thamnophis elegans",
    "Vaac": "Varanus acanthurus",
    "Vagi": "Varanus gilleni",
    "Vagig": "Varanus giganteus",
    "Vagl": "Varanus glauerti",
    "Vain": "Varanus indicus",
    "Vako": "Varanus komodoensis",
    # Fixed: the specific epithet is "mertensi" (Mertens' water monitor).
    "Vame": "Varanus mertensi",
    "Vami": "Varanus mitchelli",
    "Vapa": "Varanus panoptes",
    "Vasc": "Varanus scalaris",
    "Xetr": "Xenopus tropicalis",
}

In [4]:
# Build one record per FASTA entry: header, inferred species, ungapped sequence.
data = []
# Try the longest abbreviations first. Matching is by substring, so a short
# key such as "Vagi" would otherwise claim headers that carry the longer
# "Vagig". `sorted` is stable, so equal-length keys keep their dict order.
abbrs = sorted(header_mapper, key=len, reverse=True)
# Fallback: a "Genus_species" token spelled out in the header itself.
pat = r"([A-Z][a-z]+_[a-z]+)"
for header, seq in Fasta(fasta_path).items():
    # Strip alignment gap characters so only residues remain.
    seq = str(seq).replace("-", "")
    # Reset for every record; otherwise an unmatched header would silently
    # inherit the species of the previous record (or raise NameError on the
    # very first one).
    species = None
    for abbr in abbrs:
        if abbr in header:
            species = header_mapper[abbr]
            break
    else:
        # No known abbreviation found: fall back to the binomial pattern.
        m = re.search(pattern=pat, string=header)
        if m is not None:
            species = m[1].replace("_", " ")
        else:
            print(f"Error: cannot extract species from '{header}'")
    entry = {"fasta_header": header, "species": species, "seq": seq}
    data.append(entry)
df = pd.DataFrame(data)

### Add KLK groups

In [18]:
# KLK groups
# The group file consists of "--- <group name>" lines, each followed by the
# members of that group, one per line. Member lines are rewritten into the
# FASTA header spelling before being matched against `df`.

# Hoisted out of the loop: the headers do not change while groups are being
# assigned, and a set gives constant-time membership tests.
known_headers = set(df["fasta_header"])
group = None
with open(klk_group_file, "r") as handle:
    for line in handle:
        member = line.strip()
        if not member:
            # Blank lines carry no information; skip them instead of
            # reporting them as unmatched headers.
            continue
        if line.startswith("---"):
            group = member.split(" ", maxsplit=1)[1]
            continue
        if group is None:
            # A member listed before any "---" line has no group to join.
            raise ValueError(
                f"'{member}' appears before the first '---' group line "
                f"in {klk_group_file}"
            )
        fasta_header = (
            member.replace("zgc ", "zgc:")
            .replace("zmp ", "zmp:")
            .replace("isoform ", "isoform=")
            .replace(" ", "_")
            .replace("ID", "ID=")
        )
        if fasta_header not in known_headers:
            # Report group members that have no counterpart in the FASTA.
            print(member)
        df.loc[df["fasta_header"] == fasta_header, "group"] = group

df = df[["fasta_header", "group", "species", "seq"]]

# remove duplicates
df = df[~(df["fasta_header"] == "Hosa_KLK_1")]

Hosa KLK 1


In [12]:
# Persist the annotated table as CSV (without the index column).
df.to_csv(csv_out, index=False)

# Write the same records as FASTA: a ">header" line followed by the sequence.
fasta_records = (
    f">{name}\n{sequence}\n" for name, sequence in zip(df["fasta_header"], df["seq"])
)
with open(fasta_out, "w") as handle:
    handle.writelines(fasta_records)

### iTOL coloring

In [17]:
import colorsys
from distinctipy import distinctipy

# One colour is generated per distinct value of the "group" column, in order
# of first appearance.
groups = df["group"].unique()
N = len(groups)
seed = 42

colorblind_type = "Normal"
colors = distinctipy.get_colors(N, colorblind_type=colorblind_type, rng=seed)
# Render each colour as "#" followed by its channels, scaled by 255,
# truncated to int and formatted as two upper-case hex digits.
colors = [
    "#" + "".join(f"{int(channel * 255):02X}" for channel in color)
    for color in colors
]
color_group = dict(zip(groups, colors))

# iTOL dataset preamble, followed by one tab-separated line per sequence.
preamble = ["TREE_COLORS\n", "SEPARATOR TAB\n", "DATA\n"]
with open(itol_file, "w") as handle:
    handle.writelines(preamble)
    for group, color in color_group.items():
        members = df.loc[df["group"] == group, "fasta_header"]
        for header in members:
            # Rewrite ":" and "=" in the header to form the ID written to
            # the iTOL file.
            uid = header.replace(":", "_").replace("ID=", "ID").replace("=", "_")
            handle.write(f"{uid}\trange\t{color}\t{group}\n")