In [1]:
# Load the jupyter_black extension so that code cells are formatted with Black.
import jupyter_black

jupyter_black.load()

In [2]:
import pandas as pd
from pathlib import Path
import xml.etree.ElementTree as ET

# Input files live below `data/`.
base = Path("data")
xml_file = base / "conoserver_protein.xml"
# Cleaned copy of the XML, stored next to the original with a `_new` stem.
xml_new = xml_file.with_stem(f"{xml_file.stem}_new")
# Modification table written by this notebook (its `canonical_aa` column is empty).
seq_mods_path = base / "seq_mods.csv"
# Modification table that is read back when the sequences are adapted.
# NOTE(review): presumably a copy of `seq_mods.csv` with `canonical_aa`
# filled in by hand -- confirm where this file comes from.
seq_mods_path2 = base / "seq_mods2.csv"
# Output files; both are `Path` objects so all locations are handled alike.
fasta_path = Path("xml_conoserver.fasta")
feature_csv_path = Path("xml_conoserver.csv")

## Reformat original XML file

In [3]:
from html.entities import name2codepoint

# Maps every named HTML entity (e.g. "&mu;") to the equivalent numeric
# character reference (e.g. "&#956;").
entity_replacer = dict(
    (f"&{entity_name};", f"&#{code_point};")
    for entity_name, code_point in name2codepoint.items()
)

In [None]:
# replace unreadable/non-standard characters
# The zero-based line indices below are hard-coded, so they only match the
# exact copy of the input file they were determined for.
ampersand_lines = {265620, 265642, 265692, 265717, 265742, 265667}
with open(xml_file, "r") as src, open(xml_new, "w") as dst:
    for line_no, text in enumerate(src):
        if line_no == 47453:
            # the entity on this line lacks its terminating semicolon
            text = text.replace("&mu", "&mu;")
        elif line_no == 220188:
            # cut out the single character at offset 197
            cut = 197
            text = text[:cut] + text[cut + 1 :]
        elif line_no in ampersand_lines:
            # escape bare ampersands
            text = text.replace(" & ", " &amp; ")
        # Named entities become numeric references; `str.replace` leaves the
        # line untouched when an entity does not occur.
        for named, numeric in entity_replacer.items():
            text = text.replace(named, numeric)
        dst.write(text)

In [None]:
# Fix indentation
# Parse the cleaned file, indent it with four spaces per level and write the
# result back to the same path.
tree = ET.parse(xml_new)
ET.indent(tree, space=" " * 4)
tree.write(xml_new)

In [6]:
# read XML file in memory
# `tree` holds the parsed document, `root` its top-level element.
tree = ET.ElementTree(file=xml_new)
root = tree.getroot()

In [7]:
# get all `sequenceModifications` and save them in `seq_mods_path`
# Module-level registry: modification symbol -> modification name.
dict_mod = {}


def get_sequence(entry: ET.Element):
    """Return the `sequence` child of `entry` and register its modifications.

    Every `modification` element below the entry's `sequenceModifications`
    child is stored in the module-level `dict_mod` as ``symbol -> name``; a
    symbol that was registered before is overwritten.

    Returns:
        The `sequence` element itself (not its text), or None when the entry
        has no such child.
    """
    mods_parent = entry.find("sequenceModifications")
    if mods_parent is not None:
        dict_mod.update(
            (mod.attrib["symbol"], mod.attrib["name"])
            for mod in mods_parent.findall("modification")
        )
    return entry.find("sequence")


# Visit every entry once; `get_sequence` registers each modification symbol
# in `dict_mod` as a side effect (its return value is not needed here).
for entry in root:
    get_sequence(entry)

# One row per modification symbol, ordered by symbol.
result = [
    {"symbol": symbol, "name": name} for symbol, name in sorted(dict_mod.items())
]
# Naming the columns explicitly keeps the header complete even when no
# modification was found at all.
df = pd.DataFrame(result, columns=["symbol", "name"])
# Written empty on purpose: this column is what gets read back later as the
# replacement residue for each symbol.
df["canonical_aa"] = ""
df.to_csv(seq_mods_path, index=False)

In [8]:
# get tag list
# Sorted names of all tags that occur directly below any entry of `root`.
tag_lst = sorted({child.tag for entry in root for child in entry})

In [9]:
# Entry-level tags that are extracted; each one becomes a DataFrame column.
# NOTE(review): spellings such as "cysteineFramewrok" and "isoelecticPoint"
# presumably mirror the tag names used in the XML file -- confirm before
# "correcting" them, since they are matched literally.
# The commented-out tags ("linkOut", "note", "reference") are left out on
# purpose; ParseConoserverXML skips them as well.
tags = [
    "alternativeNames",
    "averageMass",
    "class",
    "cysteineFramewrok",
    "extinctionCoefficient",
    "geneSuperfamily",
    "id",
    "isoelecticPoint",
    # "linkOut",
    "monoisotopicMass",
    "name",
    # "note",
    "nucleicAcid",
    "organismDiet",
    "organismLatin",
    "organismRegion",
    "parentID",
    "pharmacologicalFamily",
    # "reference",
    "sequence",
    "sequenceEvidence",
    "sequenceModifications",
    "sequenceRegions",
]

In [10]:
class ParseConoserverXML:
    """Flatten the entries of a ConoServer protein XML tree into a DataFrame.

    Every child of the XML root is treated as one entry and becomes one row;
    every requested tag becomes one column. Sequence modifications are applied
    to the sequence text and the modification column is dropped afterwards.

    `tags` must contain "sequence" and "sequenceModifications", because
    `fill_data` reads both columns.

    Attributes:
        xml_path: Location of the XML file; parsed by `fill_data` when no root
            element is passed in.
        tags: Names of the entry-level tags to extract.
        seq_mod_dict: Maps a modification symbol to the text that replaces the
            modified residue (an empty string removes the residue).
        df: Result of the last `fill_data` call (empty before the first call).
    """

    # Tags that are never extracted, even when they are requested.
    _IGNORED_TAGS = ("linkOut", "note", "reference")
    # Tags whose children are collected as a plain list of their texts.
    _TEXT_LIST_TAGS = ("alternativeNames", "nucleicAcid")
    # Entries for which a modification position beyond the end of the
    # sequence is tolerated and silently skipped.
    _OUT_OF_RANGE_IDS = ("P03694", "P06890")

    def __init__(
        self, xml_path: Path, tags: list[str], seq_mod_dict: dict[str, str]
    ) -> None:
        self.xml_path = xml_path
        self.tags = tags
        self.seq_mod_dict = seq_mod_dict

        self.df = pd.DataFrame({})

    def _extract_entry_info(self, entry) -> dict:
        """Collect the requested tags of one entry.

        Args:
            entry: XML element whose direct children are the entry's fields.

        Returns:
            A dict with exactly one key per requested tag. Tags that do not
            occur in the entry keep the value `pd.NA`.
        """
        entry_data = {tag: pd.NA for tag in self.tags}
        for lvl1_elem in entry:
            tag = lvl1_elem.tag
            # Tags that were not requested are skipped so the result never
            # carries a key `fill_data` has no column for.
            if tag in self._IGNORED_TAGS or tag not in entry_data:
                continue
            if tag in self._TEXT_LIST_TAGS:
                data = [child.text for child in lvl1_elem]
            elif tag == "parentID":
                # Spaces inside the id are replaced by zeros; an empty element
                # has no text and is treated as missing.
                text = lvl1_elem.text
                data = pd.NA if text is None else text.replace(" ", "0")
            elif tag == "sequenceModifications":
                # Build fresh dicts: writing the converted position back into
                # `mod.attrib` would modify the XML tree itself.
                data = [
                    {**mod.attrib, "position": int(mod.attrib["position"])}
                    for mod in lvl1_elem
                ]
            elif tag == "sequenceRegions":
                data = [
                    {field.tag: field.text for field in region}
                    for region in lvl1_elem
                ]
            else:
                data = lvl1_elem.text
            entry_data[tag] = data
        return entry_data

    def _adapt_sequences(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply the sequence modifications and drop their column.

        For every row with a list of modifications, the residue at each
        1-based modification position is replaced by the text stored for the
        modification symbol in `seq_mod_dict`. Rows without modifications or
        without a sequence are left unchanged.

        Args:
            df: Frame with the columns "sequence" and "sequenceModifications"
                (and "id", which is used to recognise tolerated entries).

        Returns:
            A new frame without the "sequenceModifications" column; the frame
            passed in is not modified.

        Raises:
            IndexError: A modification position lies beyond the end of the
                sequence and the entry is not one of the tolerated ids.
            KeyError: A modification symbol is missing from `seq_mod_dict`.
        """
        df = df.copy()
        for idx, row in df.iterrows():
            mods = row["sequenceModifications"]
            seq = row["sequence"]
            if not isinstance(mods, list) or not isinstance(seq, str):
                continue
            residues = list(seq)
            # modify the sequences from the back to the front
            for mod in sorted(mods, key=lambda d: d["position"], reverse=True):
                aa_pos = mod["position"] - 1
                new_aa = self.seq_mod_dict[mod["symbol"]]
                try:
                    residues[aa_pos] = new_aa
                except IndexError as err:
                    if row.get("id") in self._OUT_OF_RANGE_IDS:
                        continue
                    raise IndexError(
                        f"modification position {mod['position']} lies outside "
                        f"the sequence of entry {row.get('id')!r}"
                    ) from err
            # Label-based scalar assignment: the chained form
            # `df.iloc[idx]["sequence"] = ...` may write to a temporary copy
            # and leave the frame unchanged.
            df.at[idx, "sequence"] = "".join(residues)
        return df.drop(columns=["sequenceModifications"])

    def fill_data(self, root: "ET.Element | None" = None) -> None:
        """Parse all entries and store the resulting table in `self.df`.

        Args:
            root: Root element whose children are the entries. When omitted,
                the file at `self.xml_path` is parsed instead.

        Entries without a sequence are not part of the stored table.
        """
        if root is None:
            root = ET.parse(self.xml_path).getroot()
        all_data = {tag: [] for tag in self.tags}
        for entry in root:
            entry_data = self._extract_entry_info(entry)
            for tag in self.tags:
                all_data[tag].append(entry_data[tag])
        df = pd.DataFrame(all_data)
        df = self._adapt_sequences(df)
        self.df = df[df["sequence"].notnull()]


def extract_seq_mod(seq_mods_path: Path) -> dict[str, str]:
    """Read the modification table and map each symbol to its replacement.

    The CSV needs the columns "symbol" and "canonical_aa". In "canonical_aa"
    the literal "<empty>" stands for an empty replacement string, and a
    missing value is turned into "X".

    Args:
        seq_mods_path: Path of the CSV file.

    Returns:
        Dict mapping every symbol to its replacement text.
    """
    df_mod = pd.read_csv(seq_mods_path)
    # Work on the returned Series instead of `column.replace(..., inplace=True)`:
    # an in-place call on a column selection is not guaranteed to reach the
    # frame it was taken from.
    canonical_aa = df_mod["canonical_aa"].replace("<empty>", "").fillna("X")
    return dict(zip(df_mod["symbol"], canonical_aa))


def df2fasta(df, fasta_path: "str | Path", header_col: str, seq_col: str) -> None:
    """Write one FASTA record per row of `df`.

    Args:
        df: Table holding the records.
        fasta_path: File to write; an existing file is overwritten.
        header_col: Column whose value becomes the header line (after ">").
        seq_col: Column whose value becomes the sequence line.
    """
    with open(fasta_path, "w") as out:
        # The column names come from the arguments, not from fixed
        # "id"/"sequence" names, so any pair of columns can be exported.
        for header, sequence in zip(df[header_col], df[seq_col]):
            out.write(f">{header}\n")
            out.write(f"{sequence}\n")


# Load the symbol replacements from `seq_mods_path2`, parse all entries of
# `root` and export every retained sequence to `fasta_path`, keyed by its id.
seq_mod_dict = extract_seq_mod(seq_mods_path2)
parser = ParseConoserverXML(xml_new, tags, seq_mod_dict)
parser.fill_data(root)
df2fasta(parser.df, fasta_path, "id", "sequence")

In [11]:
# Feature table: the parsed entries without the columns listed below, with
# "id" moved to the front.
columns_to_drop = [
    "alternativeNames",
    "averageMass",
    "name",
    "nucleicAcid",
    "sequenceRegions",  # TODO: to implement
]
df = parser.df.drop(columns=columns_to_drop)
# TODO: "extinctionCoefficient", "isoelecticPoint" and "monoisotopicMass" are
# exported as they are ("integers not working yet").
id_col = df.pop("id")
df.insert(0, "id", id_col)
df.to_csv(feature_csv_path, index=False)

In [12]:
# Subset of the feature table restricted to four pharmacological families.
sub_base = Path("pharmaFam")
# Create the output directory up front; writing the files below fails when it
# does not exist.
sub_base.mkdir(parents=True, exist_ok=True)
sub_fasta = sub_base / "pharmaFam.fasta"
sub_feature = sub_base / "pharmaFam.csv"
subsection = ["alpha conotoxin", "omega conotoxin", "mu conotoxin", "delta conotoxin"]
# `reset_index` returns a new frame, so the column shuffling below does not
# operate on a slice of `df`.
df_sub = df[df["pharmacologicalFamily"].isin(subsection)].reset_index(drop=True)
id_col = df_sub.pop("id")
df_sub.insert(0, "id", id_col)
df2fasta(df=df_sub, fasta_path=sub_fasta, header_col="id", seq_col="sequence")
# The sequences go to the FASTA file only; the CSV keeps the other features.
df_sub = df_sub.drop(columns=["sequence"])
df_sub.to_csv(sub_feature, index=False)

## Get signal, mature, and full peptide/protein sequences

In [13]:
import numpy as np

# get full sequence (aka. signal peptide + active site)
# NOTE(review): the three containers below are created but never filled in
# this cell; code that iterates over `signal_seqs` or `mature_seqs` later on
# sees them empty unless they are filled elsewhere.
dict_prot2mature = {}
signal_seqs = {}
mature_seqs = {}
df = parser.df

# Columns of the per-entry table, in output order.
region_columns = [
    "id",
    "full_seq",
    "signal_seq",
    "mature_seq",
    "mature_id",
    "mature_id_seq",
    "pharmacologicalFamily",
    "cysteineFramewrok",
]
# Every column exists from the start, so `data` has the same layout even when
# no entry carries sequence regions.
data = {key: [] for key in region_columns}
for idx, row in df.iterrows():
    seq_regs = row["sequenceRegions"]
    # Only entries whose regions were parsed into a list are processed.
    if not isinstance(seq_regs, list):
        continue
    cur_data = {key: pd.NA for key in region_columns}
    uid = row["id"]
    seq = row["sequence"]
    # Set once per entry, independent of how many regions follow.
    cur_data["id"] = uid
    cur_data["full_seq"] = seq
    for seq_reg in seq_regs:
        # "start"/"end" are used as 1-based, inclusive coordinates.
        # NOTE(review): `seq` is the sequence after the modifications were
        # applied; if a replacement is an empty string the sequence gets
        # shorter and these coordinates may be shifted -- confirm.
        start, end = int(seq_reg["start"]) - 1, int(seq_reg["end"])
        sub_seq = seq[start:end]
        region_type = seq_reg["type"]
        if region_type == "signal sequence":
            cur_data["signal_seq"] = sub_seq
        elif region_type == "mature peptide":
            # A later mature-peptide region of the same entry overwrites the
            # values of an earlier one.
            cur_data["mature_seq"] = sub_seq
            mature_prot_id = seq_reg.get("regionId", "")
            cur_data["mature_id"] = mature_prot_id
            mature_id_entry = df[df["id"] == mature_prot_id]
            # The referenced entry is only looked up when the region id
            # matches exactly one parsed entry; otherwise the three
            # cross-referenced fields stay missing.
            if len(mature_id_entry) != 1:
                continue
            mature_row = mature_id_entry.iloc[0]
            cur_data["mature_id_seq"] = mature_row["sequence"]
            cur_data["pharmacologicalFamily"] = mature_row["pharmacologicalFamily"]
            cur_data["cysteineFramewrok"] = mature_row["cysteineFramewrok"]
    for key, value in cur_data.items():
        data[key].append(value)

In [36]:
df_sub = pd.DataFrame(data)
# remove sequences that have no "mature_seq" and "mature_id"
df_sub = df_sub[df_sub["mature_id"].notna()]

# some entries miss one of: "signal_seq", "mature_id_seq", or "pharmacologicalFamily"
# `info()` prints its summary itself and returns None, so it is not wrapped in
# print().
df_sub.info()


# NOTE: this rebinds the notebook-level `base` to the current directory.
base = Path(".")
for seq_col in ["full_seq", "signal_seq", "mature_seq", "mature_id_seq"]:
    dset_base = base / seq_col
    fasta = dset_base / f"{seq_col}.fasta"
    if not dset_base.is_dir():
        # report the data sets whose directory had to be created
        print(seq_col)
        dset_base.mkdir()
    # Entries that lack this kind of sequence are left out; writing them would
    # put the text "<NA>" into the file as if it were a sequence.
    present = df_sub[df_sub[seq_col].notna()]
    with open(fasta, "w") as out:
        for uid, sequence in zip(present["id"], present[seq_col]):
            out.write(f">{uid}\n")
            out.write(f"{sequence}\n")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2833 entries, 0 to 2910
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     2833 non-null   object
 1   full_seq               2833 non-null   object
 2   signal_seq             2538 non-null   object
 3   mature_seq             2833 non-null   object
 4   mature_id              2833 non-null   object
 5   mature_id_seq          2807 non-null   object
 6   pharmacologicalFamily  153 non-null    object
dtypes: object(7)
memory usage: 177.1+ KB
None
full_seq
signal_seq
mature_seq
mature_id_seq


In [32]:
# Rebuild the per-entry table, keep only the rows that have a "mature_id" and
# show the column summary.
df_sub = pd.DataFrame(data).dropna(subset=["mature_id"])

df_sub.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2833 entries, 0 to 2910
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     2833 non-null   object
 1   full_seq               2833 non-null   object
 2   signal_seq             2538 non-null   object
 3   mature_seq             2833 non-null   object
 4   mature_id              2833 non-null   object
 5   mature_id_seq          2807 non-null   object
 6   pharmacologicalFamily  153 non-null    object
dtypes: object(7)
memory usage: 177.1+ KB


In [27]:
# Number of entries without a "mature_id" before they are filtered out.
# `df_m` is not defined in any visible cell, so the unfiltered table is built
# from `data` directly.
# NOTE(review): assumes `df_m` was the unfiltered `pd.DataFrame(data)` -- confirm.
pd.DataFrame(data)["mature_id"].isna().sum()

78

In [15]:
# save signal peptide
# NOTE(review): `signal_seqs` is created empty and its only assignment is
# commented out in the visible cells, so both files come out without records
# unless it is filled elsewhere -- confirm whether this cell is still needed.
name = "signal"
base_path = Path(name)
base_path.mkdir(parents=True, exist_ok=True)
# Cell-specific names: the notebook-level `fasta_path` from the configuration
# cell is left untouched, so re-running earlier cells still writes to the
# intended file.
signal_fasta_path = base_path / f"{name}.fasta"
signal_csv_path = base_path / f"{name}.csv"

with open(signal_fasta_path, "w") as out:
    for uid, signal_seq in signal_seqs.items():
        out.write(f">{uid}\n")
        out.write(f"{signal_seq}\n")

# `.copy()` gives an independent frame, so moving the id column to the front
# does not operate on a slice of `df`.
df_tmp = df[df["id"].isin(list(signal_seqs))].copy()
id_col = df_tmp.pop("id")
df_tmp.insert(0, "id", id_col)
df_tmp.to_csv(signal_csv_path, index=False)

In [16]:
# save mature peptide
# NOTE(review): `mature_seqs` is created empty and never filled in the visible
# cells, so both files come out without records unless it is filled
# elsewhere -- confirm whether this cell is still needed.
name = "mature"
base_path = Path(name)
base_path.mkdir(parents=True, exist_ok=True)
# Cell-specific names: the notebook-level `fasta_path` from the configuration
# cell is left untouched, so re-running earlier cells still writes to the
# intended file.
mature_fasta_path = base_path / f"{name}.fasta"
mature_csv_path = base_path / f"{name}.csv"

uids = []
with open(mature_fasta_path, "w") as out:
    for uid, mature_seq in mature_seqs.items():
        # The unpacking requires keys with exactly one underscore, read as
        # "<full protein id>_<mature id>".
        full_prot_id, mature_id = uid.split("_")
        uids.append(mature_id)
        out.write(f">{full_prot_id}\n")
        out.write(f"{mature_seq}\n")

# `.copy()` gives an independent frame, so moving the id column to the front
# does not operate on a slice of `df`.
df_tmp = df[df["id"].isin(uids)].copy()
id_col = df_tmp.pop("id")
df_tmp.insert(0, "id", id_col)
df_tmp.to_csv(mature_csv_path, index=False)

In [20]:
# Show the mature ids collected while the mature-peptide FASTA file was written.
uids

[]

## TODO
- check if parentID tree can be constructed

In [137]:
# Distinct parent ids in ascending order; `value_counts` leaves out missing
# values by default.
parent_ids = df["parentID"].value_counts().index
print(sorted(parent_ids))

['P00001', 'P00004', 'P00010', 'P00015', 'P00023', 'P00026', 'P00032', 'P00039', 'P00050', 'P00051', 'P00074', 'P00095', 'P00098', 'P00099', 'P00405', 'P00595', 'P00840', 'P01309', 'P01338', 'P01356', 'P01372', 'P01384', 'P01386', 'P01397', 'P01537', 'P01546', 'P01556', 'P01571', 'P01594', 'P01611', 'P01615', 'P01616', 'P01634', 'P01737', 'P01793', 'P02228', 'P02485', 'P02520', 'P02539', 'P02549', 'P02576', 'P02585', 'P02586', 'P02587', 'P02588', 'P02591', 'P02594', 'P02608', 'P02665', 'P02707', 'P02752', 'P02835', 'P02881', 'P02901', 'P03625', 'P03641', 'P03752', 'P03901', 'P03906', 'P04480', 'P04622', 'P04661', 'P05344', 'P05345', 'P05840', 'P05841', 'P05949', 'P06614', 'P06615', 'P06732', 'P06776', 'P06810', 'P06814', 'P06822', 'P06831', 'P06858', 'P06897', 'P06934', 'P06935', 'P07401', 'P07403', 'P07410', 'P07434', 'P07466', 'P07480', 'P07497', 'P07646', 'P08412', 'P08506', 'P08508', 'P08686', 'P08893', 'P08896', 'P08909', 'P08991', 'P09000', 'P09015', 'P10267', 'P10407', 'P10418',