In [None]:
import gzip

from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Polypeptide import PPBuilder
# Shared polypeptide builder used by getSeq below.
ppb = PPBuilder()


def getSeq(pdb):
    """Lazily yield, as a plain string, the sequence of every peptide that
    the module-level ``ppb`` builds from ``pdb``."""
    yield from (str(peptide.get_sequence()) for peptide in ppb.build_peptides(pdb))

import networkx as nx

# Load the edgotyping graph with networkx's GEXF reader.
# NOTE(review): the extension is spelled "gefx" rather than "gexf" — confirm it matches the file on disk.
edgotype = nx.read_gexf("data/y2hEdgotyping/edgotype.gefx")

import os
import pandas as pd
# Read the six tab-separated UniProt scan files (sequence_0 .. sequence_5)
# and stack them into a single table.
seqFiles = [
    pd.read_csv(f"data/y2hEdgotyping/uniprotScan/sequence_{i}.tsv", sep="\t")
    for i in range(6)
]

uniprotMatches = pd.concat(seqFiles)

import Bio
import Bio.PDB
import Bio.SeqRecord

# QUIET=True is what actually suppresses PDBConstructionWarning while structures
# are being built; the previous QUIET=False left the warnings enabled.
pdbparser = Bio.PDB.PDBParser(QUIET=True)
from tqdm import tqdm
def mergeWithUniprot(graph, alphafoldDir="/data/dzeiberg/alphafold/predictions"):
    """Attach UniProt matches and AlphaFold structures to every node of ``graph``.

    For each node, the rows of the module-level ``uniprotMatches`` table whose
    ``Sequence`` equals the node's ``"seq"`` attribute, and which are reviewed
    human entries, are stored under the node attribute ``"uniprotMatches"``.
    For every ``Entry`` in those rows, the file
    ``AF-<Entry>-F1-model_v4.pdb.gz`` is looked up in ``alphafoldDir``; each
    file that exists is parsed with the module-level ``pdbparser`` and the
    resulting structures are stored as a list under ``"alphafoldStructures"``.

    Parameters
    ----------
    graph : graph whose nodes carry a ``"seq"`` attribute; modified in place.
    alphafoldDir : directory holding the gzipped AlphaFold PDB predictions.
        Defaults to the location that was previously hard-coded.

    Returns
    -------
    The same ``graph`` object, with the two attributes set on every node.
    """
    # The reviewed/human restriction does not depend on the node, so apply it
    # once here instead of re-scanning the whole table for every node.
    reviewedHuman = uniprotMatches[(uniprotMatches.Reviewed == "reviewed") &
                                   (uniprotMatches.Organism == "Homo sapiens (Human)")]
    for nodeId, attrs in tqdm(graph.nodes(data=True), total=len(graph.nodes())):
        up = reviewedHuman[reviewedHuman.Sequence == attrs["seq"]]
        graph.nodes[nodeId]["uniprotMatches"] = up
        alphafoldStructures = []
        for uniprot_id in up["Entry"]:
            fp = os.path.join(alphafoldDir, f"AF-{uniprot_id}-F1-model_v4.pdb.gz")
            # Entries without a prediction file on disk are skipped silently.
            if os.path.isfile(fp):
                with gzip.open(fp, "rt", encoding='utf-8') as gz:
                    # The file path doubles as the structure id.
                    struc = pdbparser.get_structure(fp, gz)
                alphafoldStructures.append(struc)
        graph.nodes[nodeId]["alphafoldStructures"] = alphafoldStructures

    return graph

# Annotate every node with its UniProt matches and any AlphaFold structures found on disk.
edgotype = mergeWithUniprot(edgotype)

In [None]:
# Materialise the (node id, attribute dict) pairs as a list.
nodes = list(edgotype.nodes(data=True))

In [None]:
from itertools import chain

In [None]:
# Split nodes into "good" (some AlphaFold structure yields a peptide whose
# sequence equals the node's sequence) and "bad" (no structure at all, or
# no peptide with an identical sequence).
badNodes, goodNodes = [], []
for ensg, node in nodes:
    structures = node["alphafoldStructures"]
    if len(structures) == 0:
        badNodes.append(ensg)
        print(f"no prediction for {ensg}")
        continue
    # Gather the peptide sequences of every structure attached to this node.
    peptideSeqs = []
    for structure in structures:
        peptideSeqs.extend(getSeq(structure))
    if node["seq"] in peptideSeqs:
        goodNodes.append(ensg)
    else:
        badNodes.append(ensg)
        print(ensg)

In [None]:
# Number of nodes whose sequence appears among the peptide sequences of their AlphaFold structures.
len(goodNodes)

In [None]:
# Total number of nodes in the graph, for comparison.
len(nodes)

In [None]:
# Number of nodes with no structure on disk, or with no structure peptide equal to their sequence.
len(badNodes)

# Many nodes didn't have an AlphaFold DB match, but only 1 had a misaligned sequence.

In [None]:
# Re-display the bad-node count before breaking it down.
len(badNodes)

In [None]:
# Break the bad nodes down by whether any UniProt row was attached to them:
#   nomatchseqs -> (node id, sequence) pairs with an empty match table
#   matchnodes  -> node ids that do have at least one UniProt match
nomatchseqs, matchnodes = [], []
for ensg in badNodes:
    n = edgotype.nodes[ensg]
    if len(n["uniprotMatches"]) > 0:
        matchnodes.append(ensg)
    else:
        nomatchseqs.append((ensg, n["seq"]))

In [None]:
# Number of bad nodes whose UniProt match table is empty.
len(nomatchseqs)

In [None]:
# Peek at the first (node id, sequence) pair that had no UniProt match.
nomatchseqs[0]

In [None]:
# for tup in nomatchseqs[:100]:
#     print(tup[1])

In [None]:
# for tup in nomatchseqs[100:]:
#     print(tup[1])

In [None]:
# Every bad node lands in exactly one of the two lists, so the last two counts should sum to the first.
len(badNodes),len(nomatchseqs),len(matchnodes)

In [None]:
# For each bad node that does have UniProt rows, show the node id, the first
# matched entry and the AlphaFoldDB column values of all matched rows.
for m in matchnodes:
    node = edgotype.nodes[m]
    hits = node["uniprotMatches"]
    print(m, hits.Entry.values[0], hits.AlphaFoldDB.values)

In [None]:
# Inspect the parsed AlphaFold structures attached to one specific node.
edgotype.nodes(data=True)["ENSG00000185303"]["alphafoldStructures"]

In [None]:
# Peptide sequences extracted from that node's first AlphaFold structure.
list(getSeq(edgotype.nodes(data=True)["ENSG00000185303"]["alphafoldStructures"][0]))

In [None]:
# The same node's own sequence, for comparison with the structure-derived sequences.
edgotype.nodes(data=True)["ENSG00000185303"]["seq"]

In [None]:
def addSubs(graph):
    """Store on every node, under ``"subs"``, the de-duplicated list of
    ``"Substitution"`` values taken from the node's incident edges whose
    ``"db_ensembl_gene_id_mt"`` attribute equals that node.

    The graph is updated in place and also returned.
    """
    subsByNode = {}
    for n in graph.nodes():
        uniqueSubs = set(
            attrs["Substitution"]
            for _, _, attrs in graph.edges(n, data=True)
            if attrs["db_ensembl_gene_id_mt"] == n
        )
        subsByNode[n] = {"subs": list(uniqueSubs)}
    # Write all node attributes in a single call once they are collected.
    nx.set_node_attributes(graph, subsByNode)
    return graph

In [None]:
# Attach the per-node substitution lists to the graph.
edgotype = addSubs(edgotype)

In [None]:
# Rebuild the list of (node id, attribute dict) pairs.
nodes = list(edgotype.nodes(data=True))

In [None]:
# Number of distinct substitutions recorded for the first node.
len(nodes[0][1]["subs"])

In [None]:
# numpy is not imported anywhere earlier in the notebook, so import it here to
# keep this cell runnable after a kernel restart.
import numpy as np

# For every node, flag which of its substitutions agree with the node sequence:
# match[i] is True when the residue at the substitution's 1-based position
# equals the substitution's original residue (its first character).
for n in nodes:
    seq = n[1]["seq"]
    match = np.ones(len(n[1]["subs"]), dtype=bool)
    for i, s in enumerate(n[1]["subs"]):
        og, loc = s[0], int(s[1:-1]) - 1
        # A position outside the sequence cannot match. Checking the bounds
        # avoids an IndexError past the end and stops position 0 (index -1)
        # from silently comparing against the last residue.
        if not (0 <= loc < len(seq)) or seq[loc] != og:
            match[i] = False
    nx.set_node_attributes(edgotype, {n[0]: {"match": match}})

In [None]:
# For each node that has at least one substitution, print how many of them
# matched the sequence next to the total number of substitutions.
for ensg, n in edgotype.nodes(data=True):
    flags = n["match"]
    if len(flags) == 0:
        continue
    print(flags.sum(), len(flags))

In [None]:
# Sequence of the first node yielded by the graph's node iterator.
seq = next(iter(edgotype.nodes(data=True)))[1]["seq"]

In [None]:
# Display that sequence.
seq