In [None]:
import pandas as pd

In [None]:
# Read the semicolon-separated SKEMPI v2 table.
df = pd.read_csv("/data/dzeiberg/stability/skempi_v2.csv", sep=";")

In [None]:
# Display the loaded SKEMPI table for a quick visual check.
df

In [None]:
# Distinct "#Pdb" identifiers, keeping only the part before the first underscore
# (entries without an underscore are kept whole).
skempi_pdb = {entry.split("_", 1)[0] for entry in df["#Pdb"].values}

In [None]:
# Count distinct comma-separated tokens across the "Mutation(s)_cleaned" column.
joined_mutations = ",".join(df["Mutation(s)_cleaned"].values)
mutation_tokens = joined_mutations.split(",")
len(set(mutation_tokens))

In [None]:
import networkx as nx

# Load the edgotype graph with networkx's GEXF reader.
# NOTE(review): the extension is spelled "gefx" rather than "gexf"; presumably
# this matches the file name on disk — confirm.
edgotype = nx.read_gexf("data/y2hEdgotyping/edgotype.gefx")

In [None]:
import os
import pandas as pd
# Read the six tab-separated UniProt scan tables (sequence_0 .. sequence_5)
# and stack them into a single frame.
seqFiles = [
    pd.read_csv(f"data/y2hEdgotyping/uniprotScan/sequence_{part}.tsv", sep="\t")
    for part in range(6)
]

uniprotMatches = pd.concat(seqFiles)
def mergeWithUniprot(graph, matches=None,
                     structure_dir="/data/dzeiberg/alphafold/predictions"):
    """Annotate every node of ``graph`` with its UniProt matches and AlphaFold files.

    For each node, the rows of ``matches`` whose ``Sequence`` equals the node's
    ``"seq"`` attribute, whose ``Reviewed`` is ``"reviewed"`` and whose
    ``Organism`` is ``"Homo sapiens (Human)"`` are stored under the node
    attribute ``"uniprotMatches"``. The node attribute ``"alphafoldStructures"``
    receives the list of ``AF-<Entry>-F1-model_v4.pdb.gz`` paths under
    ``structure_dir`` that exist on disk, one candidate per matched ``Entry``.

    Parameters
    ----------
    graph :
        Object exposing ``nodes(data=True)`` and ``nodes[node_id]`` (e.g. a
        networkx graph). Every node must carry a ``"seq"`` attribute.
    matches : pandas.DataFrame, optional
        Table with ``Sequence``, ``Reviewed``, ``Organism`` and ``Entry``
        columns. Defaults to the notebook-level ``uniprotMatches`` frame.
    structure_dir : str, optional
        Directory searched for AlphaFold model files.

    Returns
    -------
    The same ``graph`` object; it is modified in place, not copied.

    Raises
    ------
    KeyError
        If a node has no ``"seq"`` attribute.
    """
    if matches is None:
        matches = uniprotMatches

    # The reviewed/human part of the filter does not depend on the node, so it
    # is applied once here instead of once per node.
    curated_human = matches[(matches.Reviewed == "reviewed") &
                            (matches.Organism == "Homo sapiens (Human)")]

    for node_id, attrs in graph.nodes(data=True):
        up = curated_human[curated_human.Sequence == attrs["seq"]]
        graph.nodes[node_id]["uniprotMatches"] = up

        alphafoldStructures = []
        for uniprot_id in up["Entry"]:
            fp = f"{structure_dir}/AF-{uniprot_id}-F1-model_v4.pdb.gz"
            # Only keep structures that are actually present on disk.
            if os.path.isfile(fp):
                alphafoldStructures.append(fp)
        graph.nodes[node_id]["alphafoldStructures"] = alphafoldStructures
    return graph

In [None]:
# mergeWithUniprot writes its annotations onto the graph it is given and returns
# that same object, so edgotype_x and edgotype refer to the same graph.
edgotype_x = mergeWithUniprot(edgotype)

In [None]:
import itertools

In [None]:
# Collect every ";"-separated token from the PDB column of all nodes' UniProt matches.
pdb_columns = [attrs["uniprotMatches"].PDB for _, attrs in edgotype_x.nodes(data=True)]
pdb_fields = pd.concat(pdb_columns).dropna()
edgotype_pdb = set(";".join(pdb_fields).split(";"))

In [None]:
# For each node whose first UniProt match shares a PDB id with SKEMPI, record
# that match's PDB ids and the "Substitution" values of the node's edges.
nodeInt = {}
for node_id, attrs in edgotype_x.nodes(data=True):
    node_matches = attrs["uniprotMatches"]
    if not len(node_matches):
        continue
    # Only the first match is consulted; a missing PDB field is not a str.
    first_pdb_field = node_matches["PDB"].values[0]
    if type(first_pdb_field) is not str:
        continue
    node_pdb_ids = set(first_pdb_field.split(";"))
    if not len(skempi_pdb.intersection(node_pdb_ids)):
        continue
    nodeInt[node_id] = {
        "pdb": node_pdb_ids,
        "subs": [edge["Substitution"]
                 for _, _, edge in edgotype_x.edges(node_id, data=True)],
    }

In [None]:
# Subgraph over the SKEMPI-overlapping nodes together with all of their neighbors.
skempi_hits = list(nodeInt.keys())
skempi_neighbors = [nbr for hit in nodeInt.keys() for nbr in edgotype_x.neighbors(hit)]
skempiSub = edgotype_x.subgraph(skempi_hits + skempi_neighbors)

In [None]:
# Per node: id, number of PDB ids, number of substitutions.
for node_id, info in nodeInt.items():
    n_pdb = len(info["pdb"])
    n_subs = len(info["subs"])
    print(node_id, n_pdb, n_subs)

In [None]:
# Spot-check one node: its PDB ids and the substitutions on its edges.
# Raises KeyError if this gene id is not a key of nodeInt.
nodeInt["ENSG00000150337"]

# Interactome Insider Overlap

In [None]:
# Read the tab-separated Interactome Insider interface table.
ii = pd.read_csv("/data/dzeiberg/interactomeInsider/H_sapiens_interfacesHQ.txt", sep="\t")

In [None]:
# List the distinct values found in the Source column.
ii.Source.unique()

In [None]:
# Keep only the rows whose Source is "PDB" or "I3D".
kept_sources = ["PDB", "I3D"]
ii = ii[ii["Source"].isin(kept_sources)]

In [None]:
# Map each node id to the array of UniProt Entry values matched for that node.
ens2uni = {
    node_id: attrs["uniprotMatches"].Entry.values
    for node_id, attrs in edgotype_x.nodes(data=True)
}

In [None]:
# One record per edge whose two endpoints both have at least one UniProt match:
# [first Entry of source, first Entry of target, source node id, target node id].
uniEdges = [
    [ens2uni[src][0], ens2uni[dst][0], src, dst]
    for src, dst in edgotype_x.edges()
    if len(ens2uni[src]) and len(ens2uni[dst])
]

In [None]:
# UniProt accession pairs for the mapped edges. Built directly from the Python
# lists in uniEdges: the previous numpy round-trip was unnecessary and relied on
# `np`, which this notebook never imports.
su = {tuple(record[:2]) for record in uniEdges}

# (P1, P2) pairs from the Interactome Insider table, in column order.
# NOTE(review): membership tests against sii are orientation-sensitive, whereas
# the later per-edge lookup accepts both (P1, P2) and (P2, P1) — confirm intent.
sii = set(map(tuple, ii[["P1", "P2"]].values))

In [None]:
# Pairs present both among the mapped edges and in the interface table.
su & sii

In [None]:
# Node ids of both endpoints of every edge whose accession pair appears in sii.
# set().union(...) is used instead of set.union(...) so that an empty overlap
# yields an empty set rather than a TypeError.
subgraphNodes = set().union(
    *(record[2:] for record in uniEdges if tuple(record[:2]) in sii)
)

In [None]:
# Subgraph over the endpoints of the matched edges. Because it is built from a
# node set, it can also contain edges between those nodes whose own accession
# pair was not found in sii.
ii_subgraph = edgotype_x.subgraph(subgraphNodes)

In [None]:
# Number of edges in the Interactome Insider subgraph.
ii_subgraph.number_of_edges()

In [None]:
# One record per subgraph edge: first UniProt Entry of the db and ad genes named
# in the edge attributes, plus the edge attribute dict itself.
edgedf = [
    {
        "Pi": ens2uni[attrs["db_ensembl_gene_id_mt"]][0],
        "Pj": ens2uni[attrs["ad_ensembl_gene_id_mt"]][0],
        "edge": attrs,
    }
    for _, _, attrs in ii_subgraph.edges(data=True)
]

In [None]:
# Attach to every record the interface rows for its protein pair, accepting
# either orientation; records without a hit get an empty DataFrame.
for record in edgedf:
    prot_i, prot_j = record["Pi"], record["Pj"]
    same_order = (ii.P1 == prot_i) & (ii.P2 == prot_j)
    swapped_order = (ii.P1 == prot_j) & (ii.P2 == prot_i)
    pair_rows = ii[same_order | swapped_order]
    record["ii_matches"] = pair_rows if len(pair_rows) else pd.DataFrame()

In [None]:
def _ires_positions(ires):
    """Expand an interface-residue string such as "[3-5,9]" into {"3", "4", "5", "9"}.

    The first and last characters (the brackets) are dropped and the rest is
    split on commas. A token of the form "a-b" with numeric ends is expanded to
    every position from a to b inclusive; any other token is kept verbatim.
    Positions are returned as strings so they compare directly with the
    position sliced out of the amino-acid change.
    """
    positions = set()
    for token in ires[1:-1].split(","):
        token = token.strip()
        if not token:
            continue
        start, dash, end = token.partition("-")
        if dash and start.isdigit() and end.isdigit():
            # Keep the literal endpoints and add everything in between; the
            # earlier replace("-", ",") approach kept the endpoints only, so a
            # mutation strictly inside a range was classified as outside.
            positions.update((start, end))
            positions.update(str(pos) for pos in range(int(start), int(end) + 1))
        else:
            positions.add(token)
    return positions


# Split the matched edges by whether the mutated position is listed among the
# interface residues of the db protein (Pi) in the first matching interface row.
# Edges without any interface match go into neither list.
inside = []
outside = []
for e in edgedf:
    # assumes aa_change_mt looks like "Ala123Gly" (3-letter codes around the position) — TODO confirm
    loc = e["edge"]["aa_change_mt"][3:-3]
    if not len(e["ii_matches"]):
        continue
    hit = e["ii_matches"].iloc[0]
    p1r = _ires_positions(hit["P1_IRES"])
    p2r = _ires_positions(hit["P2_IRES"])
    # Pi may appear as either P1 or P2 in the interface row.
    if (e["Pi"] == hit["P1"] and loc in p1r) or (e["Pi"] == hit["P2"] and loc in p2r):
        inside.append(e)
    else:
        outside.append(e)

In [None]:
# Number of matched edges whose mutation position was found among the interface residues.
len(inside)

In [None]:
# Number of matched edges whose mutation position was not found among the interface residues.
len(outside)


# Findings

242 instances in the edgotype data have matches in the Interactome Insider High Quality dataset.

In 21 of the 242, the mutation falls inside the db protein's predicted interacting residue range.

In [None]:
def calc_score(e):
    """Return the summed wild-type minus mutant difference over five readouts.

    For each of the prefixes ``LWH1_f_``, ``LWH10_f_``, ``LWH25_f_``,
    ``LWA_f_`` and ``LWAH1_f_``, reads ``e[prefix + "wt"]`` and
    ``e[prefix + "mt"]`` and adds ``wt - mt`` to the total.
    """
    prefixes = ("LWH1_f_", "LWH10_f_", "LWH25_f_", "LWA_f_", "LWAH1_f_")
    return sum(e[prefix + "wt"] - e[prefix + "mt"] for prefix in prefixes)

In [None]:
# Score the edge attributes of every record in each group.
insideScores = list(map(calc_score, (record["edge"] for record in inside)))
outsideScores = list(map(calc_score, (record["edge"] for record in outside)))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Side-by-side density histograms of the summed wt - mt scores for the two
# groups, on a shared y axis and a common x range so the panels are comparable.
fig, ax = plt.subplots(1, 2, sharey=True)
panels = [
    (ax[0], insideScores, "At Predicted Binding Residue"),
    (ax[1], outsideScores, "Not at Predicted Binding Residue"),
]
for panel, scores, title in panels:
    panel.hist(scores, density=True)
    panel.set_title(title)
    panel.set_xlabel("Summed wt - mt score")
    panel.set_xlim(-20, 20)
# The trailing semicolon keeps the returned Text object out of the cell output.
ax[0].set_ylabel("Density");