In [None]:
import pandas as pd

In [None]:
# Read the six tab-separated UniProt scan result files (sequence_0 .. sequence_5),
# one DataFrame per file.
seqFiles = [
    pd.read_csv(f"data/y2hEdgotyping/uniprotScan/sequence_{i}.tsv", sep="\t")
    for i in range(6)
]

In [None]:
# Stack the per-file scan results row-wise into a single table. The original
# row labels of each file are kept, so index values repeat across files.
uniprotMatches = pd.concat(seqFiles, axis=0, ignore_index=False)

In [None]:
# Shape of the array of distinct values in the Sequence column.
uniprotMatches["Sequence"].unique().shape

In [None]:
import networkx as nx

In [None]:
import pickle


def _load_graph(path):
    """Unpickle and return the graph stored at ``path``.

    ``nx.read_gpickle`` was deprecated in NetworkX 2.6 and removed in 3.0;
    reading the file with ``pickle.load`` works on every NetworkX version.
    """
    # NOTE(security): unpickling executes arbitrary code embedded in the file;
    # only load .gpickle files that come from a trusted source.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Load the train / validation / test edgotype graphs.
edgotype_train = _load_graph("data/y2hEdgotyping/edgotype_train.gpickle")
edgotype_val = _load_graph("data/y2hEdgotyping/edgotype_val.gpickle")
edgotype_test = _load_graph("data/y2hEdgotyping/edgotype_test.gpickle")

In [None]:
import os

def mergeWithUniprot(graph, uniprot_matches=None,
                     alphafold_dir="/data/dzeiberg/alphafold/predictions"):
    """Annotate every node of ``graph`` with UniProt hits and AlphaFold files.

    Each node's ``"seq"`` attribute is compared for exact equality against the
    ``Sequence`` column of the UniProt table, keeping only rows where
    ``Reviewed == "reviewed"`` and ``Organism == "Homo sapiens (Human)"``.
    Two node attributes are then written:

    * ``"uniprotMatches"`` -- the matching rows (a DataFrame, possibly empty).
    * ``"alphafoldStructures"`` -- a list of paths of the form
      ``<alphafold_dir>/AF-<Entry>-F1-model_v4.pdb.gz``, one per matched
      ``Entry`` value, restricted to files that exist on disk.

    Parameters
    ----------
    graph
        Object exposing ``nodes(data=True)`` and ``nodes[node_id]`` (for
        example a networkx graph). It is modified in place.
    uniprot_matches : pandas.DataFrame, optional
        Table providing the ``Sequence``, ``Reviewed``, ``Organism`` and
        ``Entry`` columns. When omitted, the notebook-level ``uniprotMatches``
        table is used.
    alphafold_dir : str or os.PathLike, optional
        Directory that is searched for the AlphaFold model files.

    Returns
    -------
    The same ``graph`` object that was passed in.

    Raises
    ------
    KeyError
        If a node has no ``"seq"`` attribute.
    """
    if uniprot_matches is None:
        # Fall back to the table built earlier in the notebook.
        uniprot_matches = uniprotMatches

    # The reviewed/human restriction does not depend on the node, so apply it
    # once up front instead of re-scanning the full table for every node.
    curated = uniprot_matches[(uniprot_matches.Reviewed == "reviewed") &
                              (uniprot_matches.Organism == "Homo sapiens (Human)")]

    for node_id, attrs in graph.nodes(data=True):
        hits = curated[curated.Sequence == attrs["seq"]]
        graph.nodes[node_id]["uniprotMatches"] = hits

        # Keep only the predicted-structure files that are actually present.
        candidate_paths = (
            os.path.join(alphafold_dir, f"AF-{uniprot_id}-F1-model_v4.pdb.gz")
            for uniprot_id in hits["Entry"]
        )
        graph.nodes[node_id]["alphafoldStructures"] = [
            fp for fp in candidate_paths if os.path.isfile(fp)
        ]
    return graph

In [None]:
# Annotate the three splits in order (train, val, test); mergeWithUniprot
# returns the graph it was given, so the names are simply rebound.
edgotype_train, edgotype_val, edgotype_test = (
    mergeWithUniprot(split_graph)
    for split_graph in (edgotype_train, edgotype_val, edgotype_test)
)

# Analyze Hit Rate of Edgotype Nodes with UniProt

In [None]:
import numpy as np

## Train UniProt Matches

In [None]:
# Distribution of UniProt hit counts over training nodes: the first array holds
# the distinct per-node match counts, the second how many nodes have each count.
np.unique(
    [len(attrs["uniprotMatches"]) for _, attrs in edgotype_train.nodes(data=True)],
    return_counts=True,
)

## Train AlphaFold Matches

In [None]:
# Distribution of AlphaFold file counts over training nodes: distinct per-node
# counts, paired with the number of nodes having each count.
np.unique(
    [len(attrs["alphafoldStructures"]) for _, attrs in edgotype_train.nodes(data=True)],
    return_counts=True,
)

## Val UniProt Matches

In [None]:
# Distribution of UniProt hit counts over validation nodes: the first array holds
# the distinct per-node match counts, the second how many nodes have each count.
np.unique(
    [len(attrs["uniprotMatches"]) for _, attrs in edgotype_val.nodes(data=True)],
    return_counts=True,
)

## Val AlphaFold Matches

In [None]:
# Distribution of AlphaFold file counts over validation nodes: distinct per-node
# counts, paired with the number of nodes having each count.
np.unique(
    [len(attrs["alphafoldStructures"]) for _, attrs in edgotype_val.nodes(data=True)],
    return_counts=True,
)

In [None]:
# Probe the validation graph for a node that has no AlphaFold file on disk even
# though its UniProt hits include at least one non-null value in the PDB column.
# Prints "found" for the first such node and stops.
for id_, n in edgotype_val.nodes(data=True):
    if len(n["alphafoldStructures"]):
        # Node already has a structure file; not what we are looking for.
        continue
    if not len(n["uniprotMatches"]):
        # No UniProt hits at all, so there is no PDB column content to inspect.
        continue
    if n["uniprotMatches"].PDB.isna().all():
        # Every hit has a missing PDB value.
        continue
    print("found")
    break

In [None]:
# List the ids of training nodes whose UniProt match table is empty.
for n_id, n in edgotype_train.nodes(data=True):
    if len(n["uniprotMatches"]) != 0:
        continue
    print(n_id)

In [None]:
# List the ids of validation nodes whose UniProt match table is empty.
for n_id, n in edgotype_val.nodes(data=True):
    if len(n["uniprotMatches"]) != 0:
        continue
    print(n_id)

In [None]:
import pickle


def _save_graph(graph, path):
    """Pickle ``graph`` to ``path``, replacing any existing file.

    ``nx.write_gpickle`` was deprecated in NetworkX 2.6 and removed in 3.0;
    ``pickle.dump`` with the highest protocol writes the same kind of file
    and works on every NetworkX version.
    """
    with open(path, "wb") as fh:
        pickle.dump(graph, fh, pickle.HIGHEST_PROTOCOL)


# Persist the annotated graphs. These are the same paths the graphs were
# loaded from, so the un-annotated files are overwritten.
_save_graph(edgotype_train, "data/y2hEdgotyping/edgotype_train.gpickle")
_save_graph(edgotype_val, "data/y2hEdgotyping/edgotype_val.gpickle")
_save_graph(edgotype_test, "data/y2hEdgotyping/edgotype_test.gpickle")

In [None]:
# Show the first edge of the test graph together with its attribute dict.
next(iter(edgotype_test.edges.data()))

In [None]:
# Show the first node of the test graph together with its attribute dict.
next(iter(edgotype_test.nodes.data()))