In [1]:

import os
import torch 

from functools import partial
from graphein.ml.conversion import GraphFormatConvertor
from graphein.ml import ProteinGraphDataset
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.edges.distance import add_distance_threshold
from graphein.protein.features.nodes.amino_acid import (amino_acid_one_hot,
                                                        meiler_embedding)

# Overrides applied on top of Graphein's default ProteinGraphConfig:
#   * granularity "centroids" selects the node representation,
#   * each node gets the one-hot amino-acid and Meiler embedding features,
#   * edges come from `add_distance_threshold` with threshold=15 and
#     long_interaction_threshold=0 (presumably a 15 Å cut-off with no minimum
#     sequence separation -- TODO confirm against the Graphein docs).
params_to_change = dict(
    granularity="centroids",
    node_metadata_functions=[amino_acid_one_hot, meiler_embedding],
    edge_construction_functions=[
        partial(
            add_distance_threshold,
            long_interaction_threshold=0,
            threshold=15,
        ),
    ],
)

config = ProteinGraphConfig(**params_to_change)

# Converter from the NetworkX graphs to PyTorch Geometric objects.
# Only the attributes listed in `columns` are requested for the converted graph.
graph_format_convertor = GraphFormatConvertor(
    src_format="nx",
    dst_format="pyg",
    columns=[
        # graph structure / geometry
        "edge_index",
        "coords",
        "dist_mat",
        # identifiers
        "name",
        "node_id",
        # node features added through `node_metadata_functions`
        "amino_acid_one_hot",
        "meiler",
    ],
)


# Directory containing the input structure files.
# local_dir = "./dataset/pdb/"
local_dir = "/deeplearning/data/uniprot/selected_alphafold2"

# Full paths of every ``.pdb`` file directly inside `local_dir`.
# `os.listdir` returns entries in arbitrary order, so the list is sorted to keep
# the dataset ordering reproducible across runs and machines.
pdb_paths = sorted(
    os.path.join(local_dir, file_name)
    for file_name in os.listdir(local_dir)
    if file_name.endswith(".pdb")
)



# Build the dataset from the collected PDB files, using the graph config and
# the nx -> pyg converter defined above in this cell.
# `root` is the dataset's working directory (presumably where the processed
# graphs end up -- a later cell reads "dataset/af2/processed/").
ds = ProteinGraphDataset(
    # root="./dataset/pdb/",
    root="./dataset/af2/",
    pdb_paths=pdb_paths,
    graphein_config=config,
    graph_format_convertor=graph_format_convertor,
    num_cores=32,
)



  from .autonotebook import tqdm as notebook_tqdm
To use the Graphein submodule graphein.protein.features.sequence.embeddings, you need to install: biovec 
biovec cannot be installed via conda
To use the Graphein submodule graphein.protein.visualisation, you need to install: pytorch3d 
To do so, use the following command: conda install -c pytorch3d pytorch3d


Optionally, we can plot one of the graphs:

In [2]:
# from graphein.ml.visualisation import plot_pyg_data

# p = plot_pyg_data(ds[0],
#     colour_nodes_by="degree",
#     label_node_ids=False,
#     plot_title="Peptide backbone graph. Nodes coloured by degree.",
#     node_size_multiplier=1
#     )
# p.show()



Load the nsSNP data

In [3]:

import pandas as pd 
# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

# Read the tab-separated nsSNP table, drop the rows that have no UniProt
# accession, and order the remaining rows by accession.
nsSNP_df = (
    pd.read_csv(
        "../data/filter_data/statistic3d/clinvar_clean_for_statistic.csv",
        sep="\t",
    )
    .dropna(subset=["uniprot accession"])
    .sort_values("uniprot accession")
)




  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# nsSNP_df.head()

In [5]:
import re 
# Directory holding the pre-processed AF2 graphs.
af2_root_path = "dataset/af2/processed/"

# Map accession -> path of its processed graph file.
# A file is considered when its extension is ".pt" and its name does not contain
# "pre"; the accession is the text between "AF-" and the next "-" in the file
# name. Files whose name does not follow that pattern are skipped instead of
# raising AttributeError on a failed regex match.
af2_structure_dict = {
    acc_match.group(): os.path.join(af2_root_path, file_name)
    for file_name, acc_match in (
        (candidate, re.search(r"(?<=AF-)[^-]*(?=-)", candidate))
        for candidate in os.listdir(af2_root_path)
        if os.path.splitext(candidate)[-1] == ".pt" and "pre" not in candidate
    )
    if acc_match is not None
}


In [6]:

# Accession of the protein inspected in the following cells.
uacc = "P29016"

# Rows of the nsSNP table whose "uniprot accession" equals `uacc`.
uacc_df = nsSNP_df.loc[nsSNP_df["uniprot accession"] == uacc]

# Load that protein's processed graph from disk.
# Note: torch.load unpickles the file, so only point it at trusted files.
uacc_wildType_graph = torch.load(af2_structure_dict[uacc])
# uacc_df["AAS"]

In [7]:
# Display the node identifiers stored under the graph's "node_id" key
# (entries in the recorded output look like 'A:MET:1').
uacc_wildType_graph["node_id"]

['A:MET:1',
 'A:LEU:2',
 'A:LEU:3',
 'A:LEU:4',
 'A:PRO:5',
 'A:PHE:6',
 'A:GLN:7',
 'A:LEU:8',
 'A:LEU:9',
 'A:ALA:10',
 'A:VAL:11',
 'A:LEU:12',
 'A:PHE:13',
 'A:PRO:14',
 'A:GLY:15',
 'A:GLY:16',
 'A:ASN:17',
 'A:SER:18',
 'A:GLU:19',
 'A:HIS:20',
 'A:ALA:21',
 'A:PHE:22',
 'A:GLN:23',
 'A:GLY:24',
 'A:PRO:25',
 'A:THR:26',
 'A:SER:27',
 'A:PHE:28',
 'A:HIS:29',
 'A:VAL:30',
 'A:ILE:31',
 'A:GLN:32',
 'A:THR:33',
 'A:SER:34',
 'A:SER:35',
 'A:PHE:36',
 'A:THR:37',
 'A:ASN:38',
 'A:SER:39',
 'A:THR:40',
 'A:TRP:41',
 'A:ALA:42',
 'A:GLN:43',
 'A:THR:44',
 'A:GLN:45',
 'A:GLY:46',
 'A:SER:47',
 'A:GLY:48',
 'A:TRP:49',
 'A:LEU:50',
 'A:ASP:51',
 'A:ASP:52',
 'A:LEU:53',
 'A:GLN:54',
 'A:ILE:55',
 'A:HIS:56',
 'A:GLY:57',
 'A:TRP:58',
 'A:ASP:59',
 'A:SER:60',
 'A:ASP:61',
 'A:SER:62',
 'A:GLY:63',
 'A:THR:64',
 'A:ALA:65',
 'A:ILE:66',
 'A:PHE:67',
 'A:LEU:68',
 'A:LYS:69',
 'A:PRO:70',
 'A:TRP:71',
 'A:SER:72',
 'A:LYS:73',
 'A:GLY:74',
 'A:ASN:75',
 'A:PHE:76',
 'A:SER:77',
 'A:ASP:

In [8]:
from Gandalf.utils import get_uacc_from_af2

# Pass the first entry of the graph's `name` field to `get_uacc_from_af2`
# (presumably it extracts the accession from that name); the recorded output
# 'P29016' equals the `uacc` used to load the graph.
get_uacc_from_af2(uacc_wildType_graph.name[0])

'P29016'

In [9]:

from Gandalf.mutant import generate_nsSNP_pyg
from graphein.protein.features.nodes.amino_acid import amino_acid_one_hot, meiler_embedding

# Generate a graph for the substitution "MET1VAL" from the wild-type graph.
# `node_metadata_functions` maps an attribute name to the Graphein feature
# function passed for it (presumably used to rebuild the node features of the
# mutated graph -- confirm in Gandalf.mutant).
test = generate_nsSNP_pyg(
    uacc_wildType_graph,
    SNP="MET1VAL",
    node_metadata_functions={
        "amino_acid_one_hot": amino_acid_one_hot,
        "meiler": meiler_embedding,
    },
)
test

Data(
  edge_index=[2, 6716],
  node_id=[333],
  coords=[1],
  amino_acid_one_hot={ A=[333, 20] },
  meiler={ A=[333, 7] },
  name=[1],
  dist_mat=[1],
  num_nodes=333,
  graph_name=[1],
  mutation_masked_tensor=[1]
)

In [1]:
from graphein.protein.visualisation import plotly_protein_structure_graph
import graphein.protein as gp
from graphein.protein.graphs import construct_graph
import os 
from graphein.protein.config import ProteinGraphConfig
from graphein.ml.conversion import GraphFormatConvertor

from Gandalf.mutant import generate_nsSNP_pyg
from graphein.protein.features.nodes.amino_acid import amino_acid_one_hot, meiler_embedding

# Build a graph from a PDB file using the default Graphein configuration.
g = construct_graph(
    config=ProteinGraphConfig(),
    pdb_path="/deeplearning/GNN/ipynb_tutorials/graphein/pdb/ranked_0.pdb",
)

# Convert the NetworkX graph to a PyTorch Geometric object
# (no `columns` are passed here, so the converter's defaults apply).
graph_format_convertor = GraphFormatConvertor(src_format="nx", dst_format="pyg")
pyg_g = graph_format_convertor(g)

# Multi-chain call: `SNP` and `graph_label` are dictionaries keyed by chain id.
# Only chains "A" and "B" are given a substitution and a label.
multiple_chain = generate_nsSNP_pyg(
    pyg_g,
    SNP={"A": "MET1VAL", "B": "MET1VAL"},
    graph_label={"A": 1, "B": 0},
    node_metadata_functions={
        "amino_acid_one_hot": amino_acid_one_hot,
        "meiler": meiler_embedding,
    },
)
multiple_chain

  from .autonotebook import tqdm as notebook_tqdm
To use the Graphein submodule graphein.protein.features.sequence.embeddings, you need to install: biovec 
biovec cannot be installed via conda
To use the Graphein submodule graphein.protein.visualisation, you need to install: pytorch3d 
To do so, use the following command: conda install -c pytorch3d pytorch3d


Data(
  edge_index=[2, 1350],
  node_id=[1355],
  coords=[1],
  name=[1],
  dist_mat=[1],
  num_nodes=1355,
  mutation_chain_dict={
    A=[271],
    B=[271],
    C=[271],
    D=[271],
    E=[271]
  },
  graph_name=[1],
  mutation_masked_0_based_pos=[5],
  graph_label={
    A=1,
    B=0,
    C=None,
    D=None,
    E=None
  },
  amino_acid_one_hot=[5],
  meiler=[5]
)

In [2]:
# Display the per-chain labels stored on the graph; in the recorded output the
# chains that were given no label ("C", "D", "E") show up as None.
multiple_chain["graph_label"]

{'A': 1, 'B': 0, 'C': None, 'D': None, 'E': None}

In [11]:
# groupby = nsSNP_df.groupby("uniprot accession", as_index=False)

# x = torch.load(list(af2_structure_dict.values())[0])
# x.node_id