In [None]:
import os
import pickle
import re
import time
import urllib.parse

import glypy
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import requests
from rich.progress import Progress
# Lookup of named monosaccharides provided by glypy, indexed by residue name below.
monosaccharides = glypy.monosaccharides

# NOTE(review): presumably each lookup returns an independent residue object
# (otherwise glcnac1 and glcnac2 would be the same node) — confirm against glypy.
glcnac1 = monosaccharides["GlcNAc"]
glcnac2 = monosaccharides["GlcNAc"]

# Assemble the tree by attaching each child to its parent; glcnac1 ends up as
# the root, with bdman carrying two adMan branches (positions 3 and 6).
glcnac1.add_monosaccharide(glcnac2, position=4, child_position=1)
bdman = monosaccharides["bdMan"]
glcnac2.add_monosaccharide(bdman, position=4, child_position=1)
adman1 = monosaccharides["adMan"]
bdman.add_monosaccharide(adman1, position=3, child_position=1)
adman2 = monosaccharides["adMan"]
bdman.add_monosaccharide(adman2, position=6, child_position=1)

# Wrap the residue tree in a Glycan rooted at glcnac1. The return values of
# reindex() and canonicalize() are discarded here.
n_linked_core = glypy.Glycan(root=glcnac1)
n_linked_core.reindex()
n_linked_core.canonicalize()
print(n_linked_core)
print(n_linked_core.mass())
print(n_linked_core.total_composition())

# Each item from iterlinks() unpacks into two values; only the second is used,
# and it exposes the parent and child residues of the bond.
for link, linkage in n_linked_core.iterlinks():
    print("linkage" , linkage.parent.id, linkage.child.id)
    
# List every residue with its id and IUPAC serialization.
for node in n_linked_core.iternodes():
    print(node.id, node.serialize("IUPAC"))


def get_saccharide_saccharide_link_properties(parent_node, child_node, link):
    """Build the edge attributes for a bond between two monosaccharides.

    Args:
        parent_node: Residue on the parent side of the bond; only its
            ``anomer`` attribute is read.
        child_node: Residue on the child side of the bond; only its
            ``anomer`` attribute is read.
        link: The link object. Accepted but not read by this function.

    Returns:
        dict: ``parent_anomer`` and ``child_anomer`` (both stringified),
        ``linkage_type`` fixed to ``"saccharide"`` and the marker
        ``constant`` fixed to ``"constant"``.
    """
    return dict(
        parent_anomer=str(parent_node.anomer),
        child_anomer=str(child_node.anomer),
        linkage_type="saccharide",
        constant="constant",
    )

def get_saccharide_substituent_link_properties(saccharide, substituent, link):
    """Build the edge attributes for a bond between a residue and a substituent.

    Args:
        saccharide: The monosaccharide end of the bond; only its ``anomer``
            attribute is read.
        substituent: The substituent end of the bond. Accepted but not read.
        link: The link object; supplies ``parent_position`` and
            ``child_position``.

    Returns:
        dict: The two link positions, ``child_anomer`` (the stringified anomer
        of ``saccharide``), ``linkage_type`` fixed to ``"substituent"`` and
        the marker ``constant`` fixed to ``"constant"``.
    """
    return dict(
        parent_position=link.parent_position,
        child_position=link.child_position,
        child_anomer=str(saccharide.anomer),
        linkage_type="substituent",
        constant="constant",
    )
    

def nx_glycan_graph(glypy_glycan):
    """Convert a glypy glycan into an undirected networkx graph.

    Every monosaccharide and every substituent becomes a node keyed by its
    ``id``; every link (including residue-substituent links) becomes an edge.

    Node attributes:
        composition: IUPAC serialization for residues, the ``composition``
            attribute for substituents.
        node_type: ``"saccharide"`` or ``"substituent"``.
        constant: always ``"constant"`` (lets matchers ignore labels).

    Edge attributes come from the two ``get_saccharide_*_link_properties``
    helpers. A link whose endpoints fit neither pattern gets
    ``linkage_type="unknown"`` instead of silently inheriting the attributes
    of the previously processed link.

    Args:
        glypy_glycan: A glypy ``Glycan``.

    Returns:
        networkx.Graph: The node- and edge-annotated graph.
    """
    # NOTE(review): relies on canonicalize() returning the glycan — confirm.
    glypy_glycan = glypy_glycan.canonicalize()

    monosaccharide_type = glypy.structure.monosaccharide.Monosaccharide
    substituent_type = glypy.structure.substituent.Substituent

    G = nx.Graph()
    for node in glypy_glycan.iternodes():
        # substituents() yields pairs; only the substituent object is needed.
        for _position, substituent in node.substituents():
            G.add_node(substituent.id, composition=substituent.composition,
                       node_type="substituent", constant="constant")
        G.add_node(node.id, composition=node.serialize(name='IUPAC'),
                   node_type="saccharide", constant="constant")

    for _position, linkage in glypy_glycan.iterlinks(substituents=True):
        parent = linkage.parent
        child = linkage.child
        if isinstance(parent, monosaccharide_type) and isinstance(child, monosaccharide_type):
            attributes = get_saccharide_saccharide_link_properties(parent, child, linkage)
        elif isinstance(parent, substituent_type):
            attributes = get_saccharide_substituent_link_properties(child, parent, linkage)
        elif isinstance(child, substituent_type):
            attributes = get_saccharide_substituent_link_properties(parent, child, linkage)
        else:
            # Fresh attributes per link: never reuse the previous iteration's dict.
            attributes = {"linkage_type": "unknown", "constant": "constant"}

        G.add_edge(parent.id, child.id, **attributes)

    return G
    

g = nx_glycan_graph(n_linked_core)
# Node labels: the 'composition' attribute stored on every node
labels = nx.get_node_attributes(g, 'composition')

# Compute ONE layout and reuse it for nodes, node labels and edge labels;
# otherwise nx.draw computes its own layout and the edge labels drawn at
# `pos` no longer line up with the nodes. The seed makes re-runs reproducible.
pos = nx.spring_layout(g, seed=42)

# Colour nodes by type; anything unexpected is shown in green
node_type_colors = {'saccharide': 'red', 'substituent': 'blue'}
colors = [node_type_colors.get(g.nodes[node]['node_type'], 'green') for node in g.nodes]

# Draw the graph with the colour map and the composition labels
nx.draw(g, pos=pos, node_color=colors, labels=labels, with_labels=True)

# Draw edge labels on the same layout
edge_labels = nx.get_edge_attributes(g, 'constant')
nx.draw_networkx_edge_labels(g, pos, edge_labels=edge_labels)

# Show the graph
plt.show()

glycosmos_glycans = pd.read_csv("glycosmos_glycans_wurcs.csv")

from glypy.io import wurcs
# Parse the first WURCS string as a smoke test of glypy's WURCS reader
wurcs_struct = wurcs.loads(glycosmos_glycans.loc[0].WURCS)

In [None]:
# Keep a single row per distinct WURCS string so each structure is converted once.
glycosmos_glycans_unique = glycosmos_glycans.drop_duplicates(subset=["WURCS"])

In [None]:
from urllib.parse import quote
import json

# Results of the WURCS -> GlycoCT conversion, one entry per row of
# glycosmos_glycans_unique (None where the converter had no answer).
glytoucan_ids = []
glycoct_strings = []
url = "https://api.glycosmos.org/glycanformatconverter/2.8.2/wurcs2glycoct"
glycoct_cache_path = 'glycoct_strings.pkl'

wurcs_strings = glycosmos_glycans_unique['WURCS'].tolist()

# Reuse a previous run's results when a cache exists.
# NOTE: pickle.load executes arbitrary code; only load a cache this notebook wrote.
if os.path.isfile(glycoct_cache_path):
    with open(glycoct_cache_path, 'rb') as f:
        glycoct_strings = pickle.load(f)

# A cache of the wrong length cannot be aligned with the DataFrame rows, so
# treat it like a missing cache and convert everything again.
if len(glycoct_strings) != len(wurcs_strings):
    glycoct_strings = []
    headers = {'Content-Type': 'application/json'}

    with Progress() as progress:
        # One progress step per WURCS string actually sent to the converter
        convert_wurcs = progress.add_task("[red]Converting WURCS strings to GlycoCT...", total=len(wurcs_strings))

        for wurcs_string in wurcs_strings:
            response = requests.post(url, headers=headers, data=json.dumps({"input": wurcs_string}))

            glycoct = None
            if response.status_code == 200:
                response_data = response.json()
                # The service answers 200 with message 'Returned null.' when it has no result
                if response_data.get('message') != 'Returned null.':
                    glycoct = response_data.get('GlycoCT')
            else:
                print(f"Request failed with status code {response.status_code}")

            # Always append exactly one entry so the list stays row-aligned
            glycoct_strings.append(glycoct)
            progress.update(convert_wurcs, advance=1)

    with open(glycoct_cache_path, 'wb') as f:
        pickle.dump(glycoct_strings, f)

# Attach the results as a new column; assign() builds a new frame instead of
# writing into the result of drop_duplicates.
glycosmos_glycans_unique = glycosmos_glycans_unique.assign(GlycoCT=glycoct_strings)

In [None]:
# Three hand-written GlycoCT structures (RES = residues, LIN = linkages) used as
# inputs for the CSDB conversion further down.
# NOTE(review): this rebinds glycoct_strings, replacing any list built earlier
# in the notebook under the same name.
glycoct_strings = ["""RES
1b:b-dglc-HEX-1:5
2s:n-acetyl
3b:b-dglc-HEX-1:5
4s:n-acetyl
LIN
1:1d(2+1)2n
2:1o(4+1)3d
3:3d(2+1)4n""",
                  """RES
1b:b-dglc-HEX-1:5
2s:n-acetyl
3b:b-dglc-HEX-1:5
4s:n-acetyl
5b:b-dman-HEX-1:5
6b:a-dman-HEX-1:5
7b:b-dglc-HEX-1:5
8s:n-acetyl
9b:a-dgal-HEX-1:5
10s:n-acetyl
11s:sulfate
12b:b-lgal-HEX-1:5|6:d
LIN
1:1d(2+1)2n
2:1o(4+1)3d
3:3d(2+1)4n
4:3o(4+1)5d
5:5o(3+1)6d
6:6o(4+1)7d
7:7d(2+1)8n
8:7o(4+1)9d
9:9d(2+1)10n
10:9o(4+1)11n
11:1o(6+1)12d
""",
              """RES
1b:b-dglc-HEX-1:5
2s:n-acetyl
3b:b-dglc-HEX-1:5
4s:n-acetyl
5b:b-dman-HEX-1:5
6b:a-dman-HEX-1:5
7b:a-dman-HEX-1:5
8b:a-dman-HEX-1:5
LIN
1:1d(2+1)2n
2:1o(4+1)3d
3:3d(2+1)4n
4:3o(4+1)5d
5:5o(3+1)6d
6:6o(2+1)7d
7:5o(6+1)8d"""]

In [None]:
from urllib.parse import quote
import json

url = "http://csdb.glycoscience.ru/database/core/convert_api.php"
# CSDB linear strings, one entry per element of glycoct_strings (None where the
# conversion failed) so both lists can be indexed in parallel.
csdb_strings = []

for glycoct_string in glycoct_strings:
    csdb_linear = None

    # The converter takes the GlycoCT text as a URL-encoded query parameter
    response = requests.get(f"{url}?glycoct={quote(glycoct_string)}")
    if response.status_code == 200:
        response_data = response.text.replace("<pre>", "")  # Remove the <pre> tag

        for line in response_data.split("\n"):
            if line.startswith("CSDB Linear:"):
                csdb_linear = line.replace("CSDB Linear:", "").strip()  # Extract the CSDB Linear string
                break
        else:
            print("No 'CSDB Linear:' line found in the converter response")
    else:
        print(f"Request failed with status code {response.status_code}")

    # Append exactly once per input, even on failure, to keep the lists aligned
    csdb_strings.append(csdb_linear)

print(csdb_strings)

In [None]:
# Ask the CSDB converter for the SMILES form of the first three CSDB strings.
response, response2, response3 = [
    requests.get(f"http://csdb.glycoscience.ru/database/core/convert_api.php?csdb={quote(csdb_strings[index])}&format=smiles")
    for index in (0, 1, 2)
]

In [None]:
# Capture everything between "smiles=" and the next single quote in each reply.
pattern = r"smiles=(.*?)\'"

match, match2, match3 = [
    re.findall(pattern, reply.text)
    for reply in (response, response2, response3)
]

In [None]:
# URL-decode the first captured SMILES of each reply and parse it into a molecule.
# NOTE(review): `Chem` is not imported anywhere in the visible notebook —
# presumably `from rdkit import Chem`; confirm and add it to the import cell.
sugar1, sugar2, sugar3 = [
    Chem.MolFromSmiles(urllib.parse.unquote(found[0]))
    for found in (match, match2, match3)
]

In [None]:
# Scan the text of `response` for a "CSDB Linear:" line and, if one is found,
# append its value to csdb_strings.
# NOTE(review): in a top-to-bottom run `response` is the format=smiles reply
# requested for csdb_strings[0], not a GlycoCT conversion reply — confirm this
# cell is still wanted. Re-running it appends again and grows csdb_strings.
response_data = response.text.replace("<pre>", "")  # Remove the <pre> tag
lines = response_data.split("\n")  # Split into lines

for line in lines:
    if line.startswith("CSDB Linear:"):
        csdb_linear = line.replace("CSDB Linear:", "").strip()  # Extract the CSDB Linear string
        csdb_strings.append(csdb_linear)  # Append the string to the list
        break

In [None]:
# Display the first CSDB linear string as the cell output.
csdb_strings[0]

In [None]:
# --- Structure 1: the five-residue core (GlcNAc-GlcNAc-bdMan with two adMan) ---
monosaccharides = glypy.monosaccharides

glcnac1 = monosaccharides["GlcNAc"]
glcnac2 = monosaccharides["GlcNAc"]
# NOTE(review): glcnac3 is created but never attached in this first structure.
glcnac3 = monosaccharides["GlcNAc"]

glcnac1.add_monosaccharide(glcnac2, position=4, child_position=1)
bdman = monosaccharides["bdMan"]
glcnac2.add_monosaccharide(bdman, position=4, child_position=1)
adman1 = monosaccharides["adMan"]
bdman.add_monosaccharide(adman1, position=3, child_position=1)
adman2 = monosaccharides["adMan"]
bdman.add_monosaccharide(adman2, position=6, child_position=1)

n_linked_core = glypy.Glycan(root=glcnac1)
n_linked_core.reindex()
n_linked_core.canonicalize()

# Graph of the plain core
g1 = nx_glycan_graph(n_linked_core)

# --- Structure 2: same core plus an extra GlcNAc on position 6 of the root ---
# All residue names are rebound to fresh lookups before building it.
monosaccharides = glypy.monosaccharides

glcnac1 = monosaccharides["GlcNAc"]
glcnac2 = monosaccharides["GlcNAc"]
glcnac3 = monosaccharides["GlcNAc"]
bdman = monosaccharides["bdMan"]
adman1 = monosaccharides["adMan"]
adman2 = monosaccharides["adMan"]

glcnac1.add_monosaccharide(glcnac2, position=4, child_position=1)
glcnac1.add_monosaccharide(glcnac3, position=6, child_position=1)
glcnac2.add_monosaccharide(bdman, position=4, child_position=1)
bdman.add_monosaccharide(adman1, position=3, child_position=1)
bdman.add_monosaccharide(adman2, position=6, child_position=1)

# NOTE(review): n_linked_core is rebound here to the six-residue variant, so
# after this cell the name no longer refers to the plain core.
n_linked_core = glypy.Glycan(root=glcnac1)
n_linked_core.reindex()
n_linked_core.canonicalize()

# NOTE(review): glycoct is imported but not used in the visible notebook.
from glypy.io import glycoct


# Graph of the extended structure
g2 = nx_glycan_graph(n_linked_core)

# Create a mannose monosaccharide
# NOTE(review): `mannose` is not used in the visible notebook.
mannose = glypy.monosaccharides.Man

# Add mannose to the glycan at position 3 of the root
#target_monosaccharide = list(glypy_g1.root.children.values())[0][1]


In [None]:


# Give each graph its own axes; two bare nx.draw calls in one cell paint both
# graphs onto the same current axes, where they overlap.
fig, (ax_g1, ax_g2) = plt.subplots(1, 2, figsize=(12, 5))
nx.draw(g1, ax=ax_g1, with_labels=True)
ax_g1.set_title("g1")
nx.draw(g2, ax=ax_g2, with_labels=True)
ax_g2.set_title("g2")
plt.show()

In [None]:
import networkx.algorithms.isomorphism

def get_parity_score_sugar(graph1, graph2):
    """Return the largest-common-subgraph mapping with the most identical nodes.

    Every mapping produced by ISMAGS for the two graphs is scored by the
    number of node pairs whose attribute dictionaries are exactly equal, and
    the best-scoring mapping is returned. When several mappings tie, the one
    produced last wins.

    Args:
        graph1: First networkx graph; keys of the returned mapping.
        graph2: Second networkx graph; values of the returned mapping.

    Returns:
        dict: Mapping of ``graph1`` nodes to ``graph2`` nodes for the best
        common subgraph, or an empty dict when ISMAGS yields no mapping.
    """
    matcher = nx.isomorphism.ISMAGS(graph1, graph2)
    mcs = {}
    current_best_matches = 0
    for match in matcher.largest_common_subgraph():
        exact_matches = sum(
            1
            for graph1_node, graph2_node in match.items()
            if graph1.nodes[graph1_node] == graph2.nodes[graph2_node]
        )
        # ">=" keeps the first mapping even when it has zero exact matches
        if exact_matches >= current_best_matches:
            current_best_matches = exact_matches
            mcs = match
    # Return the best mapping, not whichever mapping the loop visited last
    return mcs

# Compare the extended structure (g2) against the plain core (g1); the
# returned mapping is shown as the cell output.
get_parity_score_sugar(g2,g1)

In [None]:
def node_match(node1, node2):
    """Tell whether two node attribute dicts carry the same ``'constant'`` value."""
    key = 'constant'
    return node1[key] == node2[key]

def edge_match(edge1, edge2):
    """Tell whether two edge attribute dicts carry the same ``'constant'`` value.

    Mirrors ``node_match``: both sides are compared on the ``'constant'``
    key. The edges built in this notebook have no ``'linkage'`` key, so
    reading it raised ``KeyError`` on the first comparison.
    """
    return edge1['constant'] == edge2['constant']

# Match g2 against g1 using node_match / edge_match as the comparators.
# NOTE(review): with GraphMatcher(g2, g1), subgraph_isomorphisms_iter looks for
# subgraphs of g2 that are isomorphic to g1 — confirm this is the intended direction.
GM = networkx.algorithms.isomorphism.GraphMatcher(g2, g1, node_match=node_match, edge_match=edge_match)

# Print every mapping the matcher finds.
for subgraph in GM.subgraph_isomorphisms_iter():
    print(subgraph)

In [None]:
import requests
import os
from typing import List
from rich.progress import Progress, track
from rich.console import Console

import pandas as pd
from Bio.PDB import MMCIFParser, PDBList
import os
import time

import gzip
from multiprocessing import Pool, Manager
from functools import partial
from wurcs_access import get_wurcs_single
from tqdm.notebook import tqdm

import pickle

import requests
import os
from typing import List
from rich.progress import Progress, track
from rich.console import Console

import gzip
from multiprocessing import Pool, Manager
from functools import partial

def download_pdb_files(pdb_ids: List[str], outdir='.', file_format='cif'):
    """
    Downloads gzipped PDB files in the specified format for a list of PDB ids.

    This function is adapted from the 'batch_download.sh' script provided by RCSB PDB.

    An id counts as already downloaded when either ``<id>.<format>`` or
    ``<id>.<format>.gz`` exists in ``outdir``; such ids are reported as
    successful without contacting the server. Each remaining id is requested
    once and saved as ``<id>.<format>.gz``.

    Args:
        pdb_ids: A list of PDB ids.
        outdir: The directory to save the downloaded files. Default is the current directory.
        file_format: The file format to download. Default is 'cif'.

    Returns:
        A tuple ``(successful_pdbs, failed_pdbs)``: ids whose file is present
        after the call, and ids whose download failed (non-200 reply or a
        network error).
    """
    base_url = "https://files.rcsb.org/download"
    console = Console()
    os.makedirs(outdir, exist_ok=True)

    download_pdbs = []
    successful_pdbs = []
    for pdb_id in pdb_ids:
        plain_path = os.path.join(outdir, f"{pdb_id}.{file_format}")
        # The download below saves the .gz name, so check for it as well
        if os.path.isfile(plain_path) or os.path.isfile(plain_path + ".gz"):
            successful_pdbs.append(pdb_id)
        else:
            download_pdbs.append(pdb_id)
    console.log(f"Downloading {len(download_pdbs)} PDB files in {file_format} format.")

    failed_pdbs = []
    if download_pdbs:
        for pdb_id in track(download_pdbs, description='[green]Downloading PDBs...'):
            file_name = f"{pdb_id}.{file_format}.gz"
            file_path = os.path.join(outdir, file_name)

            # A duplicate id earlier in this run may already have fetched it
            if os.path.isfile(file_path):
                successful_pdbs.append(pdb_id)
                continue

            url = f"{base_url}/{file_name}"
            try:
                # One request per file; the timeout stops a stalled
                # connection from hanging the whole batch.
                response = requests.get(url, timeout=60)
            except requests.RequestException as error:
                console.log(f"Failed to download {pdb_id}: {error}")
                failed_pdbs.append(pdb_id)
                continue

            if response.status_code == 200:
                with open(file_path, 'wb') as f:
                    f.write(response.content)
                successful_pdbs.append(pdb_id)
            else:
                console.log(f"Failed to download {pdb_id}")
                failed_pdbs.append(pdb_id)
    return successful_pdbs, failed_pdbs

def get_wurcs(pdb_entities, directory='.'):
    """Run ``get_wurcs_single`` over all entities in an 8-process pool.

    A multiprocessing ``Manager`` provides a shared dict and a shared list
    that are handed to every worker call as ``wurcs_dict`` and
    ``failed_entities``.
    NOTE(review): presumably ``get_wurcs_single`` stores results and failures
    in those two containers — confirm in ``wurcs_access``.

    Args:
        pdb_entities: Entities to process; one worker call per element.
        directory: Passed through to ``get_wurcs_single`` as ``directory``.

    Returns:
        tuple: ``(dict, list)`` — plain copies of the shared dict and list.
    """
    with Manager() as manager:
        # Proxies that all worker processes can write to
        shared_wurcs = manager.dict()
        shared_failures = manager.list()

        # Bind the fixed keyword arguments so the pool only supplies the entity
        worker = partial(
            get_wurcs_single,
            directory=directory,
            wurcs_dict=shared_wurcs,
            failed_entities=shared_failures,
        )

        with Pool(processes=8) as pool, tqdm(total=len(pdb_entities)) as pbar:
            # Results are ignored; each completed task advances the progress bar
            for _ in pool.imap_unordered(worker, pdb_entities):
                pbar.update()

        # Copy out of the proxies before the manager shuts down
        return dict(shared_wurcs), list(shared_failures)


# read the CSV file
sugar_uniqids = pd.read_csv('sugar_uniqids.csv')
uniq_ids = sugar_uniqids['UniqID'].tolist()

# A UniqID is "<pdb id>_<entity id>"; keep everything before the first
# underscore. Sorting makes the download order reproducible between runs.
pdb_ids = sorted({pdb_entity.split('_', 1)[0] for pdb_entity in uniq_ids})

successful_pdbs, failed_pdbs = download_pdb_files(pdb_ids, outdir='./mmcif_files', file_format='cif')

# Set membership keeps this filter linear in the number of entities
successful_pdb_set = set(successful_pdbs)
successful_pdb_entities = [
    pdb_entity
    for pdb_entity in uniq_ids
    if pdb_entity.split('_', 1)[0] in successful_pdb_set
]

# Reuse cached results when both cache files load; otherwise recompute.
# Only a missing or unreadable cache triggers the recompute, so real errors
# and KeyboardInterrupt are no longer swallowed.
# NOTE: pickle.load executes arbitrary code; only load caches this notebook wrote.
try:
    with open('sugar_uniqid_wurcs.pkl', 'rb') as f:
        wurcs_dict = pickle.load(f)
    with open('sugar_uniqid_missing_wurcs.pkl', 'rb') as f:
        failed_entities = pickle.load(f)
except (OSError, EOFError, pickle.UnpicklingError):
    wurcs_dict, failed_entities = get_wurcs(successful_pdb_entities, directory='./mmcif_files')
    with open('sugar_uniqid_wurcs.pkl', 'wb') as f:
        pickle.dump(wurcs_dict, f)
    with open('sugar_uniqid_missing_wurcs.pkl', 'wb') as f:
        pickle.dump(failed_entities, f)

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Count the distinct saccharides that have a WURCS sequence in GlyTouCan.
count_query = """
    PREFIX glycan: <http://purl.jp/bio/12/glyco/glycan#>
    PREFIX glytoucan:  <http://www.glytoucan.org/glyco/owl/glytoucan#>

    SELECT (COUNT(DISTINCT ?Saccharide) as ?count)
    FROM <http://rdf.glytoucan.org/core>
    FROM <http://rdf.glytoucan.org/sequence/wurcs>
    WHERE {
        ?Saccharide glytoucan:has_primary_id ?PrimaryId .
        ?Saccharide glycan:has_glycosequence ?GlycoSequence .
        ?GlycoSequence glycan:has_sequence ?Sequence .
        ?GlycoSequence glycan:in_carbohydrate_format glycan:carbohydrate_format_wurcs.
    }
"""

sparql = SPARQLWrapper("https://ts.glytoucan.org/sparql")
sparql.setQuery(count_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# The aggregate comes back as a single binding row holding ?count
first_binding = results['results']['bindings'][0]
count = int(first_binding['count']['value'])

print(count)

from SPARQLWrapper import SPARQLWrapper, JSON
# Specify the SPARQL endpoint
sparql = SPARQLWrapper("https://ts.glytoucan.org/sparql")
sparql.setReturnFormat(JSON)
# initial offset
offset = 0
# limit for each chunk
limit = 10000
glycans = []
glytoucan_cache_path = 'glytoucan_records.pkl'

if os.path.isfile(glytoucan_cache_path):
    # Reuse the records downloaded by a previous run.
    # NOTE: pickle.load executes arbitrary code; only load a cache this notebook wrote.
    with open(glytoucan_cache_path, 'rb') as f:
        glycans = pickle.load(f)
else:
    with Progress() as progress:
        # `count` is the total number of saccharides from the COUNT query above
        download_glycans = progress.add_task("[green]Downloading GlyTouCan records...", total=count)

        while True:
            # LIMIT/OFFSET must be inside the query text for paging to happen;
            # ORDER BY keeps the pages stable between requests.
            sparql.setQuery(f"""
                PREFIX glycan: <http://purl.jp/bio/12/glyco/glycan#>
                PREFIX glytoucan:  <http://www.glytoucan.org/glyco/owl/glytoucan#>

                SELECT DISTINCT ?Saccharide ?PrimaryId ?Sequence
                FROM <http://rdf.glytoucan.org/core>
                FROM <http://rdf.glytoucan.org/sequence/wurcs>
                WHERE {{
                    ?Saccharide glytoucan:has_primary_id ?PrimaryId .
                    ?Saccharide glycan:has_glycosequence ?GlycoSequence .
                    ?GlycoSequence glycan:has_sequence ?Sequence .
                    ?GlycoSequence glycan:in_carbohydrate_format glycan:carbohydrate_format_wurcs.
                }}
                ORDER BY ?PrimaryId
                LIMIT {limit} OFFSET {offset}
            """)

            # Run the query once per page and convert to a Python dictionary
            results = sparql.query().convert()
            bindings = results["results"]["bindings"]

            glycans.extend(
                {'Saccharide': record['Saccharide']['value'],
                 'PrimaryId': record['PrimaryId']['value'],
                 'Sequence': record['Sequence']['value']}
                for record in bindings
            )
            progress.update(download_glycans, advance=len(bindings))

            # A short page means we've reached the end
            if len(bindings) < limit:
                break

            # Increment the offset for the next chunk
            offset += limit

    # Persist the records so the next run can skip the download
    with open(glytoucan_cache_path, 'wb') as f:
        pickle.dump(glycans, f)

from py2neo import Graph, Node, Relationship

# Connection settings come from the environment when set; the fallbacks are the
# previous local development values. Prefer setting NEO4J_PASSWORD over keeping
# a password in the notebook.
graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(os.environ.get("NEO4J_USER", "neo4j"), os.environ.get("NEO4J_PASSWORD", "test123")),
)


def _kcf_node_index(token):
    """Return the node number from a KCF edge token such as '2:b1', '1:4' or '3'."""
    return int(token.split(':', 1)[0])


# Open the KCF file and mirror its NODE / EDGE sections into the graph database
with open("G10518.kcf") as f:
    nodes = {}
    edges = []
    mode = None  # Section currently being read: "entry", "node", "edge" or None
    for line in f:
        # Section headers switch the mode and carry no node or edge data
        if line.startswith("ENTRY"):
            mode = "entry"
            continue
        elif line.startswith("NODE"):
            mode = "node"
            continue
        elif line.startswith("EDGE"):
            mode = "edge"
            continue
        elif line.startswith("///"):
            mode = None
            continue

        parts = line.split()
        if mode == "node" and len(parts) >= 2:
            # "<node number> <label> ..." — the label becomes the neo4j node label
            node_id = int(parts[0])
            node = Node(parts[1])
            nodes[node_id] = node
            graph.create(node)
        elif mode == "edge" and len(parts) >= 3:
            # "<edge number> <node>[:site] <node>[:site]" — only the node
            # numbers are used; plain integers parse the same way as before
            edge_start = _kcf_node_index(parts[1])
            edge_end = _kcf_node_index(parts[2])
            edge = Relationship(nodes[edge_start], "CONNECTS", nodes[edge_end])
            edges.append(edge)
            graph.create(edge)