In [48]:
import pandas as pd
import numpy as np
import re

In [71]:
# list of all iupacs from sweettalks databse
df_all = pd.read_csv('../pipeline/data/combined_database_long.csv').target.values.tolist()

In [50]:
# see if all covered from our set of IUPAC's
our_glycan_iupacs = pd.read_csv('../pipeline/data/Glycan-Structures-CFG611.txt', sep='\t').IUPAC.values.tolist()

In [51]:
all_set = set(df_all)
our_set = set(our_glycan_iupacs)

print(f'len of allset: {len(all_set)}, len of our set: {len(our_set)}, amount in our set that isnt in all set: {len(our_set - all_set)}')

len of allset: 13541, len of our set: 611, amount in our set that isnt in all set: 611


In [52]:
def convert_iupac(glycans):
    converted = []
    
    for glycan in glycans:
       
        glycan = glycan.replace('α', 'a').replace('β', 'b')
        
        # remove number before any os and op
        glycan = re.sub(r'(\D+)\d+OS', r'\1OS', glycan)
        glycan = re.sub(r'(\D+)\d+OP', r'\1OP', glycan)
        
        # remove final anomeric state and spacer     str.rsplit('(', 1)[0]
        glycan = glycan.rsplit('(', 1)[0]
        
        converted.append(glycan)
    
    return converted

In [76]:
converted_glycans = convert_iupac(our_glycan_iupacs)

In [54]:
all_set = set(df_all)
our_set = set(converted_glycans)

print(len(converted_glycans))
print(f'len of allset: {len(all_set)}, len of our set: {len(our_set)}, amount in our set that isnt in all set: {len(our_set - all_set)}')

611
len of allset: 13541, len of our set: 523, amount in our set that isnt in all set: 367


# Combine sweettalk dataset with our glycans

In [82]:

unique_df_all = list(set(df_all))
unique_our_glycans = list(set(converted_glycans))
print(f'len df all: {len(df_all)}, len unique df all {len(unique_df_all)}, len unique our glycans: {len(unique_our_glycans)}')
combined_with_ours = unique_df_all.copy()
combined_with_ours += unique_our_glycans
print('len combined with oiurs', len(combined_with_ours))

len df all: 21296, len unique df all 13541, len unique our glycans: 523
len combined with oiurs 14064


In [84]:
import csv

with open('combined_with_ours.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["target"])  # Write the header
    for iupac in combined_with_ours:
        writer.writerow([iupac])  # Write each IUPAC name as a new row

In [55]:

def motif_find(s):
  """converts a IUPACcondensed-ish glycan into a list of overlapping, asterisk-separated glycowords"""
  b = s.split('(')
  b = [k.split(')') for k in b]
  b = [item for sublist in b for item in sublist]
  b = [k.strip('[') for k in b]
  b = [k.strip(']') for k in b]
  b = [k.replace('[', '') for k in b]
  b = [k.replace(']', '') for k in b]
  b = ['*'.join(b[i:i+5]) for i in range(0, len(b)-4, 2)]
  return b

def process_glycans(glycan_list):
  """converts list of glycans into a list of lists of glycowords"""
  glycan_motifs = [motif_find(k) for k in glycan_list]
  glycan_motifs = [[i.split('*') for i in k] for k in glycan_motifs]
  return glycan_motifs

In [56]:
df_all = process_glycans(df_all)

In [57]:
df_all[:5]

[[['Glc', 'a1-3', 'Glc', 'a1-3', 'Man'],
  ['Glc', 'a1-3', 'Man', 'a1-2', 'Man'],
  ['Man', 'a1-2', 'Man', 'a1-2', 'Man'],
  ['Man', 'a1-2', 'Man', 'a1-3', 'Man'],
  ['Man', 'a1-3', 'Man', 'a1-2', 'Man'],
  ['Man', 'a1-2', 'Man', 'a1-3', 'Man'],
  ['Man', 'a1-3', 'Man', 'a1-2', 'Man'],
  ['Man', 'a1-2', 'Man', 'a1-6', 'Man'],
  ['Man', 'a1-6', 'Man', 'a1-6', 'Man'],
  ['Man', 'a1-6', 'Man', 'b1-4', 'GlcNAc'],
  ['Man', 'b1-4', 'GlcNAc', 'b1-4', 'GlcNAc']],
 [['Gal', 'a1-3', 'Galf', 'a1-3', 'Galf'],
  ['Galf', 'a1-3', 'Galf', 'a1-3', 'Man'],
  ['Galf', 'a1-3', 'Man', 'a1-3', 'Man'],
  ['Man', 'a1-3', 'Man', 'a1-4', 'GlcN']],
 [['Gal', 'a1-2', 'Man', 'b1-4', 'Rha']],
 [['Gal', 'b1-4', 'Fuc', 'a1-3', 'GlcNAc'],
  ['Fuc', 'a1-3', 'GlcNAc', 'b1-2', 'Man'],
  ['GlcNAc', 'b1-2', 'Man', 'a1-3', 'Gal'],
  ['Man', 'a1-3', 'Gal', 'b1-4', 'Fuc'],
  ['Gal', 'b1-4', 'Fuc', 'a1-3', 'GlcNAc'],
  ['Fuc', 'a1-3', 'GlcNAc', 'b1-2', 'Gal'],
  ['GlcNAc', 'b1-2', 'Gal', 'b1-4', 'Fuc'],
  ['Gal', 'b1-4', 'Fu

In [58]:
df_all = [item for sublist in df_all for item in sublist]
df_all[:5]

[['Glc', 'a1-3', 'Glc', 'a1-3', 'Man'],
 ['Glc', 'a1-3', 'Man', 'a1-2', 'Man'],
 ['Man', 'a1-2', 'Man', 'a1-2', 'Man'],
 ['Man', 'a1-2', 'Man', 'a1-3', 'Man'],
 ['Man', 'a1-3', 'Man', 'a1-2', 'Man']]

In [59]:
#make library of all observed glycowords
lib_all = list(sorted(list(set([item for sublist in df_all for item in sublist]))))
lib_size_all = len(lib_all)

In [60]:
lib_all[:8]

['',
 '1',
 '1,4-Anhydro-Gal',
 '1,4-Anhydro-Kdo',
 '1-3',
 '1-4',
 '1-6',
 '2,3-Anhydro-All']