In [None]:
#hide
#default_exp motif
from nbdev.showdoc import show_doc
from IPython.display import HTML
%load_ext autoreload
%autoreload 2

# motif

>Processing and analyzing glycan sequences on a motif-basis

In [None]:
#export
from glycowork.motif.analysis import *
from glycowork.motif.annotate import *
from glycowork.motif.graph import *
from glycowork.motif.processing import *
from glycowork.motif.query import *
from glycowork.motif.tokenization import *
from glycowork.glycan_data.loader import df_species

`motif` contains many functions to process glycans in various ways and to use this processing to analyze glycans via curated motifs, graph features, and sequence features. It contains the following modules:

- `analysis` contains functions for downstream analyses of important glycan motifs etc.
- `annotate` contains functions to extract curated motifs, graph features, and sequence features from glycan sequences
- `graph` is used to convert glycan sequences to graphs and contains helper functions to search for motifs, check whether two sequences describe the same glycan, etc.
- `processing` contains all the basic functions to process IUPAC-condensed glycan sequences into glycoletters etc. It also contains `get_lib` to construct the glycoletter library
- `query` is used to interact with the databases contained in glycowork, delivering insights for sequences of interest
- `tokenization` has helper functions to map glycans (strings) to numbers that can be used by models

# analysis
 >downstream analyses of important glycan motifs

In [None]:
show_doc(get_pvals_motifs)

In [None]:
# Five example glycans paired with hard-coded binding values. The third glycan
# uses the GlcN4P spelling so that it matches the same sequence as written in
# the other example cells of this notebook.
glycans = ['Man(a1-3)[Man(a1-6)][Xyl(b1-2)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)]GlcNAc',
           'Man(a1-2)Man(a1-2)Man(a1-3)[Man(a1-3)Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc',
           'GalNAc(a1-4)GlcNAcA(a1-4)[GlcN(b1-7)]Kdo(a2-5)[Kdo(a2-4)]Kdo(a2-6)GlcN4P(b1-6)GlcN4P',
           'Man(a1-2)Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc',
           'Glc(b1-3)Glc(b1-3)Glc']
label = [3.234, 2.423, 0.733, 3.102, 0.108]
test_df = pd.DataFrame({'glycan':glycans, 'binding':label})

print("Glyco-Motif enrichment p-value test")
# Keep at most the first ten rows of the result; `out` is rendered by the next cell.
out = get_pvals_motifs(test_df, 'glycan', 'binding').iloc[:10,:]

In [None]:
#hide_input
# Styler.render() was deprecated in pandas 1.4 and removed in 2.0; prefer
# Styler.to_html() and fall back to render() only on older pandas versions.
styled = out.style.set_properties(**{'font-size': '11pt', 'font-family': 'Helvetica','border-collapse': 'collapse','border': '1px solid black'})
HTML(styled.to_html() if hasattr(styled, 'to_html') else styled.render())

In [None]:
show_doc(make_heatmap)

In [None]:
# Build a small table with one column per glycan and three rows of hard-coded
# values, then hand it to make_heatmap.
glycans = [
    'Man(a1-3)[Man(a1-6)][Xyl(b1-2)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)]GlcNAc',
    'Man(a1-2)Man(a1-2)Man(a1-3)[Man(a1-3)Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc',
    'GalNAc(a1-4)GlcNAcA(a1-4)[GlcN(b1-7)]Kdo(a2-5)[Kdo(a2-4)]Kdo(a2-6)GlcN4P(b1-6)GlcN4P',
    'Man(a1-2)Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc',
    'Glc(b1-3)Glc(b1-3)Glc',
]
label = [3.234, 2.423, 0.733, 3.102, 0.108]
label2 = [0.134, 0.345, 1.15, 0.233, 2.981]
label3 = [0.334, 0.245, 1.55, 0.133, 2.581]
value_rows = [label, label2, label3]
# `test_df` is reused by the following cell, so its name is kept.
test_df = pd.DataFrame(value_rows, columns = glycans)

make_heatmap(test_df)

In [None]:
# Same table as in the previous cell, this time with mode = 'motif'.
make_heatmap(
    test_df,
    mode = 'motif',
    feature_set = ['known', 'exhaustive'],
    estimate_speedup = True,
)

In [None]:
show_doc(plot_embeddings)

In [None]:
# Restrict df_species to the rows whose Order is 'Fabales', then plot their
# targets with Family as the label.
is_fabales = df_species.Order == 'Fabales'
df_fabales = df_species[is_fabales].reset_index(drop = True)
fabales_glycans = df_fabales.target.values.tolist()
fabales_families = df_fabales.Family.values.tolist()
plot_embeddings(fabales_glycans, label_list = fabales_families)

In [None]:
show_doc(characterize_monosaccharide)

In [None]:
# Example call for 'D-Rha' with rank 'Kingdom' and focus 'Bacteria'.
characterize_monosaccharide(
    'D-Rha',
    rank = 'Kingdom',
    focus = 'Bacteria',
    modifications = True,
)

# annotate
>extract curated motifs, graph features, and sequence features from glycan sequences

In [None]:
show_doc(annotate_glycan)

In [None]:
show_doc(annotate_dataset)

In [None]:
# Three example sequences for annotate_dataset; the result is stored in `out`
# so that the following cell can render it.
glycans = [
    'Man(a1-3)[Man(a1-6)][Xyl(b1-2)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)]GlcNAc',
    'Man(a1-2)Man(a1-2)Man(a1-3)[Man(a1-3)Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc',
    'GalNAc(a1-4)GlcNAcA(a1-4)[GlcN(b1-7)]Kdo(a2-5)[Kdo(a2-4)]Kdo(a2-6)GlcN4P(b1-6)GlcN4P',
]
print("Annotate Test")
out = annotate_dataset(glycans)

In [None]:
#hide_input
# Styler.render() was deprecated in pandas 1.4 and removed in 2.0; prefer
# Styler.to_html() and fall back to render() only on older pandas versions.
styled = out.style.set_properties(**{'font-size': '11pt', 'font-family': 'Helvetica','border-collapse': 'collapse','border': '1px solid black'})
HTML(styled.to_html() if hasattr(styled, 'to_html') else styled.render())

In [None]:
show_doc(get_trisaccharides)

In [None]:
# Example call on a single branched sequence.
example_glycan = 'GalNAc(a1-4)GlcNAcA(a1-4)[GlcN(b1-7)]Kdo(a2-5)[Kdo(a2-4)]Kdo(a2-6)GlcN4P(b1-6)GlcN4P'
get_trisaccharides(example_glycan)

# graph
>convert glycan sequences to graphs; contains helper functions to search for motifs, check whether two sequences describe the same glycan, etc.

In [None]:
show_doc(glycan_to_graph)

In [None]:
# Print the input sequence followed by what glycan_to_graph returns for it.
example_glycan = 'Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc'
print('Glycan to Graph')
print(example_glycan)
print(glycan_to_graph(example_glycan))

In [None]:
show_doc(compare_glycans)

In [None]:
# The two strings differ only in which Man branch is written inside the brackets.
glycan_a = 'Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc'
glycan_b = 'Man(a1-6)[Man(a1-3)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc'
print("Graph Isomorphism Test")
print(compare_glycans(glycan_a, glycan_b))

In [None]:
show_doc(fast_compare_glycans)

In [None]:
show_doc(subgraph_isomorphism)

In [None]:
# Query a full sequence for a two-residue motif.
full_glycan = 'Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc'
query_motif = 'Fuc(a1-6)GlcNAc'
print("Subgraph Isomorphism Test")
print(subgraph_isomorphism(full_glycan, query_motif))

In [None]:
show_doc(glycan_to_nxGraph)

In [None]:
show_doc(graph_to_string)

In [None]:
# Convert a sequence with glycan_to_nxGraph and pass the result to graph_to_string.
glycan_graph = glycan_to_nxGraph('Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc')
graph_to_string(glycan_graph)

In [None]:
# Only the edges of the returned graph are printed.
print('Glycan to networkx Graph (only edges printed)')
glycan_graph = glycan_to_nxGraph('Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc')
print(glycan_graph.edges())

In [None]:
show_doc(generate_graph_features)

In [None]:
show_doc(try_string_conversion)

# processing
>process IUPAC-condensed glycan sequences into glycoletters etc.

In [None]:
show_doc(check_nomenclature)

In [None]:
# Example call on a two-residue sequence.
disaccharide = 'Glc(b1-4)Glc'
check_nomenclature(disaccharide)

In [None]:
show_doc(small_motif_find)

In [None]:
# Example call on a single branched sequence.
example_glycan = 'Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc'
small_motif_find(example_glycan)

In [None]:
show_doc(min_process_glycans)

In [None]:
# Example call on a list of two sequences.
example_glycans = [
    'Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc',
    'Man(a1-2)Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc',
]
min_process_glycans(example_glycans)

In [None]:
show_doc(motif_find)

In [None]:
show_doc(process_glycans)

In [None]:
# Example call on a list of two sequences.
example_glycans = [
    'Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc',
    'Man(a1-2)Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc',
]
process_glycans(example_glycans)

In [None]:
show_doc(get_lib)

In [None]:
# Example call on a list of two sequences.
example_glycans = [
    'Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc',
    'Man(a1-2)Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc',
]
get_lib(example_glycans)

In [None]:
show_doc(expand_lib)

In [None]:
# Build a library from two sequences, then expand it with a third sequence
# that additionally contains a Fuc(a1-6) branch.
base_glycans = [
    'Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc',
    'Man(a1-2)Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc',
]
additional_glycans = ['Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc']
lib1 = get_lib(base_glycans)
lib2 = expand_lib(lib1, additional_glycans)
lib2

In [None]:
show_doc(seed_wildcard)

In [None]:
show_doc(presence_to_matrix)

In [None]:
# Restrict df_species to the rows whose Order is 'Fabales' and use Family as
# the label column; `out` is rendered by the following cell.
fabales_rows = df_species[df_species.Order == 'Fabales'].reset_index(drop = True)
out = presence_to_matrix(fabales_rows, label_col_name = 'Family')

In [None]:
#hide_input
# Styler.render() was deprecated in pandas 1.4 and removed in 2.0; prefer
# Styler.to_html() and fall back to render() only on older pandas versions.
styled = out.style.set_properties(**{'font-size': '11pt', 'font-family': 'Helvetica','border-collapse': 'collapse','border': '1px solid black'})
HTML(styled.to_html() if hasattr(styled, 'to_html') else styled.render())

# query
>for interacting with the databases contained in glycowork, delivering insights for sequences of interest

In [None]:
show_doc(get_insight)

In [None]:
# The printed banner names the same sequence that is passed to get_insight.
example_glycan = 'Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc'
print("Test get_insight with 'Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc'")
get_insight(example_glycan)

In [None]:
show_doc(glytoucan_to_glycan)

In [None]:
# Example call with a list of two identifiers.
glytoucan_ids = ['G59003TN', 'G88185OE']
glytoucan_to_glycan(glytoucan_ids)

# tokenization
>helper functions to map glycans (strings) to numbers that can be used by models

In [None]:
show_doc(string_to_labels)

In [None]:
# Example call on a list of five glycoletter strings.
glycoletters = ['Man','a1-3','Man','a1-6','Man']
string_to_labels(glycoletters)

In [None]:
show_doc(pad_sequence)

In [None]:
# Feed the output of string_to_labels into pad_sequence with a second argument of 7.
glycoletters = ['Man','a1-3','Man','a1-6','Man']
encoded = string_to_labels(glycoletters)
pad_sequence(encoded, 7)

In [None]:
show_doc(convert_to_counts_glycoletter)

In [None]:
show_doc(glycoletter_count_matrix)

In [None]:
show_doc(find_isomorphs)

In [None]:
# Example call on a single branched sequence.
example_glycan = 'Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc'
find_isomorphs(example_glycan)

In [None]:
show_doc(link_find)

In [None]:
# Example call on a single branched sequence.
example_glycan = 'Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc'
link_find(example_glycan)

In [None]:
show_doc(motif_matrix)

In [None]:
# Three example sequences with a hard-coded 0/1 'eukaryotic' label each; the
# result is stored in `out` so that the following cell can render it.
glycans = [
    'Man(a1-3)[Man(a1-6)][Xyl(b1-2)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)]GlcNAc',
    'Man(a1-2)Man(a1-2)Man(a1-3)[Man(a1-3)Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc',
    'GalNAc(a1-4)GlcNAcA(a1-4)[GlcN(b1-7)]Kdo(a2-5)[Kdo(a2-4)]Kdo(a2-6)GlcN4P(b1-6)GlcN4P',
]
label = [1, 1, 0]
columns = {'glycan': glycans, 'eukaryotic': label}
test_df = pd.DataFrame(columns)
print("Glyco-Motif Test")
out = motif_matrix(test_df, 'glycan', 'eukaryotic')

In [None]:
#hide_input
# Styler.render() was deprecated in pandas 1.4 and removed in 2.0; prefer
# Styler.to_html() and fall back to render() only on older pandas versions.
styled = out.style.set_properties(**{'font-size': '11pt', 'font-family': 'Helvetica','border-collapse': 'collapse','border': '1px solid black'})
HTML(styled.to_html() if hasattr(styled, 'to_html') else styled.render())

In [None]:
show_doc(match_composition)

In [None]:
# Example call with a three-entry composition dict and mode = "exact".
composition = {"Glc":1, "Gal":1, "Neu5Ac":1}
match_composition(composition, "Homo_sapiens", "Species", mode = "exact")

In [None]:
show_doc(match_composition_relaxed)

In [None]:
# Example call with a composition dict keyed by Hex / HexNAc / dHex.
composition = {"Hex":3, "HexNAc":2, "dHex":1}
match_composition_relaxed(composition, "Homo_sapiens", "Species")

In [None]:
show_doc(condense_composition_matching)

In [None]:
# Print the matches returned by match_composition_relaxed, then pass them to
# condense_composition_matching.
composition = {'Hex':1, 'HexNAc':1, 'Neu5Ac':1}
match_comp = match_composition_relaxed(
    composition,
    'Homo_sapiens',
    'Species',
    reducing_end = 'GalNAc',
)
print(match_comp)
condense_composition_matching(match_comp)

In [None]:
show_doc(compositions_to_structures)

In [None]:
#hide
from nbdev.export import notebook2script; notebook2script()