In [None]:
#hide
#default_exp glycan_data
from nbdev.showdoc import show_doc
from IPython.display import HTML
%load_ext autoreload
%autoreload 2

# glycan_data

>Loading and handling glycan datasets

In [None]:
#export
from glycowork.glycan_data.loader import *
from glycowork.glycan_data.data_entry import *

`glycan_data` contains several helper functions for glycan data loading and data entry. Helper functions for data loading and data objects are in `loader` and include:
- `unwrap` flattens nested lists
- `find_nth` returns the starting index of the n-th occurrence of a motif in a string
- `df_species` loaded file for all glycans with species information, one row per glycan-species combination
- `df_glycan` loaded file containing all unique glycans with lots of meta-information
- `df_glysum` loaded glycan substitution matrix for glycan alignment
- `lib` library of unique monosaccharides + linkages in stored datasets; generated with `get_lib`

# loader

In [None]:
show_doc(unwrap)

<h4 id="unwrap" class="doc_header"><code>unwrap</code><a href="https://github.com/BojarLab/glycoworkglycowork/glycan_data/loader.py#L20" class="source_link" style="float:right">[source]</a></h4>

> <code>unwrap</code>(**`nested_list`**)

converts a nested list into a flat list

In [None]:
# Example: flatten a list of two sub-lists into one flat list
unwrap([[1,2], [3,4]])

[1, 2, 3, 4]

In [None]:
show_doc(find_nth)

<h4 id="find_nth" class="doc_header"><code>find_nth</code><a href="https://github.com/BojarLab/glycoworkglycowork/glycan_data/loader.py#L25" class="source_link" style="float:right">[source]</a></h4>

> <code>find_nth</code>(**`haystack`**, **`needle`**, **`n`**)

finds n-th instance of motif

haystack -- string to search for motif

needle -- motif

n -- n-th occurrence in string


returns starting index of n-th occurrence in string 

In [None]:
# Example: starting index of the second occurrence of 'as' in the sentence
find_nth('This is as good as it gets', 'as', 2)

16

In [None]:
# Preview the first five rows of df_species, rendered as an HTML table
HTML(df_species.head().to_html())

Unnamed: 0,target,species,genus,family,order,class,phylum,kingdom,domain,ref
0,Man(a1-3)[Man(a1-6)][Xyl(b1-2)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)]GlcNAc,Acer_pseudoplatanus,Acer,Sapindaceae,Sapindales,Dicotyledons,Angiosperms,Plantae,Eukarya,
1,GlcNAc(b1-2)Man(a1-3)[Man(a1-6)][Xyl(b1-2)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)]GlcNAc,Acer_pseudoplatanus,Acer,Sapindaceae,Sapindales,Dicotyledons,Angiosperms,Plantae,Eukarya,
2,GlcNAc(b1-2)Man(a1-6)[Man(a1-3)][Xyl(b1-2)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)]GlcNAc,Acer_pseudoplatanus,Acer,Sapindaceae,Sapindales,Dicotyledons,Angiosperms,Plantae,Eukarya,
3,Fuc(a1-6)[Gal(b1-4)]GlcNAc(b1-2)Man(a1-6)[GlcNAc(b1-2)Man(a1-3)][Xyl(b1-2)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)]GlcNAc,Acer_pseudoplatanus,Acer,Sapindaceae,Sapindales,Dicotyledons,Angiosperms,Plantae,Eukarya,
4,GlcNAc(b1-2)Man(a1-3)[GlcNAc(b1-2)Man(a1-6)][Xyl(b1-2)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)]GlcNAc,Acer_pseudoplatanus,Acer,Sapindaceae,Sapindales,Dicotyledons,Angiosperms,Plantae,Eukarya,


In [None]:
# Preview the first five rows of df_glycan, rendered as an HTML table
HTML(df_glycan.head().to_html())

Unnamed: 0,glycan_id,glycan,species,immunogenicity,inferred_origin,link,genus,family,order,class,phylum,kingdom,domain,WURCS,glytoucan_acc
0,1,Glc(a1-1)[L-Xyl(b1-3)L-Xyl(b1-3)L-Xyl(b1-3)L-Xyl(b1-3)L-Xyl(b1-3)L-Xyl(b1-4)L-Aco(a1-3)Gal(b1-3)Glc(b1-4)]Glc,[],,Bacteria,,[],[],[],[],[],[],[],,
1,2,GlcNAc(b1-2)[Gal(b1-3)[Neu5Ac(a2-6)]GlcNAc(b1-4)]Man(a1-3)[GlcNAc(b1-2)Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc,[],,Chordata,,[],[],[],[],[],[],[],"WURCS=2.0/2,5,4/[a2122h-1b_1-5][a2211m-1a_1-5]/1-2-2-2-2/a2-b1_b4-c1_c3-d1_c4-e1",G52117LP
2,3,Man(b1-2)Man(b1-2)D-4dLyxHexOMe,['Candida_albicans'],,,,['Candida'],['Saccharomycetaceae'],['Saccharomycetales'],['Saccharomycetes'],['Ascomycota'],['Fungi'],['Eukarya'],,
3,4,Man(a1-3)[Man(a1-3)[Man(a1-6)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)Glc,[],,Eukarya,,[],[],[],[],[],[],[],"WURCS=2.0/5,11,10/[a2122h-1b_1-5_2*NCC/3=O][a1122h-1b_1-5][a1122h-1a_1-5][a2112h-1b_1-5][a1221m-1a_1-5]/1-1-2-3-1-1-4-3-1-1-5/a4-b1_a6-k1_b4-c1_c3-d1_c6-h1_d2-e1_d4-f1_f3-g1_h2-i1_h6-j1",G46712PI
4,5,GalA(a1-4)GalA(a1-4)GalA(a1-2)[GalOMe(a1-4)GalOMe(a1-4)]Rha,[],,Angiosperms,,[],[],[],[],[],[],[],"WURCS=2.0/2,2,1/[a2122A-1x_1-5_6*OC][a2122h-1a_1-5]/1-2/a4-b1",G69176HJ


# data_entry

Helper functions for entering data are contained in `data_entry` and include:

- `check_presence` tests whether the entered glycan is already in the database

In [None]:
show_doc(check_presence)

<h4 id="check_presence" class="doc_header"><code>check_presence</code><a href="https://github.com/BojarLab/glycoworkglycowork/glycan_data/data_entry.py#L5" class="source_link" style="float:right">[source]</a></h4>

> <code>check_presence</code>(**`glycan`**, **`df`**, **`colname`**=*`'target'`*, **`libr`**=*`None`*, **`name`**=*`None`*, **`rank`**=*`'species'`*, **`fast`**=*`False`*)

checks whether glycan (of that species) is already present in dataset

glycan -- IUPACcondensed glycan sequence (string)

df -- glycan dataframe where glycans are under colname and ideally taxonomic labels are columns

libr -- sorted list of unique glycoletters observed in the glycans of our dataset

name -- name of the species (etc.) of interest; string

rank -- column name for filtering; default: species

fast -- True uses precomputed glycan graphs, only use if df has column 'graph' with glycan graphs


returns text output regarding whether the glycan is already in df

In [None]:
# Three lookups against df_species: a glycan, a variant of it that starts with
# Fuc(b1-3) instead of Man(a1-3), and the first glycan again restricted to a
# named species.
query_glycan = 'Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc'
variant_glycan = 'Fuc(b1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc'
species_name = 'Danielus Bojarum'

# 1) glycan only
print(f"Check presence of {query_glycan}")
check_presence(query_glycan, df_species)

# 2) variant glycan only
print(f"Check presence of {variant_glycan}")
check_presence(variant_glycan, df_species)

# 3) first glycan, filtered by species name
print(f"Check presence of {query_glycan} in the species {species_name}")
check_presence(query_glycan, df_species, name = species_name)

Check presence of Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc
Glycan already in dataset.
Check presence of Fuc(b1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc
It's your lucky day, this glycan is new!
Check presence of Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc in the species Danielus Bojarum
This is the best: Danielus_Bojarum is not in dataset
It's your lucky day, this glycan is new!


In [None]:
#hide
# Run nbdev's notebook-to-script export; it reports one "Converted ..." line
# per notebook it processes.
from nbdev.export import notebook2script

notebook2script()

Converted 00_core.ipynb.
Converted 01_alignment.ipynb.
Converted 02_glycan_data.ipynb.
Converted 03_ml.ipynb.
Converted 04_motif.ipynb.
Converted index.ipynb.
