In [None]:
#hide
#default_exp ml
from nbdev.showdoc import show_doc
from IPython.display import HTML
%load_ext autoreload
%autoreload 2

# ml
>data processing and model training + analysis using machine learning

In [None]:
#export
#from glycowork.ml.model_training import *
#from glycowork.ml.models import *
#from glycowork.ml.processing import *
#from glycowork.ml.representation import *
from glycowork.ml.train_test_split import *
from glycowork.glycan_data.loader import df_species

`ml` contains the code base to process glycans for machine learning, construct state-of-the-art machine learning models, train them, and analyze trained models + glycan representations. It currently contains the following modules:

- `model_training` contains functions for training machine learning models
- `models` describes some examples for machine learning architectures applicable to glycans
- `processing` contains helper functions to prepare glycan data for model training
- `representation` can be used to analyze trained models and to obtain glycan representations
- `train_test_split` contains various data split functions to get appropriate training and test sets

# model_training
>contains functions for training machine learning models

In [None]:
#requires torch_geometric
#show_doc(EarlyStopping)

In [None]:
#requires torch_geometric
#show_doc(train_model)

In [None]:
#requires torch_geometric
#show_doc(init_weights)

In [None]:
#requires torch_geometric
#show_doc(prep_model)

In [None]:
#requires torch_geometric
#show_doc(training_setup)

# models
>describes some examples for machine learning architectures applicable to glycans

In [None]:
#requires torch_geometric
#show_doc(SweetNet)

# processing
>contains helper functions to prepare glycan data for model training

In [None]:
#requires torch_geometric
#show_doc(dataset_to_graphs)

In [None]:
#requires torch_geometric
#show_doc(dataset_to_dataloader)

In [None]:
#requires torch_geometric
#show_doc(split_data_to_train)

# representation
>can be used to analyze trained models and to obtain glycan representations

In [None]:
#requires torch_geometric
#show_doc(glycans_to_emb)

# train_test_split
>contains various data split functions to get appropriate training and test sets

In [None]:
#hide
# Render the signature and docstring of hierarchy_filter as this cell's output.
# NOTE(review): nbdev's hide flag may drop the cell output from the built docs as well as the input -- confirm that is intended.
show_doc(hierarchy_filter)

<h4 id="hierarchy_filter" class="doc_header"><code>hierarchy_filter</code><a href="https://github.com/BojarLab/glycowork/tree/master/glycowork/ml/train_test_split.py#L34" class="source_link" style="float:right">[source]</a></h4>

> <code>hierarchy_filter</code>(**`df_in`**, **`rank`**=*`'Domain'`*, **`min_seq`**=*`5`*, **`wildcard_seed`**=*`False`*, **`wildcard_list`**=*`None`*, **`wildcard_name`**=*`None`*, **`r`**=*`0.1`*, **`col`**=*`'target'`*)

stratified data split in train/test at the taxonomic level, removing duplicate glycans and infrequent classes

df_in -- dataframe of glycan sequences and taxonomic labels

rank -- which rank should be filtered; default is 'Domain'

min_seq -- how many glycans need to be present in class to keep it; default is 5

wildcard_seed -- set to True if you want to seed wildcard glycoletters; default is False

wildcard_list -- list which glycoletters a wildcard encompasses

wildcard_name -- how the wildcard should be named in the IUPACcondensed nomenclature

r -- rate of replacement, default is 0.1 or 10%

col -- column name for glycan sequences; default: target


returns train_x, val_x (lists of glycans (strings) after stratified shuffle split)

train_y, val_y (lists of taxonomic labels (mapped integers))

id_val (taxonomic labels in text form (strings))

class_list (list of unique taxonomic classes (strings))

class_converter (dictionary to map mapped integers back to text labels)

In [None]:
# Stratified train/validation split of df_species at the 'Kingdom' rank;
# the seven return values are unpacked in the order documented for hierarchy_filter.
(train_x, val_x,
 train_y, val_y,
 id_val, class_list, class_converter) = hierarchy_filter(df_species, rank='Kingdom')

# Preview the first ten training glycans.
print(train_x[:10])

['Glc(b1-6)GlcNAc(a1-6)GlcNAc(a1-4)GalNAc(a1-3)GlcNAc(b1-2)Glc', 'Rha(a1-4)Rha(a1-2)[Rha(a1-6)]Glc', 'Glc(b1-3)Glc(b1-3)[Glc(b1-6)]Glc(b1-3)Glc-ol', 'GalOS(b1-3)GlcNAcOS(b1-2)ManOS', 'Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-2)[Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-6)]Man(a1-6)[Gal(b1-4)GlcNAc(b1-2)[Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc', 'Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-2)[Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)[Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-6)[Gal(b1-4)GlcNAc(b1-2)]Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc', 'GalNAc(b1-6)[Rha(a1-3)][Gal(b1-3)]ManNAc(b1-4)GlcA(b1-3)GalNAc', 'Gal(b1-4)GlcNAc(b1-2)[Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)[Gal(b1-4)GlcNAc(b1-2)[Gal(b1-4)GlcNAc(b1-6)]Man(a1-6)][GlcNAc(b1-4)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc', 'Glc(b1-4)[Rha(a1-3)][Glc(b1-6)]Glc(a1-3)Rha(b1-4)Glc', 'GalOS(b1-4)[Fuc(a1-3)]GlcNAc(b1-6)[Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-3)]GalNAc']


In [None]:
#hide
# Render the signature and docstring of seed_wildcard_hierarchy as this cell's output.
# NOTE(review): nbdev's hide flag may drop the cell output from the built docs as well as the input -- confirm that is intended.
show_doc(seed_wildcard_hierarchy)

<h4 id="seed_wildcard_hierarchy" class="doc_header"><code>seed_wildcard_hierarchy</code><a href="https://github.com/BojarLab/glycowork/tree/master/glycowork/ml/train_test_split.py#L9" class="source_link" style="float:right">[source]</a></h4>

> <code>seed_wildcard_hierarchy</code>(**`glycans`**, **`labels`**, **`wildcard_list`**, **`wildcard_name`**, **`r`**=*`0.1`*)

adds dataframe rows in which glycan parts have been replaced with the appropriate wildcards

glycans -- list of IUPACcondensed glycan sequences (string)

labels -- list of labels used for prediction

wildcard_list -- list which glycoletters a wildcard encompasses

wildcard_name -- how the wildcard should be named in the IUPACcondensed nomenclature

r -- rate of replacement, default is 0.1 or 10%


returns list of glycans (strings) and labels (flexible) where some glycan parts have been replaced with wildcard_name

In [None]:
# `linkages` is not imported by name anywhere in this notebook; import it explicitly so the
# example runs on a fresh kernel instead of depending on leaked or star-imported state.
from glycowork.glycan_data.loader import linkages

# Kingdom-level split with wildcard seeding enabled: glycoletters listed in
# `wildcard_list` are replaced by `wildcard_name` at the default replacement rate r.
train_x, val_x, train_y, val_y, id_val, class_list, class_converter = hierarchy_filter(
    df_species,
    rank = 'Kingdom',
    wildcard_seed = True,
    wildcard_list = linkages,
    wildcard_name = 'bond')

# Show the last ten training glycans, where the seeded '(bond)' wildcards are visible.
print(train_x[-10:])

['[Glc(b1-6)]Gal(bond)Glc(b1-4)GlcOLac(b1-4)Gal(bond)GalOLac(bond)Gal', '[Glc(b1-6)]Gal(a1-4)Glc(bond)GlcOLac(bond)Gal(a1-4)GalOLac(a1-4)Gal', 'Araf(bond)Man(bond)Man(a1-6)[Man(bond)]Man', '[GalNAcGroP(b1-4)]Gal(bond)Glc(b1-3)GalNAc(b1-3)Gal', 'Gal(b1-6)Man(a1-6)[Man(a1-3)]Man(bond)GlcNAc(bond)[Fuc(a1-6)]GlcNAc', 'GalASer(bond)GalA(a1-3)GalAAla(a1-3)GlcNAc', 'Man(a1-2)Man(a1-6)[Galf(b1-2)][Man(a1-3)]Man(a1-6)[Man(a1-2)Man(a1-3)]Man(bond)GlcNAc(bond)GlcNAc', '6dAltf(b1-3)[6dAltf(a1-2)]6dAltf(bond)6dAltf(a1-2)6dAltf', '6dAltf(bond)[6dAltf(a1-2)]6dAltf(b1-2)6dAltf(a1-2)6dAltf', 'LDManHep(bond)6dTalOAc(bond)FucNAc(a1-2)LDManHep']


In [None]:
#hide
# Render the signature and docstring of general_split as this cell's output.
# NOTE(review): nbdev's hide flag may drop the cell output from the built docs as well as the input -- confirm that is intended.
show_doc(general_split)

<h4 id="general_split" class="doc_header"><code>general_split</code><a href="https://github.com/BojarLab/glycowork/tree/master/glycowork/ml/train_test_split.py#L97" class="source_link" style="float:right">[source]</a></h4>

> <code>general_split</code>(**`glycans`**, **`labels`**, **`test_size`**=*`0.2`*)

splits glycans and labels into train / test sets

glycans -- list of IUPACcondensed glycan sequences (string)

labels -- list of labels used for prediction

test_size -- % size of test set; default is 0.2 / 20%


returns X_train, X_test, y_train, y_test

In [None]:
# Split glycan sequences (inputs) and their species (labels) into train / test
# sets, using the default test_size of general_split.
train_x, val_x, train_y, val_y = general_split(
    df_species['target'].values.tolist(),
    df_species['Species'].values.tolist(),
)

# Preview the first ten training glycans.
print(train_x[:10])

['Man(a1-3)[Man(a1-6)]Man(a1-6)[Man(a1-2)Man(a1-3)]Man(b1-4)GlcNAc(b1-4)GlcNAc', 'XlufOMe(b1-4)Xyl(b1-4)[Gal(a1-2)][RhaOMe(a1-3)]Fuc(a1-3)[XylOMe(b1-4)]Glc', 'GalAGroN(b1-3)GalNAc(a1-4)GlcA(b1-3)GlcNAc(b1-2)GalAGroN', 'FucNAc(a1-3)QuiNAc(a1-8)8eLeg5Ac7Ac', 'GlcOAc(b1-3)[Glc(b1-2)]GlcOFer(a1-2)FrufOBzOFer', 'Gal(b1-4)Glc(b1-6)GlcOAcGroP(a1-4)Gal(a1-3)Gal', 'Gal(b1-4)Glc(b1-3)Man(b1-4)GlcA(b1-4)Gal', 'Man(b1-4)Man(b1-4)Glc', '[YerOAc(a1-4)]GalNAc(b1-3)GalNAc(a1-3)GalNAc', '[GlcA(b1-3)]GalA(a1-2)Rha(a1-4)GalA']


In [None]:
#hide
# Convert the project's notebooks into their library modules.
from nbdev.export import notebook2script

notebook2script()

Converted 00_core.ipynb.
Converted 01_alignment.ipynb.
Converted 02_glycan_data.ipynb.
Converted 03_ml.ipynb.
Converted 04_motif.ipynb.
Converted index.ipynb.
