In [None]:
#hide
#default_exp ml
from nbdev.showdoc import show_doc
from IPython.display import HTML
%load_ext autoreload
%autoreload 2

# ml
>data processing and model training + analysis using machine learning

In [None]:
#from glycowork.ml.model_training import *
#from glycowork.ml.models import *
#from glycowork.ml.processing import *
#from glycowork.ml.representation import *
from glycowork.ml.train_test_split import *
from glycowork.glycan_data.loader import df_species

`ml` contains the code base to process glycans for machine learning, construct state-of-the-art machine learning models, train them, and analyze trained models and glycan representations. It currently contains the following modules:

- `model_training` contains functions for training machine learning models
- `models` describes some examples for machine learning architectures applicable to glycans
- `processing` contains helper functions to prepare glycan data for model training
- `representation` can be used to analyze trained models and to obtain glycan representations
- `train_test_split` contains various data split functions to get appropriate training and test sets

# model_training
>contains functions for training machine learning models

In [None]:
#requires torch_geometric
#show_doc(EarlyStopping)

In [None]:
#requires torch_geometric
#show_doc(train_model)

In [None]:
#requires torch_geometric
#show_doc(init_weights)

In [None]:
#requires torch_geometric
#show_doc(prep_model)

# models
>describes some examples for machine learning architectures applicable to glycans

In [None]:
#requires torch_geometric
#show_doc(SweetNet)

# processing
>contains helper functions to prepare glycan data for model training

In [None]:
#requires torch_geometric
#show_doc(dataset_to_graphs)

In [None]:
#requires torch_geometric
#show_doc(dataset_to_dataloader)

In [None]:
#requires torch_geometric
#show_doc(split_data_to_train)

# representation
>can be used to analyze trained models and to obtain glycan representations

In [None]:
#requires torch_geometric
#show_doc(glycans_to_emb)

# train_test_split
>contains various data split functions to get appropriate training and test sets

In [None]:
# Render the signature and docstring of hierarchy_filter (made available by the
# star import from glycowork.ml.train_test_split) as formatted documentation.
show_doc(hierarchy_filter)

<h4 id="hierarchy_filter" class="doc_header"><code>hierarchy_filter</code><a href="https://github.com/BojarLab/glycoworkglycowork/ml/train_test_split.py#L34" class="source_link" style="float:right">[source]</a></h4>

> <code>hierarchy_filter</code>(**`df_in`**, **`rank`**=*`'domain'`*, **`min_seq`**=*`5`*, **`wildcard_seed`**=*`False`*, **`wildcard_list`**=*`None`*, **`wildcard_name`**=*`None`*, **`r`**=*`0.1`*, **`col`**=*`'target'`*)

stratified data split in train/test at the taxonomic level, removing duplicate glycans and infrequent classes

df_in -- dataframe of glycan sequences and taxonomic labels

rank -- which rank should be filtered; default is 'domain'

min_seq -- how many glycans need to be present in class to keep it; default is 5

wildcard_seed -- set to True if you want to seed wildcard glycoletters; default is False

wildcard_list -- list which glycoletters a wildcard encompasses

wildcard_name -- how the wildcard should be named in the IUPACcondensed nomenclature

r -- rate of replacement, default is 0.1 or 10%

col -- column name for glycan sequences; default: target


returns train_x, val_x (lists of glycans (strings) after stratified shuffle split)

train_y, val_y (lists of taxonomic labels (mapped integers))

id_val (taxonomic labels in text form (strings))

class_list (list of unique taxonomic classes (strings))

class_converter (dictionary to map mapped integers back to text labels)

In [None]:
# Stratified train/validation split of df_species at the 'kingdom' rank.
# hierarchy_filter hands back seven values; the call and the unpacking are kept
# as two steps so that each line stays short enough to read at a glance.
kingdom_split = hierarchy_filter(df_species, rank='kingdom')
train_x, val_x, train_y, val_y, id_val, class_list, class_converter = kingdom_split

# Peek at the first ten training glycans.
print(train_x[:10])

['Gal(a1-4)Neu5Ac(a2-3)Gal(b1-3)GalNAcOP', 'L-GulA(a1-4)Man(b1-4)Man', '6dTalNAc(a1-3)GlcNAc(a1-3)FucNAc(a1-3)GlcNAc(b1-4)6dTalNAc', 'D-FucNAc(a1-4)ManNAcA(b1-4)GlcNAc(a1-3)Glc(b1-3)[Gal(a1-2)Gal(a1-2)]Glc(a1-3)Glc(a1-3)[GlcN(a1-7)LDManHep(a1-7)]LDManHep(a1-3)LDManHepOPPEtn(a1-5)Kdo', 'GlcNAc(a1-2)[Glc(a1-3)]LDManHep(a1-3)[Glc(b1-4)]LDManHep(a1-5)[Kdo(a2-4)]Kdo', 'Xyl(b1-3)Xyl(b1-4)Rha(a1-2)[Rha(a1-3)]Xyl', 'Fuc(a1-3)[GalNAc(b1-4)]GlcNAc(b1-3)GalNAc', 'Gal(b1-6)Gal(b1-4)Gal', 'Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-4)[Neu5Ac(a2-6)Gal(b1-4)GlcNAc(b1-2)]Man(a1-3)[Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)]Man(b1-4)GlcNAc', 'Man(a1-2)Man(a1-3)[Galf(b1-6)]Man']


In [None]:
# Render the signature and docstring of seed_wildcard_hierarchy, the helper that
# hierarchy_filter's wildcard_* arguments refer to.
show_doc(seed_wildcard_hierarchy)

<h4 id="seed_wildcard_hierarchy" class="doc_header"><code>seed_wildcard_hierarchy</code><a href="https://github.com/BojarLab/glycoworkglycowork/ml/train_test_split.py#L9" class="source_link" style="float:right">[source]</a></h4>

> <code>seed_wildcard_hierarchy</code>(**`glycans`**, **`labels`**, **`wildcard_list`**, **`wildcard_name`**, **`r`**=*`0.1`*)

adds dataframe rows in which glycan parts have been replaced with the appropriate wildcards

glycans -- list of IUPACcondensed glycan sequences (string)

labels -- list of labels used for prediction

wildcard_list -- list which glycoletters a wildcard encompasses

wildcard_name -- how the wildcard should be named in the IUPACcondensed nomenclature

r -- rate of replacement, default is 0.1 or 10%


returns list of glycans (strings) and labels (flexible) where some glycan parts have been replaced with wildcard_name

In [None]:
# `linkages` is not among the names this notebook imports explicitly; it was only
# reachable if the star import from glycowork.ml.train_test_split happened to
# re-export it. Import it here so the cell also runs on a fresh kernel.
# NOTE(review): assumes `linkages` is defined in glycowork.glycan_data.loader
# (the module df_species is already imported from) — confirm.
from glycowork.glycan_data.loader import linkages

# Kingdom-level stratified split with wildcard seeding switched on: glycoletters
# contained in `linkages` may be replaced by the wildcard name 'bond'. The
# replacement rate `r` is left at its default (0.1).
train_x, val_x, train_y, val_y, id_val, class_list, class_converter = hierarchy_filter(
    df_species,
    rank='kingdom',
    wildcard_seed=True,
    wildcard_list=linkages,
    wildcard_name='bond',
)

# Show the last ten training glycans.
print(train_x[-10:])

['Neu5Ac(a2-6)Gal(b1-4)GlcNAc(b1-2)Man(a1-3)[Neu5Gc(bond)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc', '[Gal(a1-6)]GlcNAc(bond)GalNAc(b1-3)GlcNAc', 'GalNAcOS(b1-4)GlcNAc(b1-2)Man(a1-3)[Man(a1-3)[Man(bond)]Man(bond)]Man(b1-4)GlcNAc(b1-4)GlcNAc', 'FucNAla(b1-3)QuiN(b1-4)Glc(b1-4)QuiNAc(b1-2)QuiNAc(b1-2)Rha(bond)Rha(bond)Fuc(a1-4)[Glc(a1-2)]Fuc(a1-4)Fuc(a1-4)[Glc(bond)]Fuc(a1-4)[Glc(a1-2)]Fuc(a1-4)Fuc(a1-4)[Glc(bond)]Fuc(a1-4)[Glc(a1-2)]Fuc(a1-4)Fuc(a1-4)Glc(b1-4)[GlcNAc(bond)]Fuc(a1-4)QuiNAc(b1-4)[Man(a1-2)]Man(b1-4)[Man(a1-6)]Man(a1-5)Kdo-ol', '[GlcNAcOP(bond)Man(a1-2)]Man(bond)[Man(b1-2)[GlcOP(bond)]Man(a1-2)]Man(bond)[Man(a1-2)Man(b1-2)Man(b1-2)[GlcOP(bond)]Man(a1-2)]Man(bond)Man', 'Rha(a1-2)Gal(bond)GlcNAc(b1-3)[ManNAc(b1-2)]Rha(a1-2)Rha', 'Rha(a1-2)Gal(a1-3)GlcNAc(b1-3)[ManNAc(bond)]Rha(a1-2)Rha', 'Kdo(a2-8)Kdo(a2-4)Kdo(bond)GlcOPN(b1-6)GlcOPN', 'GlcN(a1-6)Man(bond)[GlcN(a1-6)Man(a1-6)]Man(a1-5)Kdo(a2-6)GlcN(b1-6)GlcN-ol', 'Man(a1-2)Man(a1-2)[Man(a1-6)Man(a

In [None]:
# Render the signature and docstring of general_split as formatted documentation.
show_doc(general_split)

<h4 id="general_split" class="doc_header"><code>general_split</code><a href="https://github.com/BojarLab/glycoworkglycowork/ml/train_test_split.py#L96" class="source_link" style="float:right">[source]</a></h4>

> <code>general_split</code>(**`glycans`**, **`labels`**, **`test_size`**=*`0.2`*)

splits glycans and labels into train / test sets

glycans -- list of IUPACcondensed glycan sequences (string)

labels -- list of labels used for prediction

test_size -- % size of test set; default is 0.2 / 20%


returns X_train, X_test, y_train, y_test

In [None]:
# Plain train/test split over every row of df_species: glycan sequences come
# from the `target` column, labels from the `species` column. test_size is not
# passed, so general_split uses its default.
all_glycans = df_species.target.values.tolist()
all_species = df_species.species.values.tolist()
train_x, val_x, train_y, val_y = general_split(all_glycans, all_species)

# Peek at the first ten training glycans.
print(train_x[:10])

['Xyl(b1-5)Araf(a1-5)Araf', '[GlcA(b1-2)]Man(a1-3)Man(a1-3)Man(a1-3)Man(a1-3)[GlcA(b1-2)]Man(a1-3)Man(a1-3)Man(a1-3)[GlcA(b1-2)]Man(a1-3)Man(a1-3)[Xyl(b1-4)Xyl(b1-4)]Man(a1-3)Man', 'Glc(b1-4)Man(b1-4)Man', '6dTal(a1-2)Rha(a1-5)Sug', 'Glc(a1-2)Glc(b1-6)[GlcNAc(a1-2)Glc(b1-4)][Glc(b1-3)]Glc(a1-5)Kdo', 'Rha(a1-2)Glc(b1-2)Xyl', 'Glc(b1-4)Glc(b1-4)Glc(b1-3)Glc', '[GlcNAcOPyr(b1-3)]GalNAc(a1-4)Glc(a1-4)Gal(b1-3)GalNAc(b1-4)GalNAc', 'GlcA(b1-4)GalNAcOAc(b1-4)GalNAc(a1-3)GalNAc(b1-4)GlcA', 'Gal(a1-6)Gal(a1-3)Gal']
