In [None]:
#hide
#default_exp ml
from nbdev.showdoc import show_doc
from IPython.display import HTML
%load_ext autoreload
%autoreload 2

# ml
>data processing and model training + analysis using machine learning

In [None]:
#export
#from glycowork.ml.model_training import *
#from glycowork.ml.models import *
#from glycowork.ml.processing import *
#from glycowork.ml.representation import *
from glycowork.ml.train_test_split import *
from glycowork.glycan_data.loader import df_species

`ml` contains the code base to process glycans for machine learning, construct state-of-the-art machine learning models, train them, and analyze trained models + glycan representations. It currently contains the following modules:

- `model_training` contains functions for training machine learning models
- `models` describes some examples for machine learning architectures applicable to glycans
- `processing` contains helper functions to prepare glycan data for model training
- `representation` can be used to analyze trained models and to obtain glycan representations
- `train_test_split` contains various data split functions to get appropriate training and test sets

# model_training
>contains functions for training machine learning models

In [None]:
#requires torch_geometric
#show_doc(EarlyStopping)

In [None]:
#requires torch_geometric
#show_doc(train_model)

In [None]:
#requires torch_geometric
#show_doc(init_weights)

In [None]:
#requires torch_geometric
#show_doc(prep_model)

# models
>describes some examples for machine learning architectures applicable to glycans

In [None]:
#requires torch_geometric
#show_doc(SweetNet)

# processing
>contains helper functions to prepare glycan data for model training

In [None]:
#requires torch_geometric
#show_doc(dataset_to_graphs)

In [None]:
#requires torch_geometric
#show_doc(dataset_to_dataloader)

In [None]:
#requires torch_geometric
#show_doc(split_data_to_train)

# representation
>can be used to analyze trained models and to obtain glycan representations

In [None]:
#requires torch_geometric
#show_doc(glycans_to_emb)

# train_test_split
>contains various data split functions to get appropriate training and test sets

In [None]:
# Render the signature and docstring of `hierarchy_filter` as documentation.
show_doc(hierarchy_filter)

<h4 id="hierarchy_filter" class="doc_header"><code>hierarchy_filter</code><a href="https://github.com/BojarLab/glycowork/blob/master/glycowork/ml/train_test_split.py#L34" class="source_link" style="float:right">[source]</a></h4>

> <code>hierarchy_filter</code>(**`df_in`**, **`rank`**=*`'domain'`*, **`min_seq`**=*`5`*, **`wildcard_seed`**=*`False`*, **`wildcard_list`**=*`None`*, **`wildcard_name`**=*`None`*, **`r`**=*`0.1`*, **`col`**=*`'target'`*)

stratified data split in train/test at the taxonomic level, removing duplicate glycans and infrequent classes

df_in -- dataframe of glycan sequences and taxonomic labels

rank -- which rank should be filtered; default is 'domain'

min_seq -- how many glycans need to be present in class to keep it; default is 5

wildcard_seed -- set to True if you want to seed wildcard glycoletters; default is False

wildcard_list -- list which glycoletters a wildcard encompasses

wildcard_name -- how the wildcard should be named in the IUPACcondensed nomenclature

r -- rate of replacement, default is 0.1 or 10%

col -- column name for glycan sequences; default: target


returns train_x, val_x (lists of glycans (strings) after stratified shuffle split)

train_y, val_y (lists of taxonomic labels (mapped integers))

id_val (taxonomic labels in text form (strings))

class_list (list of unique taxonomic classes (strings))

class_converter (dictionary to map mapped integers back to text labels)

In [None]:
# Example: stratified train/test split of `df_species` at the 'kingdom' rank.
# The seven return values are unpacked in the order listed in the documentation above.
(train_x, val_x, train_y, val_y,
 id_val, class_list, class_converter) = hierarchy_filter(df_species, rank='kingdom')
# Show the first ten training glycans.
print(train_x[:10])

['ManNAcOPEtn(b1-4)ManNAcOPEtn(b1-4)ManNAc(b1-3)ManNAcOPEtn', 'Gal(b1-4)GalNAc(a1-3)QuiNAc', 'Rha(a1-3)Rha(b1-4)GlcNAc(b1-3)[ManNAc(b1-2)]Rha(a1-3)Rha', 'GalOS(b1-4)GlcNAc(b1-2)[GalOS(b1-4)GlcNAc(b1-4)]Man(a1-3)[Gal(b1-4)GlcNAc(b1-2)Man(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc', 'GalOS(b1-4)GlcNAc(b1-3)[Neu5Ac(a2-6)]GalNAc', 'Gal(a1-4)Gal(b1-4)GlcNAc(b1-2)[Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)[Gal(a1-4)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)][GlcNAc(b1-4)]Man(b1-4)GlcNAc(b1-4)GlcNAc', 'Glc(b1-3)D-FucNAc(a1-4)GalNAc(a1-3)GalNAc', 'FrufOAc(a1-2)Fruf(b1-2)FrufOAc', 'Man(a1-3)[Man(a1-6)][Xyl(b1-2)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-3)]GlcNAc', 'Man(a1-2)Man(a1-2)D-Rha(a1-3)D-Rha(a1-3)Man']


In [None]:
# Render the signature and docstring of `seed_wildcard_hierarchy` as documentation.
show_doc(seed_wildcard_hierarchy)

<h4 id="seed_wildcard_hierarchy" class="doc_header"><code>seed_wildcard_hierarchy</code><a href="https://github.com/BojarLab/glycowork/blob/master/glycowork/ml/train_test_split.py#L9" class="source_link" style="float:right">[source]</a></h4>

> <code>seed_wildcard_hierarchy</code>(**`glycans`**, **`labels`**, **`wildcard_list`**, **`wildcard_name`**, **`r`**=*`0.1`*)

adds dataframe rows in which glycan parts have been replaced with the appropriate wildcards

glycans -- list of IUPACcondensed glycan sequences (string)

labels -- list of labels used for prediction

wildcard_list -- list which glycoletters a wildcard encompasses

wildcard_name -- how the wildcard should be named in the IUPACcondensed nomenclature

r -- rate of replacement, default is 0.1 or 10%


returns list of glycans (strings) and labels (flexible) where some glycan parts have been replaced with wildcard_name

In [None]:
# `linkages` is imported by name here because the import cell of this notebook
# does not import it explicitly; without this line the example only works if the
# name happens to leak in through `from glycowork.ml.train_test_split import *`.
from glycowork.glycan_data.loader import linkages

# Example: kingdom-level split with wildcard seeding enabled. Glycoletters contained
# in `linkages` are replaced by the wildcard name 'bond' at the default rate r = 0.1.
(train_x, val_x, train_y, val_y,
 id_val, class_list, class_converter) = hierarchy_filter(
    df_species,
    rank='kingdom',
    wildcard_seed=True,
    wildcard_list=linkages,
    wildcard_name='bond',
)
# Show the last ten training glycans.
print(train_x[-10:])

['GlcA(b1-2)Man(a1-4)[Araf(bond)]GlcA(b1-2)[Araf(a1-5)Araf(bond)]Man(a1-4)GlcA', 'Glc(b1-3)D-FucNAc(a1-4)GalNAc(bond)GalNAc', 'DDManHep(a1-6)GlcN(a1-4)[DDManHep(bond)]GalA(a1-3)[GalA(b1-7)LDManHep(a1-7)]LDManHepOPEtn(a1-3)[Glc(b1-4)]LDManHep(a1-5)[AraN(b1-8)][Kdo(a2-4)]Kdo', 'DDManHep(a1-6)GlcN(bond)[DDManHep(a1-2)]GalA(a1-3)[GalA(b1-7)LDManHep(a1-7)]LDManHepOPEtn(a1-3)[Glc(b1-4)]LDManHep(a1-5)[AraN(b1-8)][Kdo(a2-4)]Kdo', 'Fuc(a1-3)GlcNAc(bond)DDManHep(a1-2)[Gal(b1-7)]DDManHep(a1-2)LDManHep(a1-3)LDManHepOPEtn(a1-5)Kdo', 'Man(a1-2)Man(a1-2)[Man(bond)Man(bond)]Man(a1-3)[Man(a1-2)Man(bond)[Man(a1-3)]Man(bond)]Man(b1-4)GlcNAc', 'Neu5Ac(a2-6)Gal(b1-4)GlcNAc(b1-2)[Gal(b1-4)GlcNAc(b1-4)]Man(a1-3)[Gal(b1-4)GlcNAc(b1-2)Man(bond)]Man(b1-4)GlcNAc(b1-4)[Fuc(bond)]GlcNAc', 'Neu5Ac(a2-3)Gal(b1-4)[Fuc(a1-3)]GlcNAc(bond)[GlcNAc(b1-3)]GalNAc', 'GalNAc(bond)GalOS(b1-3)GlcNAc(b1-2)Man(bond)[ManOMe(a1-6)]Man(b1-4)GlcNAc(b1-4)[Fuc(a1-6)]GlcNAc', 'Glc(a1-3)[Glc(bond)]Glc(a1-3)Glc']


In [None]:
# Render the signature and docstring of `general_split` as documentation.
show_doc(general_split)

<h4 id="general_split" class="doc_header"><code>general_split</code><a href="https://github.com/BojarLab/glycowork/blob/master/glycowork/ml/train_test_split.py#L96" class="source_link" style="float:right">[source]</a></h4>

> <code>general_split</code>(**`glycans`**, **`labels`**, **`test_size`**=*`0.2`*)

splits glycans and labels into train / test sets

glycans -- list of IUPACcondensed glycan sequences (string)

labels -- list of labels used for prediction

test_size -- % size of test set; default is 0.2 / 20%


returns X_train, X_test, y_train, y_test

In [None]:
# Example: plain train/test split of all glycans in `df_species`, using the
# `species` column as labels and the default test_size of 0.2.
train_x, val_x, train_y, val_y = general_split(
    df_species['target'].values.tolist(),
    df_species['species'].values.tolist(),
)
# Show the first ten training glycans.
print(train_x[:10])

['Glc(b1-6)Glc(b1-4)Glc(b1-3)Glc(b1-4)Glc', 'D-ApifOMe(b1-3)XylOMe(b1-4)RhaOMe(a1-2)D-FucOMe', 'Rha(a1-3)Glc(b1-3)Ara', 'Gal(a1-4)Gal(b1-4)GlcNAc(b1-4)[Gal(b1-4)GlcNAc(b1-2)]Man(a1-3)[Gal(a1-4)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)][GlcNAc(b1-4)]Man(b1-4)GlcNAc(b1-4)GlcNAc', 'Gal(b1-2)Gal(a1-3)[Fuc(a1-2)]Gal(b1-3)[GlcNAc(b1-6)]GalNAc', 'GalASer(a1-4)GalA(a1-3)GalAAla(a1-3)GlcNAc(b1-3)GalASer(a1-4)GalA(a1-3)GalAAla(a1-3)GlcNAc', 'Fuc(a1-2)Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-3)Gal(b1-4)Glc', 'Neu5Ac(a2-6)Gal(b1-4)GlcNAc(b1-3)Gal(b1-4)GlcNAc', 'Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-3)Gal(b1-4)Glc(b1-4)[GlcNAc(a1-2)[Glc(a1-3)]LDManHepOPEtn(a1-3)]LDManHep(a1-5)[Kdo(2-4)]Kdo', 'GlcNCmOCm(b1-4)GlcNAc(b1-4)GlcNAc(b1-4)GlcNAc(b1-4)[FucOAcOMe(a1-6)]GlcNAc']
