In [None]:
#hide
#default_exp ml
from nbdev.showdoc import show_doc
from IPython.display import HTML
%load_ext autoreload
%autoreload 2

# ml
>data processing and model training + analysis using machine learning

In [None]:
#export
from glycowork.ml.model_training import *
#from glycowork.ml.models import *
#from glycowork.ml.processing import *
#from glycowork.ml.representation import *
from glycowork.ml.train_test_split import *
from glycowork.glycan_data.loader import df_species, df_glycan
import warnings
warnings.filterwarnings("ignore")

`ml` contains the code base to process glycans for machine learning, construct state-of-the-art machine learning models, train them, and analyze trained models + glycan representations. It currently contains the following modules:

- `model_training` contains functions for training machine learning models
- `models` describes some examples for machine learning architectures applicable to glycans
- `processing` contains helper functions to prepare glycan data for model training
- `representation` can be used to analyze trained models and to obtain glycan representations
- `train_test_split` contains various data split functions to get appropriate training and test sets

# model_training
>contains functions for training machine learning models

In [None]:
# Render the nbdev-generated API documentation for EarlyStopping.
show_doc(EarlyStopping)

In [None]:
# Render the nbdev-generated API documentation for train_model.
show_doc(train_model)

In [None]:
# Render the nbdev-generated API documentation for training_setup.
show_doc(training_setup)

In [None]:
# Render the nbdev-generated API documentation for train_ml_model.
show_doc(train_ml_model)

In [None]:
# Worked example: binary classification of animal glycans by phylum.
# Filter the species table down to Kingdom == 'Animalia' once and reuse it.
animal_glycans = df_species[df_species.Kingdom=='Animalia']

# Label is 1 where the phylum is 'Chordata' and 0 for every other phylum.
vertebrate = [int(phylum == 'Chordata') for phylum in animal_glycans.Phylum.values.tolist()]

# Split the `target` column (inputs) together with the labels built above.
X_train, X_test, y_train, y_test = general_split(animal_glycans.target.values.tolist(), vertebrate)

# NOTE: X_test is rebound to the third value returned by train_ml_model
# (presumably the computed test features, since return_features = True -- confirm);
# the second returned value is discarded.
model_ft, _, X_test = train_ml_model(
    X_train, X_test, y_train, y_test,
    feature_calc = True,
    feature_set = ['exhaustive'],
    return_features = True,
)

In [None]:
# Render the nbdev-generated API documentation for analyze_ml_model.
show_doc(analyze_ml_model)

In [None]:
# Run analyze_ml_model on model_ft, the model returned by the train_ml_model example cell.
analyze_ml_model(model_ft)

In [None]:
# Render the nbdev-generated API documentation for get_mismatch.
show_doc(get_mismatch)

In [None]:
# Call get_mismatch with the trained model and the test data from the train_ml_model example.
# X_test here is the value rebound from train_ml_model's return, not the raw general_split output.
get_mismatch(model_ft, X_test, y_test)

# models
>describes some examples for machine learning architectures applicable to glycans

In [None]:
#requires torch_geometric
#show_doc(SweetNet)

In [None]:
#requires torch_geometric
#show_doc(init_weights)

In [None]:
#requires torch_geometric
#show_doc(prep_model)

# processing
>contains helper functions to prepare glycan data for model training

In [None]:
#requires torch_geometric
#show_doc(dataset_to_graphs)

In [None]:
#requires torch_geometric
#show_doc(dataset_to_dataloader)

In [None]:
#requires torch_geometric
#show_doc(split_data_to_train)

# representation
>can be used to analyze trained models and to obtain glycan representations

In [None]:
#requires torch_geometric
#show_doc(glycans_to_emb)

# train_test_split
>contains various data split functions to get appropriate training and test sets

In [None]:
# Render the nbdev-generated API documentation for hierarchy_filter.
show_doc(hierarchy_filter)

In [None]:
# Split the species table at taxonomic rank 'Kingdom'; hierarchy_filter
# returns seven values, unpacked here in the order they come back.
(train_x, val_x, train_y, val_y,
 id_val, class_list, class_converter) = hierarchy_filter(df_species, rank = 'Kingdom')

# Show the first ten training inputs.
print(train_x[:10])

In [None]:
# Render the nbdev-generated API documentation for seed_wildcard_hierarchy.
show_doc(seed_wildcard_hierarchy)

In [None]:
# `linkages` is not among the names this notebook imports explicitly, so bring it
# in here instead of relying on a wildcard import to leak it into the namespace
# (which would raise NameError on a fresh kernel if none of them does).
# The import is kept in this cell, which is not tagged for export, so the
# exported module is unchanged.
from glycowork.glycan_data.loader import linkages

# Kingdom-level split with wildcard seeding enabled: entries of `linkages` are
# passed as the wildcard list and 'bond' as the wildcard name.
(train_x, val_x, train_y, val_y,
 id_val, class_list, class_converter) = hierarchy_filter(df_species,
                                                         rank = 'Kingdom',
                                                         wildcard_seed = True,
                                                         wildcard_list = linkages,
                                                         wildcard_name = 'bond')

# Show the last ten training inputs.
print(train_x[-10:])

In [None]:
# Render the nbdev-generated API documentation for general_split.
show_doc(general_split)

In [None]:
# Build the two positional arguments for general_split from the species table:
# the `target` column as inputs and the `Species` column as labels.
glycan_inputs = df_species.target.values.tolist()
species_labels = df_species.Species.values.tolist()

train_x, val_x, train_y, val_y = general_split(glycan_inputs, species_labels)

# Show the first ten training inputs.
print(train_x[:10])

In [None]:
#hide
# Run nbdev's notebook2script so cells tagged for export are written out as library code.
from nbdev.export import notebook2script; notebook2script()