In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.utils import load_config, seed_everything

In [2]:
# Reproducibility: seed through the project's helper
seed_everything(42)

# Project settings (paths, plotting parameters, dataset/model/training options)
config = load_config("../configs/config.yaml")

# Seaborn theme: style, palette and font scale all come from the config
sns.set(**{key: config['sns_params'][key] for key in ('style', 'palette', 'font_scale')})

# Matplotlib defaults are taken from the config as well
plt.rcParams.update(config['plt_params'])

### First look at the data

In [3]:
# Per-glycan table; several columns hold Python lists (see the preview below).
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
df_glycan = pd.read_pickle(config['paths']['df_glycan_path'])
df_glycan.head()

Unnamed: 0,glycan,Species,Genus,Family,Order,Class,Phylum,Kingdom,Domain,ref,...,disease_id,disease_sample,disease_direction,disease_ref,disease_species,tissue_sample,tissue_id,tissue_ref,tissue_species,Composition
0,Gal(b1-4)Glc-ol,"[Acinonyx_jubatus, Addax_nasomaculatus, Aepyce...","[Acinonyx, Addax, Aepyceros, Ailuropoda, Alcel...","[Felidae, Bovidae, Bovidae, Ursidae, Bovidae, ...","[Carnivora, Artiodactyla, Artiodactyla, Carniv...","[Mammalia, Mammalia, Mammalia, Mammalia, Mamma...","[Chordata, Chordata, Chordata, Chordata, Chord...","[Animalia, Animalia, Animalia, Animalia, Anima...","[Eukarya, Eukarya, Eukarya, Eukarya, Eukarya, ...","[https://pubmed.ncbi.nlm.nih.gov/31828568/, ht...",...,[],[],[],[],[],"[milk, milk, milk, milk, milk, milk, milk, mil...","[UBERON:0001913, UBERON:0001913, UBERON:000191...","[https://pubmed.ncbi.nlm.nih.gov/31828568/, ht...","[Acinonyx_jubatus, Addax_nasomaculatus, Ailuro...",{'Hex': 2}
1,Neu5Ac(a2-3)Gal(b1-4)Glc1Cer,"[Alces_alces, Balaenoptera_acutorostrata, Bos_...","[Alces, Balaenoptera, Bos, Bubalus, Campylobac...","[Cervidae, Balaenopteridae, Bovidae, Bovidae, ...","[Artiodactyla, Artiodactyla, Artiodactyla, Art...","[Mammalia, Mammalia, Mammalia, Mammalia, Epsil...","[Chordata, Chordata, Chordata, Chordata, Prote...","[Animalia, Animalia, Animalia, Animalia, Bacte...","[Eukarya, Eukarya, Eukarya, Eukarya, Bacteria,...","[https://pubmed.ncbi.nlm.nih.gov/26104834/, ht...",...,"[DOID:83, DOID:5409, DOID:1909]","[lens, tumor, skin]","[up, up, up]","[https://pubmed.ncbi.nlm.nih.gov/7905480/, htt...","[Homo_sapiens, Homo_sapiens, Cricetulus_griseus]","[A549_cell_line, AML_193_cell_line, CHOK1_cell...","[cellosaurus:CVCL_0023, cellosaurus:CVCL_1071,...","[https://pubmed.ncbi.nlm.nih.gov/23345451/, ht...","[Homo_sapiens, Homo_sapiens, Homo_sapiens, Hom...","{'Neu5Ac': 1, 'Hex': 2}"
2,Glc1Cer,"[Acaudina_molpadioides, Acholeplasma_axanthum,...","[Acaudina, Acholeplasma, Agama, Agama, Agelas,...","[Caudinidae, Acholeplasmataceae, Agamidae, Aga...","[Molpadiida, Acholeplasmatales, Squamata, Squa...","[Holothuroidea, Mollicutes, Reptilia, Reptilia...","[Echinodermata, Firmicutes, Chordata, Chordata...","[Animalia, Bacteria, Animalia, Animalia, Anima...","[Eukarya, Bacteria, Eukarya, Eukarya, Eukarya,...","[https://pubmed.ncbi.nlm.nih.gov/22004409/, ht...",...,[DOID:1909],[skin],[up],[https://pubmed.ncbi.nlm.nih.gov/2582447/],[Cricetulus_griseus],"[COS7_cell_line, HT29_cell_line, M1_cell_line,...","[cellosaurus:CVCL_0224, cellosaurus:CVCL_A8EZ,...","[https://pubmed.ncbi.nlm.nih.gov/20157020/, ht...","[Chlorocebus_sabaeus, Homo_sapiens, Mus_muscul...",{'Hex': 1}
3,Gal(b1-4)GlcNAc(b1-2)Man(a1-3)[Gal(b1-4)GlcNAc...,"[Angiostrongylus_cantonensis, AvianInfluenzaA_...","[Angiostrongylus, Alphainfluenzavirus, Bos, Bo...","[Angiostrongylidae, Orthomyxoviridae, Bovidae,...","[Rhabditida, Articulavirales, Artiodactyla, Ar...","[Chromadorea, Insthoviricetes, Mammalia, Mamma...","[Nematoda, Negarnaviricota, Chordata, Chordata...","[Animalia, Riboviria, Animalia, Animalia, Anim...","[Eukarya, Virus, Eukarya, Eukarya, Eukarya, Eu...","[https://pubmed.ncbi.nlm.nih.gov/26650734/, ht...",...,"[DOID:8778, DOID:11729, DOID:9256, DOID:008052...","[serum, serum, tumor, serum, serum, serum, ser...","[down, up, down, down, down, down, down, down,...","[https://pubmed.ncbi.nlm.nih.gov/34643622/, ht...","[Homo_sapiens, Homo_sapiens, Homo_sapiens, Hom...","[2A3_cell_line, A9_fibroblast_cell_line, BMMC_...","[cellosaurus:CVCL_0D71, cellosaurus:CVCL_3984,...","[https://pubmed.ncbi.nlm.nih.gov/36289103/, ht...","[Homo_sapiens, Mus_musculus, Homo_sapiens, Hom...","{'Hex': 5, 'HexNAc': 4, 'dHex': 1}"
4,Man(a1-2)Man(a1-3)[Man(a1-3)[Man(a1-6)]Man(a1-...,"[Adeno-associated_dependoparvovirusA, Angiostr...","[Dependoparvovirus, Angiostrongylus, Arabidops...","[Parvoviridae, Angiostrongylidae, Brassicaceae...","[Piccovirales, Rhabditida, Brassicales, Lepido...","[Quintoviricetes, Chromadorea, Dicotyledons, I...","[Cossaviricota, Nematoda, Angiosperms, Arthrop...","[Shotokuvirae, Animalia, Plantae, Animalia, An...","[Virus, Eukarya, Eukarya, Eukarya, Eukarya, Eu...","[https://pubmed.ncbi.nlm.nih.gov/37774344/, ht...",...,"[DOID:9965, SYMP:0000633, DOID:3908, DOID:3969...","[serum, plasma, serum, serum, serum, urine, , ...","[up, down, down, up, up, up, , , , ]","[https://pubmed.ncbi.nlm.nih.gov/32123198/, ht...","[Mus_musculus, Homo_sapiens, Homo_sapiens, Hom...","[2A3_cell_line, B_cell, CHOK1_cell_line, CHOS_...","[cellosaurus:CVCL_0D71, CL:0000236, cellosauru...","[https://pubmed.ncbi.nlm.nih.gov/36289103/, ht...","[Homo_sapiens, Homo_sapiens, Cricetulus_griseu...","{'Hex': 6, 'HexNAc': 2}"


In [4]:
# Species table: a glycan plus taxonomy columns (Species ... Domain) and a
# reference URL per row, as shown in the preview below.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
df_species = pd.read_pickle(config['paths']['df_species_path'])
df_species.head()

Unnamed: 0,glycan,Species,Genus,Family,Order,Class,Phylum,Kingdom,Domain,ref
0,Gal(a1-2)[Rha3Me(a1-3)][Xyl(b1-4)]Fuc(a1-3)[Xy...,ATCV-1,Chlorovirus,Phycodnaviridae,Algavirales,Megaviricetes,Nucleocytoviricota,Bamfordvirae,Virus,https://pubmed.ncbi.nlm.nih.gov/26582281/
1,Gal(a1-2)[Rha3Me(a1-3)][Xyl(b1-4)]Fuc(a1-3)[Xy...,ATCV-1,Chlorovirus,Phycodnaviridae,Algavirales,Megaviricetes,Nucleocytoviricota,Bamfordvirae,Virus,https://pubmed.ncbi.nlm.nih.gov/26582281/
2,Gal(a1-2)[Rha3Me(a1-3)][Xyl4Me(b1-4)]Fuc(a1-3)...,ATCV-1,Chlorovirus,Phycodnaviridae,Algavirales,Megaviricetes,Nucleocytoviricota,Bamfordvirae,Virus,https://pubmed.ncbi.nlm.nih.gov/26582281/
3,Gal(a1-2)[Rha3Me(a1-3)][Xyl4Me(b1-4)]Fuc(a1-3)...,ATCV-1,Chlorovirus,Phycodnaviridae,Algavirales,Megaviricetes,Nucleocytoviricota,Bamfordvirae,Virus,https://pubmed.ncbi.nlm.nih.gov/26582281/
4,GalA(a1-2)Rha(a1-4)GalA,Abelmoschus_esculentus,Abelmoschus,Malvaceae,Malvales,Dicotyledons,Angiosperms,Plantae,Eukarya,


In [5]:
# Load the glycan-binding table. Per the output below it is wide and mostly
# NaN in the preview, and ends with `target` and `protein` columns.
# NOTE(review): `target` appears to hold protein sequences - confirm.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
df_bind = pd.read_pickle(config['paths']['glycan_binding_path'])
df_bind

Unnamed: 0,3-Anhydro-Gal(a1-3)Gal(b1-4)3-Anhydro-Gal(a1-3)Gal4S,3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S,3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S,3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S,3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal2S(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S,3dGal(b1-3)[Fuc(a1-4)]Glc,3dGal(b1-4)Glc,4d8dNeu5Ac(a2-3)Gal(b1-4)Glc,4dNeu5Ac(a2-3)Gal(b1-4)Glc,7dNeu5Ac(a2-3)Gal(b1-4)Glc,...,wwwSflexneri5c,wwwSflexneriO2c,wwwSflexneriO5c,wwwSisomicin,wwwSmix,wwwTobramycin,wwwTyrS,wwwpHGGs,target,protein
0,,,,,,,,,,,...,,,,,,,,,AADSIPSISPTGIITPTPTQSGMVSNCNKFYDVHSNDGCSAIASSQ...,TAL6-4LysM
1,,,,,,,,,,,...,,,,,,,,,AAFFSLVVLLALLPFGIHASALPSTELTPRVNPNLPGPNDVFVGFR...,rCnSL-proA
2,,,,,,,,,,,...,,,,,,,,,AANEADYQAKLTAYQTELARVQKANADAKAAYEAAVAANNAANAAL...,AntigenI/IIA3VP1
3,,,,,,,,,,,...,,,,,,,,,AASKLGVPQPAQRDQVNCQLYAVQPNDNCIDISSKNNITYAQLLSW...,TAL6-6LysM
4,,,,,,,,,,,...,,,,,,,,,ACNNEWEDEQYEQYISFKSPIPAGGEGVTDIYVRYKEDGKVTYRLP...,SP15308A-bot-339-19-339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1460,,,,,,,,,,,...,,,,,,,,,YGAPAGPLIVPYNLPLPGGVVPRMLITILGTVKPNANRIALDFQRG...,Gal_3C
1461,,,,,,,,,,,...,,,,,,,,,YIGDFRCIQLVNSNGANVSAPSISTETVEVSQGLGTYYVLDRVYLN...,cov/mhv/MERS
1462,,,,,,,,,,,...,,,,,,,,,YKLVCYFTQWSQDRQEPGKFTPENIDPFLCSHLIYSFASIENNKVI...,chitinase 3-like protein 2
1463,,,,,,,,,,,...,,,,,,,,,YRRTCSHTGKGEGWYIIRRGDNFNAVAADFCTSTNVLTEWNHISTI...,Vd2LysM


In [6]:
# Load the glycan list from CSV (columns: glycan, Composition, tissue_species,
# tissue_sample). No converters are passed, so the dict/list-looking columns
# are read as plain strings here.
df_glycan_list = pd.read_csv(config['paths']['glycan_list_path'])
df_glycan_list

Unnamed: 0,glycan,Composition,tissue_species,tissue_sample
0,Fuc(a1-?)GlcNAc(b1-2)Man(a1-6)[GlcNAc(b1-2)Man...,"{'dHex': 2, 'HexNAc': 4, 'Hex': 3}",['Homo_sapiens'],['blood']
1,Neu5Ac(a2-?)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)[Glc...,"{'Neu5Ac': 1, 'Hex': 4, 'HexNAc': 4, 'dHex': 1}",['Homo_sapiens'],['blood']
2,Neu5Ac(a2-6)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)[Gal...,"{'Neu5Ac': 1, 'Hex': 5, 'HexNAc': 4}",['Homo_sapiens'],['blood']
3,Neu5Ac(a2-6)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)[Glc...,"{'Neu5Ac': 1, 'Hex': 4, 'HexNAc': 4}",['Homo_sapiens'],['blood']
4,Fuc(a1-2)[GalNAc(a1-3)]Gal(b1-4)GlcNAc(b1-2)Ma...,"{'dHex': 1, 'HexNAc': 5, 'Hex': 5}",['Homo_sapiens'],['blood']


In [7]:
# N-glycan table; the preview below shows the same kind of columns as the
# glycan table plus a `Structure_Type` column.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
df_Nglycan = pd.read_pickle(config['paths']['N_glycans_df_path'])
df_Nglycan.head()

Unnamed: 0,glycan,Species,Genus,Family,Order,Class,Phylum,Kingdom,Domain,ref,...,disease_sample,disease_direction,disease_ref,disease_species,tissue_sample,tissue_id,tissue_ref,tissue_species,Composition,Structure_Type
1,Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)[Gal...,"[Cricetulus_griseus, Homo_sapiens, Mus_musculu...","[Cricetulus, Homo, Mus, Ovis, Rattus]","[Cricetidae, Hominidae, Muridae, Bovidae, Muri...","[Rodentia, Primates, Rodentia, Artiodactyla, R...","[Mammalia, Mammalia, Mammalia, Mammalia, Mamma...","[Chordata, Chordata, Chordata, Chordata, Chord...","[Animalia, Animalia, Animalia, Animalia, Anima...","[Eukarya, Eukarya, Eukarya, Eukarya, Eukarya]","[, , https://unicarb-dr.glycosmos.org/referenc...",...,[],[],[],[],"[2A3_cell_line, AML_193_cell_line, Cal-27_cell...","[cellosaurus:CVCL_0D71, cellosaurus:CVCL_1071,...","[https://pubmed.ncbi.nlm.nih.gov/36289103/, ht...","[Homo_sapiens, Homo_sapiens, Homo_sapiens, Hom...","{'Neu5Ac': 1, 'Hex': 5, 'HexNAc': 4}",Complex_Gal
2,Gal(a1-3)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)[Gal(b1...,"[Bos_taurus, Ginglymostoma_cirratum, Mus_muscu...","[Bos, Ginglymostoma, Mus, Sus]","[Bovidae, Ginglymostomatidae, Muridae, Suidae]","[Artiodactyla, Orectolobiformes, Rodentia, Art...","[Mammalia, Chondrichthyes, Mammalia, Mammalia]","[Chordata, Chordata, Chordata, Chordata]","[Animalia, Animalia, Animalia, Animalia]","[Eukarya, Eukarya, Eukarya, Eukarya]","[, https://pubmed.ncbi.nlm.nih.gov/19156518/, ...",...,[],[],[],[],[],[],[],[],"{'Hex': 6, 'HexNAc': 4}",Complex_Gal
3,Neu5Ac(a2-?)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)[Gal...,"[Notamacropus_eugenii, Homo_sapiens, Homo_sapi...","[Notamacropus, Homo, Homo]","[Macropodidae, Hominidae, Hominidae]","[Diprotodontia, Primates, Primates]","[Mammalia, Mammalia, Mammalia]","[Chordata, Chordata, Chordata]","[Animalia, Animalia, Animalia]","[Eukarya, Eukarya, Eukarya]","[https://pubmed.ncbi.nlm.nih.gov/23053637/, ht...",...,"[, ]","[, ]","[, ]","[, ]","[urine, urine]","[UBERON:0001088, UBERON:0001088]","[https://pubmed.ncbi.nlm.nih.gov/33650863/, ht...","[Homo_sapiens, Homo_sapiens]","{'Neu5Ac': 1, 'Hex': 5, 'HexNAc': 4}",Complex_Gal
4,Fuc(a1-2)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)[Gal(b1...,[Homo_sapiens],[Homo],[Hominidae],[Primates],[Mammalia],[Chordata],[Animalia],[Eukarya],[],...,[],[],[],[],[plasma],[ENVO:01000798],[https://pubmed.ncbi.nlm.nih.gov/1577715/],[Homo_sapiens],"{'dHex': 1, 'Hex': 5, 'HexNAc': 4}",Complex_Gal
5,Fuc(a1-2)[Gal(a1-3)]Gal(b1-4)GlcNAc(b1-2)Man(a...,[Homo_sapiens],[Homo],[Hominidae],[Primates],[Mammalia],[Chordata],[Animalia],[Eukarya],[],...,[],[],[],[],[plasma],[ENVO:01000798],[https://pubmed.ncbi.nlm.nih.gov/1577715/],[Homo_sapiens],"{'dHex': 1, 'Hex': 6, 'HexNAc': 4}",Complex_Gal


In [23]:
import numpy as np

# Character length of every glycan sequence string, in row order
all_len = list(map(len, df_glycan['glycan']))
# Length of the longest sequence
max(all_len)

1161

In [24]:
# Position (0-based, within all_len) of the longest sequence
np.argmax(all_len)

np.int64(4399)

In [32]:
# 90th percentile of the sequence lengths (in characters)
np.quantile(all_len, 0.9)

np.float64(173.0)

In [25]:
# Show the longest glycan sequence. np.argmax returns a *position*, so the row
# is selected with .iloc instead of a hard-coded index label copied from an
# earlier output (a label lookup can return a different row if the index is
# not a default 0..n-1 range).
df_glycan['glycan'].iloc[int(np.argmax(all_len))]

'Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf'

In [13]:
from transformers import BertTokenizer
# Load a BertTokenizer from the local directory ../results/misc/tokenizer/
tokenizer = BertTokenizer.from_pretrained('../results/misc/tokenizer/')

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/envs/glycan/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/envs/glycan/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/envs/glycan/lib/python3.11/site-packages/ipykernel/kernelapp.py", l

In [14]:
# Index label of the longest glycan sequence, derived from the data instead of
# being hard-coded. np.argmax gives a position; it is mapped to its index label
# so that the label-based lookups `df_glycan['glycan'][i]` used in the
# following cells select the same row.
i = df_glycan.index[int(np.argmax(all_len))]
df_glycan['glycan'].loc[i]

'Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf(b1-5)Galf(b1-6)Galf'

In [15]:
# Split the selected sequence (row i) into tokens; per the output below the
# tokens are lower-cased and no special tokens are added at this stage.
tokens = tokenizer.tokenize(df_glycan['glycan'][i])
tokens

['galf',
 '(',
 'b1',
 '-',
 '6',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '5',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '6',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '5',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '6',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '5',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '6',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '5',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '6',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '5',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '6',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '5',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '6',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '5',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '6',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '5',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '6',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '5',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '6',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '5',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '6',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '5',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '6',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '5',
 ')',
 'galf',
 '(',
 'b1',
 '-',
 '6',
 ')',


In [16]:
# Map each token to its vocabulary id
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids

[122,
 5,
 81,
 8,
 15,
 6,
 122,
 5,
 81,
 8,
 14,
 6,
 122,
 5,
 81,
 8,
 15,
 6,
 122,
 5,
 81,
 8,
 14,
 6,
 122,
 5,
 81,
 8,
 15,
 6,
 122,
 5,
 81,
 8,
 14,
 6,
 122,
 5,
 81,
 8,
 15,
 6,
 122,
 5,
 81,
 8,
 14,
 6,
 122,
 5,
 81,
 8,
 15,
 6,
 122,
 5,
 81,
 8,
 14,
 6,
 122,
 5,
 81,
 8,
 15,
 6,
 122,
 5,
 81,
 8,
 14,
 6,
 122,
 5,
 81,
 8,
 15,
 6,
 122,
 5,
 81,
 8,
 14,
 6,
 122,
 5,
 81,
 8,
 15,
 6,
 122,
 5,
 81,
 8,
 14,
 6,
 122,
 5,
 81,
 8,
 15,
 6,
 122,
 5,
 81,
 8,
 14,
 6,
 122,
 5,
 81,
 8,
 15,
 6,
 122,
 5,
 81,
 8,
 14,
 6,
 122,
 5,
 81,
 8,
 15,
 6,
 122,
 5,
 81,
 8,
 14,
 6,
 122,
 5,
 81,
 8,
 15,
 6,
 122,
 5,
 81,
 8,
 14,
 6,
 122,
 5,
 81,
 8,
 15,
 6,
 122,
 5,
 81,
 8,
 14,
 6,
 122,
 5,
 81,
 8,
 15,
 6,
 122,
 5,
 81,
 8,
 14,
 6,
 122,
 5,
 81,
 8,
 15,
 6,
 122]

In [19]:
# Full encoding of the same sequence by calling the tokenizer directly. Per the
# output, input_ids are wrapped in [CLS] (id 2) and [SEP] (id 3) and
# token_type_ids are returned alongside.
encoded_input = tokenizer(df_glycan['glycan'][i])
encoded_input

{'input_ids': [2, 122, 5, 81, 8, 15, 6, 122, 5, 81, 8, 14, 6, 122, 5, 81, 8, 15, 6, 122, 5, 81, 8, 14, 6, 122, 5, 81, 8, 15, 6, 122, 5, 81, 8, 14, 6, 122, 5, 81, 8, 15, 6, 122, 5, 81, 8, 14, 6, 122, 5, 81, 8, 15, 6, 122, 5, 81, 8, 14, 6, 122, 5, 81, 8, 15, 6, 122, 5, 81, 8, 14, 6, 122, 5, 81, 8, 15, 6, 122, 5, 81, 8, 14, 6, 122, 5, 81, 8, 15, 6, 122, 5, 81, 8, 14, 6, 122, 5, 81, 8, 15, 6, 122, 5, 81, 8, 14, 6, 122, 5, 81, 8, 15, 6, 122, 5, 81, 8, 14, 6, 122, 5, 81, 8, 15, 6, 122, 5, 81, 8, 14, 6, 122, 5, 81, 8, 15, 6, 122, 5, 81, 8, 14, 6, 122, 5, 81, 8, 15, 6, 122, 5, 81, 8, 14, 6, 122, 5, 81, 8, 15, 6, 122, 5, 81, 8, 14, 6, 122, 5, 81, 8, 15, 6, 122, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [20]:
# Round-trip check: decode the ids back to text (special tokens included)
tokenizer.decode(encoded_input["input_ids"])

'[CLS] galf ( b1 - 6 ) galf ( b1 - 5 ) galf ( b1 - 6 ) galf ( b1 - 5 ) galf ( b1 - 6 ) galf ( b1 - 5 ) galf ( b1 - 6 ) galf ( b1 - 5 ) galf ( b1 - 6 ) galf ( b1 - 5 ) galf ( b1 - 6 ) galf ( b1 - 5 ) galf ( b1 - 6 ) galf ( b1 - 5 ) galf ( b1 - 6 ) galf ( b1 - 5 ) galf ( b1 - 6 ) galf ( b1 - 5 ) galf ( b1 - 6 ) galf ( b1 - 5 ) galf ( b1 - 6 ) galf ( b1 - 5 ) galf ( b1 - 6 ) galf ( b1 - 5 ) galf ( b1 - 6 ) galf ( b1 - 5 ) galf ( b1 - 6 ) galf ( b1 - 5 ) galf ( b1 - 6 ) galf [SEP]'

In [21]:
# Number of tokens in the selected sequence, excluding [CLS]/[SEP]
len(token_ids)

175

In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, DataCollatorForLanguageModeling, RobertaConfig, RobertaForMaskedLM, Trainer, TrainingArguments

In [33]:
# Write only the glycan sequence column to CSV (without the index); the same
# path is read back with load_dataset in the next cell.
df_glycan['glycan'].to_csv(config['paths']['glycan_seqs'] , index=False)

In [63]:

# Load the exported sequences with the `csv` loader. The result is indexed by
# split name below (tokenized_dataset['train']).
dataset2 = load_dataset('csv', data_files=config['paths']['glycan_seqs'])  


In [64]:

# Load the tokenizer from the local directory (same path as the earlier
# tokenizer-inspection cell, so this rebinds `tokenizer` to an equivalent object)
tokenizer = BertTokenizer.from_pretrained('../results/misc/tokenizer/')

In [65]:
# Batch tokenization helper used with Dataset.map
def tokenize_function(examples):
    """Encode the ``glycan`` column of a batch to fixed-length inputs.

    Uses the notebook-level ``tokenizer``; every sequence is padded or
    truncated to ``config['dataset']['max_length']`` tokens.

    Args:
        examples: mapping with a ``glycan`` entry holding the sequence text(s).

    Returns:
        The tokenizer's output for the batch.
    """
    sequences = examples['glycan']
    target_length = config['dataset']['max_length']
    encoded = tokenizer(
        sequences,
        padding='max_length',
        truncation=True,
        max_length=target_length,
    )
    return encoded

In [66]:
# Run the tokenizer over the dataset in batches; the raw `glycan` text column
# is dropped from the result.
tokenized_dataset = dataset2.map(
    tokenize_function,
    batched=True,
    remove_columns=['glycan'],
)


In [67]:
# Train/validation split; the validation fraction is config['dataset']['val_size'].
# An explicit seed keeps the split identical across reruns, independent of how
# much global random state earlier cells have consumed or of cell execution order.
train_test_split = tokenized_dataset['train'].train_test_split(
    test_size=config['dataset']['val_size'],
    seed=42,  # same value as passed to seed_everything at the top of the notebook
)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

In [68]:
# Masked-language-modelling collator; the masking probability is read from
# the config
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=config['model']['mlm_probability'],
)

In [85]:
# Build the RoBERTa configuration from the `model` section of the project config.
# It is bound to its own name: rebinding `config` here would replace the
# settings dict that later cells still index (config['paths'],
# config['training']) and would make this cell fail when re-run.
model_config = RobertaConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=config['model']['max_position_embeddings'],
    num_attention_heads=config['model']['num_attention_heads'],
    num_hidden_layers=config['model']['num_hidden_layers'],
    hidden_size=config['model']['hidden_size'],
    type_vocab_size=config['model']['type_vocab_size'],
    # RobertaConfig defaults to pad_token_id=1; use the id of this tokenizer's
    # own padding token so the embedding padding index matches the data.
    pad_token_id=tokenizer.pad_token_id,
)

# Initialize the model from the configuration, without pretrained weights
model = RobertaForMaskedLM(config=model_config)
print('Num parameters in the model: ',model.num_parameters())

Num parameters:  901632


In [None]:

# Define the training arguments from the `paths` and `training` config sections.
# NOTE(review): `config` must be the settings dict returned by load_config when
# this cell runs (it is indexed with string keys below).
training_args = TrainingArguments(
    output_dir=config['paths']['model'],
    overwrite_output_dir=True,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    eval_strategy='epoch',
    # NOTE(review): 'batcn_epochs' looks like a typo for 'batch_epochs' or
    # 'num_epochs'; it is kept because it must match the key in config.yaml.
    num_train_epochs=config['training']['batcn_epochs'],
    learning_rate=config['training']['lr'],
    weight_decay=config['training']['wd'],
    per_device_train_batch_size=config['training']['batch_size'],
    per_device_eval_batch_size=config['training']['batch_size'],
    # 'best' saves a checkpoint only when the tracked metric improves, so the
    # metric has to be named explicitly: validation loss, lower is better.
    save_strategy='best',
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    logging_dir=f"{config['paths']['model']}/logs"
)

In [None]:
# Assemble the Trainer from the model, its arguments, the MLM collator and
# the train/validation splits
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

# Run training
trainer.train()