In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from transformers import BertTokenizer, AutoModel
from pathlib import Path
from tqdm import tqdm
import pickle

# Put the parent of the current working directory on the import path so the
# project-local `src` package can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)
from src.utils import load_config, seed_everything, get_embed


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/envs/glycan/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/envs/glycan/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/envs/glycan/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File 

In [2]:
# Seed via the project helper so repeated runs are reproducible.
seed_everything(42)

# Load the project configuration (paths, plotting parameters, ...).
config = load_config("../configs/config.yaml")

# Apply the Seaborn theme from the config.
# `sns.set` is only an alias of `sns.set_theme` and may be removed in a future
# Seaborn release, so call the preferred interface directly.
sns_params = config['sns_params']
sns.set_theme(style=sns_params['style'],
              palette=sns_params['palette'],
              font_scale=sns_params['font_scale'])

# Apply the Matplotlib rcParams overrides from the config. This runs after the
# Seaborn theme so these values take precedence over the theme defaults.
plt.rcParams.update(config['plt_params'])

### Read files

In [3]:
# All input locations come from the `paths` section of the config.
data_paths = config['paths']

# Pickled DataFrames.
df_glycan = pd.read_pickle(data_paths['df_glycan_path'])
df_species = pd.read_pickle(data_paths['df_species_path'])
df_bind = pd.read_pickle(data_paths['glycan_binding_path'])

# The glycan list is the only input stored as CSV.
df_glycan_list = pd.read_csv(data_paths['glycan_list_path'])

df_Nglycan = pd.read_pickle(data_paths['N_glycans_df_path'])


In [4]:
# Collect every unique glycan sequence that appears in any of the loaded tables.
# NOTE(review): every column label of `df_bind` is treated as a glycan sequence;
# confirm that the frame has no non-glycan columns.
#
# The result is sorted so that the processing order is the same on every kernel
# restart. Iterating a plain `set` of strings depends on the per-process hash
# seed, which would make the order vary between runs. `key=str` keeps the sort
# from raising if a non-string value (e.g. NaN) is present in a column.
all_glycan_seqs = sorted(
    set(df_glycan['glycan'].tolist() +
        df_species['glycan'].tolist() +
        df_bind.columns.tolist() +
        df_glycan_list['glycan'].tolist() +
        df_Nglycan['glycan'].tolist()),
    key=str,
)

### Calculate embeddings for all glycans

In [5]:
# Tokenizer location is taken from the config.
tokenizer = BertTokenizer.from_pretrained(config['paths']['tokenizer'])

# Use the local checkpoint when its weights file is present on disk;
# otherwise fetch the model from the Hugging Face hub.
model_path = config['paths']['model'] + 'checkpoint-48070/'
has_local_weights = Path(model_path+'model.safetensors').is_file()

if not has_local_weights:
    print('loading model from hugging face')
    model = AutoModel.from_pretrained('AliSaadatV/GlycoFormer')
else:
    print('loading model locally')
    model = AutoModel.from_pretrained(model_path)

Some weights of BertModel were not initialized from the model checkpoint at ../results/misc/model/checkpoint-48070/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


loading model locally


In [6]:
# Build `all_embeds`: a dict mapping each glycan sequence to its embedding
# (a list of half-precision values), or to the string "NA" when embedding failed.
embeds_path = Path(config['paths']['embeds'])

# Start from the on-disk cache when there is one.
# NOTE(review): `pickle.load` can execute arbitrary code while deserialising;
# only load cache files that were written by this notebook or a trusted source.
if embeds_path.is_file():
    with open(embeds_path, "rb") as file:
        all_embeds = pickle.load(file)
else:
    all_embeds = {}

# Only embed sequences that the cache does not already contain. This keeps the
# cache usable after new glycans are added to the input tables, instead of
# silently returning a stale, incomplete dictionary.
missing_glycan_seqs = [seq for seq in all_glycan_seqs if seq not in all_embeds]

if missing_glycan_seqs:
    # Create the output directory up front so a missing folder cannot discard
    # the results at write time, after the (potentially long) loop below.
    embeds_path.parent.mkdir(parents=True, exist_ok=True)

    for glycan_seq in tqdm(missing_glycan_seqs, desc="Processing glycans"):
        try:
            # presumably `get_embed` returns a torch tensor — TODO confirm in src.utils
            glycan_embed = get_embed(glycan_seq, model, tokenizer)
            all_embeds[glycan_seq] = glycan_embed.squeeze().half().tolist()
        except Exception as e:
            # Best effort: report the failure and record a sentinel so that one
            # bad sequence does not abort the whole run.
            print(f"Error processing {glycan_seq}: {e}")
            all_embeds[glycan_seq] = "NA"

    # Persist the (updated) dictionary so later runs can reuse it.
    with open(embeds_path, "wb") as file:
        pickle.dump(all_embeds, file)

### Visualize

In [None]:
# TODO: visualize the embeddings (placeholder cell; the dictionary is already saved in the previous cell)
