In [2]:
!pip install datasets evaluate accelerate rjieba



In [3]:
import pandas as pd
import os
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, RoFormerForSequenceClassification, RoFormerTokenizer
import torch
import numpy as np
import pickle
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from datasets import Dataset
from evaluate import load

In [3]:
#max_len = 157

In [4]:
def seed_everything(seed: int) -> None:
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and switches cuDNN
    to deterministic behaviour.

    Args:
        seed: Value used to seed every generator.
    """
    import os
    import random

    import numpy as np
    import torch

    random.seed(seed)
    # Only affects interpreters started after this point (e.g. subprocesses);
    # the hash seed of the running kernel was fixed when it started.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner can select different kernels from run to run, so it
    # must be switched off for reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [5]:
!git clone https://github.com/adaptyvbio/lemanic_2024.git

fatal: destination path 'lemanic_2024' already exists and is not an empty directory.


In [6]:
# Load the literature and experiment splits from the cloned repository.
root = "lemanic_2024/data"
path_exp_train = f"{root}/experiment_train.csv"
path_exp_test = f"{root}/experiment_test.csv"
path_lit_train = f"{root}/literature_train.csv"
path_lit_test = f"{root}/literature_test.csv"

# Read each CSV and drop the rows that have no VHorVHH sequence.
experiment_train_df, experiment_test_df, literature_train_df, literature_test_df = (
    pd.read_csv(csv_path).dropna(subset="VHorVHH")
    for csv_path in (path_exp_train, path_exp_test, path_lit_train, path_lit_test)
)

# Number the remaining rows of every split: "id1", "id2", ... (restarts per split).
for split_df in (experiment_train_df, experiment_test_df, literature_train_df, literature_test_df):
    split_df['ID'] = [f"id{row_number}" for row_number in range(1, len(split_df) + 1)]

In [7]:
# Pull the VHorVHH sequence column out of each split as a plain Python list.
expe_train_Hseqs, expe_test_Hseqs, lit_train_Hseqs, lit_test_Hseqs = (
    list(split_df["VHorVHH"].values)
    for split_df in (experiment_train_df, experiment_test_df, literature_train_df, literature_test_df)
)

def transform_string(s):
    """Return ``s`` with a single space between consecutive characters.

    Example: ``"QVQ"`` becomes ``"Q V Q"``.
    """
    # presumably so the tokenizer sees one residue per token — TODO confirm
    spaced = " ".join(s)
    return spaced

# Space-separate the characters of every sequence in each split.
expe_train_Hseqs = list(map(transform_string, expe_train_Hseqs))
expe_test_Hseqs = list(map(transform_string, expe_test_Hseqs))
lit_train_Hseqs = list(map(transform_string, lit_train_Hseqs))
lit_test_Hseqs = list(map(transform_string, lit_test_Hseqs))

# Labels: the "Binds" column of each split, cast to int.
y_lit_test, y_lit_train, y_exp_test, y_exp_train = (
    list(np.asarray(split_df["Binds"].values).astype(int))
    for split_df in (literature_test_df, literature_train_df, experiment_test_df, experiment_train_df)
)

In [8]:
# Tokenizer plus one two-class classifier per data source.
# Both classifiers are loaded independently from the same pretrained checkpoint.
checkpoint = 'alchemab/antiberta2'

tokenizer = RoFormerTokenizer.from_pretrained(checkpoint)
model_exp, model_lit = (
    RoFormerForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    for _ in range(2)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:   0%|          | 0.00/811M [00:00<?, ?B/s]

Some weights of RoFormerForSequenceClassification were not initialized from the model checkpoint at alchemab/antiberta2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RoFormerForSequenceClassification were not initialized from the model checkpoint at alchemab/antiberta2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Tokenize every split; no padding or truncation arguments are passed here.
expe_train_Hseqs_tokenized, expe_test_Hseqs_tokenized = map(
    tokenizer, (expe_train_Hseqs, expe_test_Hseqs)
)

lit_train_Hseqs_tokenized, lit_test_Hseqs_tokenized = map(
    tokenizer, (lit_train_Hseqs, lit_test_Hseqs)
)

In [10]:
# Wrap each tokenized split in a Dataset and attach its labels as the "labels" column.
expe_train_Hseqs_dataset, expe_test_Hseqs_dataset = (
    Dataset.from_dict(encodings).add_column("labels", labels)
    for encodings, labels in (
        (expe_train_Hseqs_tokenized, y_exp_train),
        (expe_test_Hseqs_tokenized, y_exp_test),
    )
)

lit_train_Hseqs_dataset, lit_test_Hseqs_dataset = (
    Dataset.from_dict(encodings).add_column("labels", labels)
    for encodings, labels in (
        (lit_train_Hseqs_tokenized, y_lit_train),
        (lit_test_Hseqs_tokenized, y_lit_test),
    )
)

In [11]:
# Training configuration for the two fine-tuning runs.
batch_size = 8

# Settings shared by both runs: evaluate and checkpoint once per epoch, rank checkpoints by F1.
common_training_kwargs = dict(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    metric_for_best_model="f1",
    push_to_hub=False,
    remove_unused_columns=False,
)

# Experiment run: the best checkpoint is reloaded when training ends.
args_exp = TrainingArguments(
    "AbLang-finetuned-exp",
    load_best_model_at_end=True,
    **common_training_kwargs,
)

# Literature run: identical except that the best checkpoint is NOT reloaded.
# NOTE(review): this differs from args_exp — confirm it is intentional.
args_lit = TrainingArguments(
    "AbLang-finetuned-lit",
    load_best_model_at_end=False,
    **common_training_kwargs,
)

In [12]:
# Metrics: load the "f1" metric through evaluate's load(); compute_metrics below
# calls metric.compute on the arg-max predictions.
metric = load("f1")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is reduced
            with an arg-max over axis 1 before scoring.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)


In [13]:
# Build one Trainer per data source.
# The models are passed as loaded: Trainer moves the model to the device chosen by
# its TrainingArguments, so this cell also runs on a CPU-only runtime instead of
# failing on a hard-coded .to("cuda").
trainer_exp = Trainer(
    model=model_exp,
    args=args_exp,
    train_dataset=expe_train_Hseqs_dataset,
    eval_dataset=expe_test_Hseqs_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer_lit = Trainer(
    model=model_lit,
    args=args_lit,
    train_dataset=lit_train_Hseqs_dataset,
    eval_dataset=lit_test_Hseqs_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [14]:
# Fine-tune the experiment classifier for the 5 epochs set in args_exp; the experiment
# test split is evaluated after every epoch (evaluation_strategy="epoch").
trainer_exp.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,0.988116,0.0
2,No log,0.646842,0.131579
3,No log,0.895669,0.263736
4,0.169700,1.179036,0.102564
5,0.169700,1.164823,0.102564


TrainOutput(global_step=750, training_loss=0.12545919545491535, metrics={'train_runtime': 242.4429, 'train_samples_per_second': 24.707, 'train_steps_per_second': 3.094, 'total_flos': 954515817023616.0, 'train_loss': 0.12545919545491535, 'epoch': 5.0})

In [15]:
# Fine-tune the literature classifier for the 5 epochs set in args_lit; the literature
# test split is evaluated after every epoch. args_lit sets load_best_model_at_end=False.
trainer_lit.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,1.917929,0.278302
2,0.394900,1.874437,0.31339
3,0.394900,3.23807,0.32106
4,0.206500,3.780032,0.333333
5,0.101300,3.837049,0.322086


TrainOutput(global_step=1500, training_loss=0.23421121724446614, metrics={'train_runtime': 478.2175, 'train_samples_per_second': 25.02, 'train_steps_per_second': 3.137, 'total_flos': 1907146300964940.0, 'train_loss': 0.23421121724446614, 'epoch': 5.0})

### Only using CDRH3

In [4]:
# Reload the literature and experiment splits, this time filtering on the CDRH3 column.
root = "lemanic_2024/data"
path_exp_train = f"{root}/experiment_train.csv"
path_exp_test = f"{root}/experiment_test.csv"
path_lit_train = f"{root}/literature_train.csv"
path_lit_test = f"{root}/literature_test.csv"

# Read each CSV and drop the rows that have no CDRH3 sequence.
experiment_train_df, experiment_test_df, literature_train_df, literature_test_df = (
    pd.read_csv(csv_path).dropna(subset="CDRH3")
    for csv_path in (path_exp_train, path_exp_test, path_lit_train, path_lit_test)
)

# Number the remaining rows of every split: "id1", "id2", ... (restarts per split).
for split_df in (experiment_train_df, experiment_test_df, literature_train_df, literature_test_df):
    split_df['ID'] = [f"id{row_number}" for row_number in range(1, len(split_df) + 1)]

In [5]:
# Pull the CDRH3 column out of each split as a plain Python list.
# (The *_Hseqs names are kept although they now hold CDRH3 strings.)
expe_train_Hseqs, expe_test_Hseqs, lit_train_Hseqs, lit_test_Hseqs = (
    list(split_df["CDRH3"].values)
    for split_df in (experiment_train_df, experiment_test_df, literature_train_df, literature_test_df)
)

def transform_string(s):
    """Return ``s`` with a single space between consecutive characters.

    Example: ``"ARD"`` becomes ``"A R D"``.
    """
    # presumably so the tokenizer sees one residue per token — TODO confirm
    spaced = " ".join(s)
    return spaced

# Space-separate the characters of every sequence in each split.
expe_train_Hseqs = list(map(transform_string, expe_train_Hseqs))
expe_test_Hseqs = list(map(transform_string, expe_test_Hseqs))
lit_train_Hseqs = list(map(transform_string, lit_train_Hseqs))
lit_test_Hseqs = list(map(transform_string, lit_test_Hseqs))

# Labels: the "Binds" column of each split, cast to int.
y_lit_test, y_lit_train, y_exp_test, y_exp_train = (
    list(np.asarray(split_df["Binds"].values).astype(int))
    for split_df in (literature_test_df, literature_train_df, experiment_test_df, experiment_train_df)
)

In [6]:
# NOTE(review): this cell repeats the work of the preceding cell. Everything is
# rebuilt from the DataFrames, so running it again yields the same values.

# CDRH3 strings of each split as plain Python lists.
expe_train_Hseqs, expe_test_Hseqs, lit_train_Hseqs, lit_test_Hseqs = (
    list(split_df["CDRH3"].values)
    for split_df in (experiment_train_df, experiment_test_df, literature_train_df, literature_test_df)
)

def transform_string(s):
    """Return ``s`` with a single space between consecutive characters."""
    spaced = " ".join(s)
    return spaced

# Space-separate the characters of every sequence in each split.
expe_train_Hseqs = list(map(transform_string, expe_train_Hseqs))
expe_test_Hseqs = list(map(transform_string, expe_test_Hseqs))
lit_train_Hseqs = list(map(transform_string, lit_train_Hseqs))
lit_test_Hseqs = list(map(transform_string, lit_test_Hseqs))

# Labels: the "Binds" column of each split, cast to int.
y_lit_test, y_lit_train, y_exp_test, y_exp_train = (
    list(np.asarray(split_df["Binds"].values).astype(int))
    for split_df in (literature_test_df, literature_train_df, experiment_test_df, experiment_train_df)
)

In [7]:
# Tokenizer plus a fresh pair of two-class classifiers for the CDRH3 runs.
# Both classifiers are loaded independently from the same pretrained checkpoint.
checkpoint = 'alchemab/antiberta2'

tokenizer = RoFormerTokenizer.from_pretrained(checkpoint)
model_exp, model_lit = (
    RoFormerForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    for _ in range(2)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RoFormerForSequenceClassification were not initialized from the model checkpoint at alchemab/antiberta2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RoFormerForSequenceClassification were not initialized from the model checkpoint at alchemab/antiberta2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_pro

In [8]:
# Tokenize every split; no padding or truncation arguments are passed here.
expe_train_Hseqs_tokenized, expe_test_Hseqs_tokenized = map(
    tokenizer, (expe_train_Hseqs, expe_test_Hseqs)
)

lit_train_Hseqs_tokenized, lit_test_Hseqs_tokenized = map(
    tokenizer, (lit_train_Hseqs, lit_test_Hseqs)
)

In [9]:
# Wrap each tokenized split in a Dataset and attach its labels as the "labels" column.
expe_train_Hseqs_dataset, expe_test_Hseqs_dataset = (
    Dataset.from_dict(encodings).add_column("labels", labels)
    for encodings, labels in (
        (expe_train_Hseqs_tokenized, y_exp_train),
        (expe_test_Hseqs_tokenized, y_exp_test),
    )
)

lit_train_Hseqs_dataset, lit_test_Hseqs_dataset = (
    Dataset.from_dict(encodings).add_column("labels", labels)
    for encodings, labels in (
        (lit_train_Hseqs_tokenized, y_lit_train),
        (lit_test_Hseqs_tokenized, y_lit_test),
    )
)

In [11]:
# Training configuration for the two CDRH3 fine-tuning runs.
batch_size = 8

# Settings shared by both runs: evaluate and checkpoint once per epoch, rank checkpoints by F1.
common_training_kwargs = dict(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    metric_for_best_model="f1",
    push_to_hub=False,
    remove_unused_columns=False,
)

# Experiment run: the best checkpoint is reloaded when training ends.
args_exp = TrainingArguments(
    "AbLang-finetuned-CDRH3-exp",
    load_best_model_at_end=True,
    **common_training_kwargs,
)

# Literature run: identical except that the best checkpoint is NOT reloaded.
# NOTE(review): this differs from args_exp — confirm it is intentional.
args_lit = TrainingArguments(
    "AbLang-finetuned-CDRH3-lit",
    load_best_model_at_end=False,
    **common_training_kwargs,
)

In [12]:
# Metrics: load the "f1" metric through evaluate's load(); compute_metrics below
# calls metric.compute on the arg-max predictions.
metric = load("f1")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is reduced
            with an arg-max over axis 1 before scoring.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)


In [13]:
# Build one Trainer per data source for the CDRH3 runs.
# The models are passed as loaded: Trainer moves the model to the device chosen by
# its TrainingArguments, so this cell also runs on a CPU-only runtime instead of
# failing on a hard-coded .to("cuda").
trainer_exp = Trainer(
    model=model_exp,
    args=args_exp,
    train_dataset=expe_train_Hseqs_dataset,
    eval_dataset=expe_test_Hseqs_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer_lit = Trainer(
    model=model_lit,
    args=args_lit,
    train_dataset=lit_train_Hseqs_dataset,
    eval_dataset=lit_test_Hseqs_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [14]:
# Fine-tune the CDRH3 experiment classifier for the 5 epochs set in args_exp; the
# experiment test split is evaluated after every epoch (evaluation_strategy="epoch").
trainer_exp.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,1.078328,0.0
2,No log,0.711633,0.0
3,No log,0.791581,0.027778
4,0.242400,1.302083,0.025641
5,0.242400,1.469353,0.027027


TrainOutput(global_step=750, training_loss=0.18258979670206707, metrics={'train_runtime': 131.0243, 'train_samples_per_second': 45.717, 'train_steps_per_second': 5.724, 'total_flos': 174668634382248.0, 'train_loss': 0.18258979670206707, 'epoch': 5.0})

In [15]:
# Fine-tune the CDRH3 literature classifier for the 5 epochs set in args_lit; the literature
# test split is evaluated after every epoch. args_lit sets load_best_model_at_end=False.
trainer_lit.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,1.913672,0.276596
2,0.446700,1.776907,0.282178
3,0.446700,2.144703,0.282421
4,0.301100,3.259022,0.29453
5,0.149500,3.623622,0.293103


TrainOutput(global_step=1500, training_loss=0.29911684163411456, metrics={'train_runtime': 235.5657, 'train_samples_per_second': 50.793, 'train_steps_per_second': 6.368, 'total_flos': 346666684121172.0, 'train_loss': 0.29911684163411456, 'epoch': 5.0})