# Masked Language Modeling

This notebook shows how to pre-train your own AntiBERTa model using the HuggingFace framework. As a demo, we've included the tokenizer we used, along with 1% of the sequences from the training, validation, and test sets used in the paper.

## Setup

After running the installation cell below, restart the session. Once the package is installed, you do not need to run that cell again.

In [8]:
!pip install datasets -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/471.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m471.0/471.6 kB[0m [31m19.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fo

In [1]:
# Mount Google Drive at /content/drive so the project folder stored there is reachable from this session.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Switch the working directory to the project folder on Drive; the relative paths
# used by the later cells (./antiberta, ./assets, ...) are resolved from here.
%cd "/content/drive/MyDrive/cibi_prj"

/content/drive/MyDrive/cibi_prj


In [3]:
ls ## list the files in the cibi_prj folder to make sure the dataset zip file is present

[0m[01;34mall_structures[0m/     pair.zip                            [01;34mtest3[0m/
all_structures.zip  [01;34mparatope-prediction-task_0[0m/         [01;34mtest4[0m/
[01;34mantiberta[0m/          [01;34mparatope-prediction-task_0_2[0m/       [01;34mtest5[0m/
[01;34mAVIDa-SARS-CoV-2[0m/   [01;34mparatope-prediction-task_42[0m/        test.txt
create_dataset.py   [01;34mparatope-prediction-v2-task_0[0m/      [01;34mtmp_trainer[0m/
[01;34mdata[0m/               [01;34mparatope-prediction-v2-task_42[0m/     train.py
[01;34mdata2[0m/              [01;34mparatope-predictionv2-task_42[0m/      train.txt
[01;34mdata3[0m/              [01;34mparatope-prediction-v2-task_42_v2[0m/  [01;34munpair[0m/
dataset.py          [01;34m__pycache__[0m/                        [01;34munpaired_v2[0m/
data.zip            [01;34msaved[0m/                              unpaired_v2.zip
E-GEOD-35489.zip    [01;34msrc[0m/                                unpair.zip
[01;3

In [None]:
!unzip "/content/drive/MyDrive/cibi_prj/unpaired_v2.zip"

Archive:  /content/drive/MyDrive/cibi_prj/unpaired_v2.zip
replace unpaired/SRR13082934_1_Heavy_IGHD.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: unpaired/SRR13082934_1_Heavy_IGHD.csv  
  inflating: unpaired/SRR3106519_Heavy_Bulk.csv  
  inflating: unpaired/SRR13082934_1_Heavy_IGHE.csv  
  inflating: unpaired/SRR3106520_Heavy_Bulk.csv  
  inflating: unpaired/SRR13082934_1_Heavy_IGHG.csv  
  inflating: unpaired/SRR13082934_1_Heavy_IGHM.csv  
  inflating: unpaired/SRR13082935_1_Heavy_Bulk.csv  
  inflating: unpaired/SRR3106521_Heavy_Bulk.csv  
  inflating: unpaired/SRR13082935_1_Heavy_IGHA.csv  
  inflating: unpaired/SRR13082935_1_Heavy_IGHD.csv  
  inflating: unpaired/SRR3106522_Heavy_Bulk.csv  
  inflating: unpaired/SRR13082935_1_Heavy_IGHE.csv  
  inflating: unpaired/SRR13082935_1_Heavy_IGHG.csv  
  inflating: unpaired/SRR13082935_1_Heavy_IGHM.csv  
  inflating: unpaired/SRR13082936_1_Heavy_Bulk.csv  
  inflating: unpaired/SRR3106471_Heavy_Bulk.csv  
  inflating: unpaired/

In [3]:
from transformers import (
    RobertaConfig,
    RobertaTokenizer,
    RobertaForMaskedLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset
import os
import pandas as pd

In [4]:
# Location of the saved antibody tokenizer, relative to the project folder
TOKENIZER_DIR = "./antiberta/antibody-tokenizer"
# Masking probability handed to the MLM data collator
MLM_PROBABILITY = 0.15

# Load the tokenizer from disk.
# NOTE(review): loading emits a warning that the checkpoint declares 'BertTokenizer'
# while RobertaTokenizer is used here - confirm the tokenization is as intended.
tokenizer = RobertaTokenizer.from_pretrained(TOKENIZER_DIR)

# The collator assembles batches and applies the masking needed for masked-LM training
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=MLM_PROBABILITY,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.


## Text Data preprocessing

In [5]:
# Load a single CSV file as a sanity check of the raw data layout.
# skiprows=1 drops the first line of the file before the header row is parsed
# (presumably a metadata line - confirm against the raw file).
df = pd.read_csv('./data3/unpair/ERR220397_Heavy_Bulk.csv', sep=',', skiprows=1, encoding='utf-8')

print(df.columns)
# Call info() so the column/dtype summary is printed; `print(df.info)` only printed
# the repr of the bound method, not the summary.
df.info()
print(df['sequence'])

0       NNCCCTATCCCCCTGTGTGCCTTGAGAGACGCCGACCACTCATCTC...
1       CTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTATGGTGGGTCCTT...
2       NCCCCTATCCCCCTGTGTGCCTTGAGAGACGCCGACCACTCATCTC...
3       CGGGGAGTCTCTGAAGATCTCCTGTAAGGGTTCTGGATACAGCTTT...
4       NNNNCCTACCTCCCCTCGTGTGCCTTGAGAGACGCCGACCACTCAT...
                              ...                        
1767    CTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTATGGTGGGTCCTT...
1768    CCCCATCCCCCTGTNTGCCCTTGAGNACGCCGACCACTCATCTCGT...
1769    CCTATCCCCTGTGTGCCTTAAGAGACGCCGCCACTCATCTCGTGAA...
1770    CTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTATGGTGGGTCCTT...
1771    CTTCGAGACCCTGTCCCTCACCTGCGCTGTCTATGGTGGGTCCTTC...
Name: sequence, Length: 1772, dtype: object


In [None]:
import glob
import os

import pandas as pd
from sklearn.model_selection import train_test_split

# Columns needed to compute the CDR lengths and apply the position filters
CDR_COLUMNS = ['cdr1_start', 'cdr1_end', 'cdr2_start', 'cdr2_end', 'cdr3_start', 'cdr3_end']
# Directory the later cells read train.txt / val.txt / test.txt from
SPLIT_DIR = './assets'

# 1. Read every CSV file and collect the sequences that pass the filters
all_sequences = []  # Only the raw 'sequence' strings are kept

# sorted() fixes the file order (glob order is filesystem-dependent), so the seeded
# split below produces the same partition on every run.
for file in sorted(glob.glob('./unpaired_v2/unpaired/*.csv')):  # Adjust the path if needed
    try:
        df = pd.read_csv(file, sep=',', skiprows=1, encoding='utf-8')

        # Skip files that yield no rows
        if df.empty:
            print(f"File {file} is empty or unreadable")
            continue

        if 'stop_codon' not in df.columns:
            print(f"File {file} is missing 'stop_codon' column.")
            continue

        if not all(col in df.columns for col in CDR_COLUMNS):
            print(f"File {file} is missing necessary CDR columns.")
            continue

        # Keep rows whose 'stop_codon' flag is 'F'. The explicit .copy() gives an
        # independent frame, so adding the length columns below does not write into a
        # slice of `df` (which triggers pandas' SettingWithCopyWarning).
        filtered_data = df[df['stop_codon'] == 'F'].copy()

        # CDR lengths from the inclusive start/end positions
        filtered_data['cdr1_length'] = filtered_data['cdr1_end'] - filtered_data['cdr1_start'] + 1
        filtered_data['cdr2_length'] = filtered_data['cdr2_end'] - filtered_data['cdr2_start'] + 1
        filtered_data['cdr3_length'] = filtered_data['cdr3_end'] - filtered_data['cdr3_start'] + 1

        # Filter on CDR positions and lengths
        filtered_data = filtered_data[
            (filtered_data['cdr1_start'] >= 20) &
            (filtered_data['sequence'].str.len() - filtered_data['cdr3_end'] >= 10) &
            (filtered_data['cdr1_length'].between(5, 12)) &
            (filtered_data['cdr2_length'].between(1, 10)) &
            (filtered_data['cdr3_length'].between(5, 38))
        ]

        # Collect the contents of the 'sequence' column (nucleotide strings in the
        # files inspected above), dropping missing values
        all_sequences.extend(filtered_data['sequence'].dropna().tolist())

    except Exception as e:
        # Best effort: report the problematic file and carry on with the rest
        print(f"Error reading file {file}: {e}")

# Fail with a clear message instead of an obscure error from train_test_split
if not all_sequences:
    raise ValueError(
        "No sequences passed the filters; check the input path and the CSV columns."
    )

# 2. Split into train (80%), validation (10%) and test (10%) sets
train_seqs, temp_seqs = train_test_split(all_sequences, test_size=0.2, random_state=42)
val_seqs, test_seqs = train_test_split(temp_seqs, test_size=0.5, random_state=42)

# 3. Save one sequence per line, in the directory the later cells load from
os.makedirs(SPLIT_DIR, exist_ok=True)
for seqs, filename in zip([train_seqs, val_seqs, test_seqs], ['train.txt', 'val.txt', 'test.txt']):
    with open(os.path.join(SPLIT_DIR, filename), 'w') as f:
        for seq in seqs:
            f.write(seq + '\n')  # Only the sequence itself, no additional columns

# Print the number of sequences remaining after filtering
print(f"Number of sequences after filtering: {len(all_sequences)}")


Number of sequences after filtering: 3297


In [None]:
# Load the training split with plain Python file I/O, stripping the trailing
# "\n" (and any other surrounding whitespace) from every line as it is read
with open("./assets/train.txt", 'r') as f:
    train_txt = [raw_line.strip() for raw_line in f]

# Preview the first five lines
print(train_txt[:5])

# Report how many lines train.txt contains
num_lines = len(train_txt)
print(f"Number of rows in the file train.txt: {num_lines}")


['GGGGGTCTCAGGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTATTCCTCACCCTCCTCACTCAGGGCACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAACTATGTCTCCTGGTACCAACAACACCCCGGCAAAGCCCCCAAACTCATGATTTATGATGTCACTAATCGGCCCTCAGGGATTTCTAATCGCTTCTCTGCCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTACTACTGCAGTTCATATACAATTGGCAGAGCTGTCTTCGGAAGTGGGACCAAGGTCACCGTCCTAGGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTACAAGCCAACGT', 'GCTCACTGCACAGGGTCCTGGGCCCAGTCTGTATTGACGCAGCCGCCTTCAGTGTCTGCGGCCCCCGGACAAAGGGTCACCATCTCCTGCTTTGGAAGCAGATTCAGTTCTGTCTCCTGGTACCAACAACTCCCAGGGACAGCCCCCAAACTCCTCATCTATCCAAATGGTGAGCGGCCCTCAGGCATTCCTGACCGATTTTTTGGCTCCGAGTCTGGCACGTCAGCCACCCTGGGCATCACCGCAGTCCAGACTGGGGACGAGGCCGATTATTATTGCGGAACATGGGATGACAGCCTGAGTTCTGTGGTCTTCGGCGGAGGACCAAGCTCGACCGTACCTAAGTCACGCCCGAAGGCTCGCCCCCTCGGTCACTCTCGTTCCCGCCCTCTCTAGTGGAGTCTTACAACGCCAAACGAACGGCCCAACACGTGGTCG', 'GGGGGTCACAAGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTGCTCCTCACTCTCCTCACTCAGG

In [None]:
# Text file(s) backing each named split
text_datasets = {
    "train": ['./assets/train.txt'],
    "eval": ['./assets/val.txt'],
    "test": ['./assets/test.txt']
}


def _tokenize_batch(batch):
    """Tokenize a batch of text lines, padding/truncating each one to 150 tokens.

    The special-tokens mask is returned alongside the usual tokenizer outputs.
    """
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=150,
        return_special_tokens_mask=True,
    )


# Each line of the text files becomes one example with a single "text" column
dataset = load_dataset("text", data_files=text_datasets)

# Replace the raw "text" column with the tokenizer outputs for every split
tokenized_dataset = dataset.map(
    _tokenize_batch,
    batched=True,
    num_proc=1,
    remove_columns=["text"],
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2637 [00:00<?, ? examples/s]

Map:   0%|          | 0/330 [00:00<?, ? examples/s]

Map:   0%|          | 0/330 [00:00<?, ? examples/s]

In [None]:
# Display the tokenized training split (its feature columns and number of rows).
tokenized_dataset['train']

Dataset({
    features: ['input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
    num_rows: 2637
})

## Model configuration

In [5]:
# Hyper-parameters we've used for pre-training, gathered in one place.
antiberta_config = dict(
    # --- model architecture ---
    num_hidden_layers=12,
    num_attention_heads=12,
    hidden_size=768,
    d_ff=3072,
    vocab_size=25,
    # --- sequence length limits ---
    max_len=150,
    max_position_embeddings=152,
    # --- optimisation ---
    batch_size=32,
    max_steps=5000,
    weight_decay=0.01,
    peak_learning_rate=1e-4,
)


In [6]:
# Build the RoBERTa architecture from the hyper-parameters in `antiberta_config`.
model_config = RobertaConfig(
    vocab_size=antiberta_config.get("vocab_size"),
    hidden_size=antiberta_config.get("hidden_size"),
    # Feed-forward width. `d_ff` was declared in the config but never passed on, so
    # editing it had no effect; the fallback equals the library default (3072).
    intermediate_size=antiberta_config.get("d_ff", 3072),
    max_position_embeddings=antiberta_config.get("max_position_embeddings"),
    num_hidden_layers=antiberta_config.get("num_hidden_layers", 12),
    num_attention_heads=antiberta_config.get("num_attention_heads", 12),
    type_vocab_size=1,
)
# Instantiate the masked-LM model from the config alone (no pretrained weights are loaded here).
model = RobertaForMaskedLM(model_config)

In [None]:
# Training arguments.
# Batch size, step count, weight decay and learning rate are read from
# `antiberta_config` instead of being repeated as literals, so editing the config cell
# actually changes the training run. The resulting values are the same as before.
args = TrainingArguments(
    output_dir="test5",  # change this to the directory where checkpoints should be saved
    overwrite_output_dir=True,
    per_device_train_batch_size=antiberta_config["batch_size"],
    per_device_eval_batch_size=antiberta_config["batch_size"],
    max_steps=antiberta_config["max_steps"],
    save_steps=200,
    logging_steps=200,
    adam_beta2=0.98,
    adam_epsilon=1e-6,
    weight_decay=antiberta_config["weight_decay"],
    warmup_steps=100,
    learning_rate=antiberta_config["peak_learning_rate"],
    gradient_accumulation_steps=1,
    fp16=True,
    # NOTE(review): newer transformers releases appear to rename this argument to
    # `eval_strategy` - confirm against the installed version before changing it.
    evaluation_strategy="steps",
    seed=42
)



## Setup of the HuggingFace Trainer

In [None]:
# Splits used for optimisation and for the periodic evaluation during training
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["eval"]

# Bundle the model, training arguments, masking collator and data into a Trainer
trainer = Trainer(
    model=model, args=args, data_collator=collator,
    train_dataset=train_split, eval_dataset=eval_split,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run training with the Trainer configured above; the value returned by train()
# (a TrainOutput with the step count and metrics) is shown as the cell output.
trainer.train()

Step,Training Loss,Validation Loss
200,1.0209,0.421411
400,0.2809,0.222352
600,0.1835,0.175487
800,0.149,0.145062
1000,0.1365,0.135046
1200,0.1221,0.134074
1400,0.1173,0.118166
1600,0.1119,0.11494
1800,0.1042,0.119441
2000,0.1012,0.113656


TrainOutput(global_step=5000, training_loss=0.13766126956939698, metrics={'train_runtime': 772.5428, 'train_samples_per_second': 207.108, 'train_steps_per_second': 6.472, 'total_flos': 2.4165821771208e+16, 'train_loss': 0.13766126956939698, 'epoch': 60.24096385542169})

In [None]:
# Save the trained model into the output directory the Trainer was configured with.
# The previous argument, `options.dir`, referenced a name that is never defined in this
# notebook and raised NameError on a fresh run.
trainer.save_model(trainer.args.output_dir)

In [None]:
# Run the trained model over the test split and keep the returned prediction output in `out`.
out = trainer.predict(tokenized_dataset['test'])