In [11]:
from transformers import BertTokenizer, BertForMaskedLM, BertConfig
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset

from experiments.data import samples
from scripts.data.filter_dataset import human_dataset
from src.dataset import load_benchmark_dataset, Species, Modification

hdataset = load_benchmark_dataset(Species.human, Modification.psi)

sequences = hdataset.samples['sequence'].values.tolist()
labels = hdataset.targets.values.tolist()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(sequences, return_tensors='pt', padding=True, truncation=True, max_length=21)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

dataset = Dataset.from_dict({"input_ids": inputs['input_ids'], "labels": inputs['input_ids']})

config = BertConfig(vocab_size=tokenizer.vocab_size, max_position_embeddings=21)
model = BertForMaskedLM(config)

training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,3.8649


TrainOutput(global_step=620, training_loss=3.671775424095892, metrics={'train_runtime': 168.1611, 'train_samples_per_second': 29.436, 'train_steps_per_second': 3.687, 'total_flos': 53437774005000.0, 'train_loss': 3.671775424095892, 'epoch': 5.0})

In [21]:
from transformers import BertTokenizer, BertForMaskedLM, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset, load_metric
import torch

sequences = hdataset.samples['sequence'].values.tolist()
targets = hdataset.targets.values.tolist()

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(sequences, return_tensors='pt', padding=True, truncation=True, max_length=21)

# Masked Language Modeling data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Create self-supervised dataset
self_supervised_dataset = Dataset.from_dict({"input_ids": inputs['input_ids']})

# Define MLM model
mlm_model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Training arguments for self-supervised learning
mlm_training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
)

# Trainer for self-supervised learning
mlm_trainer = Trainer(
    model=mlm_model,
    args=mlm_training_args,
    data_collator=data_collator,
    train_dataset=self_supervised_dataset,
)

# Self-supervised training
mlm_trainer.train()
mlm_model.save_pretrained('./pretrained_mlm_model')

# Fine-tuning dataset
fine_tune_dataset = Dataset.from_dict({
    "input_ids": inputs['input_ids'],
    "labels": torch.tensor(targets)
})

# Define fine-tuning model
fine_tuning_model = BertForSequenceClassification.from_pretrained('./pretrained_mlm_model', num_labels=2)

# Training arguments for fine-tuning
fine_tuning_args = TrainingArguments(
    output_dir='./fine_tuned_results',
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch"
)

# Trainer for fine-tuning
fine_tuning_trainer = Trainer(
    model=fine_tuning_model,
    args=fine_tuning_args,
    train_dataset=fine_tune_dataset,
    eval_dataset=fine_tune_dataset,  # Correctly define eval_dataset here
)

# Supervised fine-tuning
fine_tuning_trainer.train()
fine_tuning_model.save_pretrained('./fine_tuned_classification_model')

# Load metric
metric = load_metric("accuracy")

# Evaluate the model
eval_results = fine_tuning_trainer.evaluate()

# Compute accuracy
predictions, labels, _ = fine_tuning_trainer.predict(fine_tune_dataset)
predictions = predictions.argmax(axis=1)
accuracy = metric.compute(predictions=predictions, references=labels)

print(f"Accuracy: {accuracy['accuracy']:.4f}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,2.5393


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./pretrained_mlm_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.683739
2,No log,0.694044
3,No log,0.670669
4,No log,0.633731
5,0.691300,0.607633


Accuracy: 0.6747
