<a href="https://colab.research.google.com/github/ArthurWallaceIFB/AnaliseAlgoritmos_Projeto1/blob/main/AMP_BERT_Fine_tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install accelerate -U



In [None]:
import os
import pandas as pd
import numpy as np
import torch
import re

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

from transformers import (
    AdamW,
    AutoTokenizer,
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)

NameError: ignored

In [None]:
class amp_data(Dataset):
    """Map-style dataset that tokenizes peptide sequences on the fly.

    Every item is a dict of tensors produced by the tokenizer (padded or
    truncated to ``max_len`` tokens) plus a ``labels`` tensor holding the
    integer AMP label of that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``aa_seq`` column (amino-acid strings) and an ``AMP``
        column that can be cast to ``int``.
    tokenizer_name : str
        Identifier handed to ``BertTokenizer.from_pretrained``.
    max_len : int
        Fixed token length every encoded sequence is padded/truncated to.
    """

    def __init__(self, df, tokenizer_name='Rostlab/prot_bert_bfd', max_len=200):
        # Keep the frame that was actually passed in. Reading a notebook-global
        # `df` instead would silently ignore the argument (e.g. a filtered
        # subset) and wrap whatever `df` happens to be defined at call time.
        self.df = df
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)
        self.max_len = max_len
        self.seqs, self.labels = self.get_seqs_labels()

    def get_seqs_labels(self):
        """Return ``(sequences, labels)`` as parallel Python lists read from ``self.df``."""
        seqs = self.df['aa_seq'].tolist()
        labels = self.df['AMP'].astype(int).tolist()

        return seqs, labels

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors together with ``labels``."""
        # Drop any whitespace inside the raw sequence, then put exactly one
        # space between consecutive residues before tokenizing.
        seq = " ".join("".join(self.seqs[idx].split()))
        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_len)
        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        sample['labels'] = torch.tensor(self.labels[idx])

        return sample

In [None]:
# define the necessary metrics for performance evaluation

def compute_metrics(pred):
    """Score classifier predictions for the Hugging Face ``Trainer``.

    ``pred`` must expose ``label_ids`` and ``predictions``; the predicted class
    is the arg-max over the last axis of ``predictions``. Returns a dict with
    the accuracy and the binary-averaged F1, precision and recall.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    # Tuple layout is (precision, recall, f1, support).
    prf = precision_recall_fscore_support(y_true, y_hat, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

# Treinamento para AMP e Não-AMP

In [None]:
# Download the all_veltri.csv table (first column is the index), shuffle its
# rows reproducibly, and wrap the result in the tokenizing dataset class.

data_url = 'https://raw.githubusercontent.com/GIST-CSBL/AMP-BERT/main/all_veltri.csv'
df = pd.read_csv(data_url, index_col=0).sample(frac=1, random_state=0)
print(df.head(7))

train_dataset = amp_data(df)

In [None]:
# define the model initializing function for Trainer in huggingface

def model_init():
    """Build a sequence classifier from the pretrained ProtBERT-BFD checkpoint.

    Handed to ``Trainer(model_init=...)`` so the trainer creates the model itself.
    """
    checkpoint = 'Rostlab/prot_bert_bfd'
    return BertForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# training on entire data
# no evaluation/validation

# Hyper-parameters for fine-tuning the classifier on the whole dataset.
# Nothing is evaluated or checkpointed during this run, so the options that
# only make sense with an evaluation loop (do_eval, load_best_model_at_end)
# are left out, and the evaluation strategy is left at its default ("no").
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    # 64 accumulated micro-batches of size 1 -> one optimizer step per 64 samples.
    gradient_accumulation_steps=64,
    warmup_steps=0,
    weight_decay=0.1,
    logging_dir='./logs',
    logging_steps=100,
    do_train=True,
    save_strategy='no',
    # Mixed precision needs a CUDA device; requesting it on a CPU-only runtime
    # fails when the arguments are validated, so enable it only with a GPU.
    fp16=torch.cuda.is_available(),
    run_name="AMP-BERT",
    seed=0,
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    # Not used while training (no evaluation); applied when predictions are scored.
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Score the fine-tuned classifier on the very data it was trained on
# (a sanity check, not a held-out estimate).

train_output = trainer.predict(train_dataset)
predictions, label_ids, metrics = train_output
metrics

In [None]:
# Optional: persist the fine-tuned classifier to Google Drive.
from google.colab import drive

drive.mount('/content/drive')

classifier_dir = '/content/drive/MyDrive/Colab Notebooks/AMP-BERT/Fine-tuned_model/'
trainer.save_model(classifier_dir)

In [None]:
# Load the tokenizer and the fine-tuned classifier used for single-sequence
# predictions.
#
# IMPORTANT:
# Google Drive must be mounted and must contain your own fine-tuned model
# before the prediction cell below can run.
from google.colab import drive

drive.mount('/content/drive')

# Tokenizer comes from the base checkpoint (case preserved); weights come from
# the fine-tuned model directory on Drive.
classifier_dir = "/content/drive/MyDrive/Colab Notebooks/AMP-BERT/Fine-tuned_model/"
tokenizer = AutoTokenizer.from_pretrained('Rostlab/prot_bert_bfd', do_lower_case=False)
model = BertForSequenceClassification.from_pretrained(classifier_dir)

In [None]:
# predict AMP/non-AMP for a single example (default ex. is from external test data: DRAMP00126)

#@markdown **Input peptide sequence (upper case only)**
input_seq = 'DLIPTSSKLVVRDTSLQVKKAFFALVT' #@param {type:"string"}
# Put a single space between residues, then map the rare residues U, Z, O, B to X.
input_seq_spaced = ' '.join(input_seq)
input_seq_spaced = re.sub(r'[UZOB]', 'X', input_seq_spaced)
input_seq_tok = tokenizer(input_seq_spaced, return_tensors='pt')

# Inference only: disable dropout and skip gradient tracking.
model.eval()
with torch.no_grad():
    output = model(**input_seq_tok)
logits = output[0]

# extract AMP class probability and make binary prediction.
# The two logits are competing classes (compute_metrics scores the model by
# arg-max over them), so the class probability is a softmax across the logits.
# With softmax, "P(AMP) > 0.5" is exactly the arg-max decision; an element-wise
# sigmoid on one logit is not.
y_prob = torch.softmax(logits, dim=-1)[:, 1].cpu().numpy()
y_pred = y_prob > 0.5
input_class = 'AMP' if y_pred.item() else 'non-AMP'

print('Input peptide sequence: ' + input_seq)
print('Class prediction: ' + input_class)

# Fine Tuning

**FINE-TUNING PARA O MÉTODO MASK DO BERT**

In [None]:
# Reload and reshuffle the table, then keep only the rows flagged as AMP to
# build the dataset used for masked-LM fine-tuning.
data_url = 'https://raw.githubusercontent.com/GIST-CSBL/AMP-BERT/main/all_veltri.csv'
df = pd.read_csv(data_url, index_col=0).sample(frac=1, random_state=0)

new_df = df.loc[df['AMP'].eq(True)]

fine_tuning_dataset = amp_data(new_df)
print(fine_tuning_dataset)

In [None]:
# Inspect the AMP-only subset selected in the previous cell.
new_df

In [None]:
# Model factory handed to the masked-LM fine-tuning Trainer.
from transformers import BertForMaskedLM, BertTokenizer, pipeline, AutoModelForMaskedLM


def model_init_amp():
    """Load the pretrained ProtBERT-BFD checkpoint with a masked-language-model head."""
    checkpoint = 'Rostlab/prot_bert_bfd'
    return AutoModelForMaskedLM.from_pretrained(checkpoint)

In [None]:
# define the necessary metrics for performance evaluation

def compute_metrics(pred):
    """Return accuracy, F1, precision and recall for binary predictions.

    The predicted class is the arg-max over the last axis of ``pred.predictions``
    and is compared against ``pred.label_ids``.

    NOTE(review): a function with this same name is also defined earlier in the
    notebook; whichever cell runs last wins.
    """
    labels, preds = pred.label_ids, pred.predictions.argmax(-1)
    # zip stops after three names, dropping the trailing "support" entry.
    scores = dict(zip(
        ('precision', 'recall', 'f1'),
        precision_recall_fscore_support(labels, preds, average='binary'),
    ))
    scores['accuracy'] = accuracy_score(labels, preds)
    return {key: scores[key] for key in ('accuracy', 'f1', 'precision', 'recall')}

In [None]:
# Masked-LM fine-tuning on the fine-tuning dataset (AMP-positive sequences only).
from transformers import DataCollatorForLanguageModeling

# Nothing is evaluated or checkpointed during this run, so the options that
# only make sense with an evaluation loop are left out.
fine_tuning_args = TrainingArguments(
    output_dir='./results_fine_tuning_amp',
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=64,
    warmup_steps=0,
    weight_decay=0.1,
    logging_dir='./logs_fine_tuning_amp',
    logging_steps=100,
    do_train=True,
    save_strategy='no',
    # Mixed precision needs a CUDA device; enable it only when a GPU is present.
    fp16=torch.cuda.is_available(),
    run_name="AMP-BERT-fine-tuning",
    seed=0,
)

# A masked-LM head needs one label per token, not the single AMP/non-AMP class
# label the dataset yields. The collator masks random tokens and builds the
# matching token-level labels for each batch.
mlm_collator = DataCollatorForLanguageModeling(
    tokenizer=fine_tuning_dataset.tokenizer,
    mlm=True,
    mlm_probability=0.15,
)


def collate_for_mlm(samples):
    """Drop the scalar class label from each sample, then apply MLM masking."""
    unlabeled = [
        {key: value for key, value in sample.items() if key != 'labels'}
        for sample in samples
    ]
    return mlm_collator(unlabeled)


trainer_fine_tuning = Trainer(
    model_init=model_init_amp,  # builds the masked-LM model
    args=fine_tuning_args,
    train_dataset=fine_tuning_dataset,  # use the fine-tuning dataset
    data_collator=collate_for_mlm,
    # No compute_metrics: the binary classification metrics do not apply to
    # token-level masked-LM predictions, and this run performs no evaluation.
)

trainer_fine_tuning.train()

In [None]:
# Optional: persist the fine-tuned masked-LM model to Google Drive.
from google.colab import drive

drive.mount('/content/drive')

mlm_model_dir = '/content/drive/MyDrive/AMP-BERT/Fine-tuned_model-AMP/'
trainer_fine_tuning.save_model(mlm_model_dir)

In [None]:
# Load the tokenizer and the fine-tuned masked-LM model for mask filling.
#
# IMPORTANT:
# Google Drive must already be mounted at /content/drive and must contain your
# own fine-tuned model directory before this cell can run.
from transformers import BertTokenizer, BertForMaskedLM

# Tokenizer comes from the base checkpoint (case preserved); weights come from
# the fine-tuned directory on Drive.
mlm_model_dir = "/content/drive/MyDrive/AMP-BERT/Fine-tuned_model-AMP"
tokenizer_fine = BertTokenizer.from_pretrained('Rostlab/prot_bert_bfd', do_lower_case=False)
model_fine = BertForMaskedLM.from_pretrained(mlm_model_dir)

In [None]:
from transformers import BertForMaskedLM, BertTokenizer, pipeline

# Fill-mask pipeline backed by the fine-tuned masked-LM model and its tokenizer.
unmasker = pipeline('fill-mask', model=model_fine, tokenizer=tokenizer_fine)

# Space-separated residues with one position replaced by the [MASK] token.
input_seq = 'D L I P T S S K L V V [MASK] D T S L Q V K K A F F A L V T' #@param {type:"string"}
mask_candidates = unmasker(input_seq)
mask_candidates