## ProtBERT pre-trained model

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load protein sequences from files
def load_data(file_path):
    """Read protein sequences from a FASTA or one-sequence-per-line file.

    Header lines (starting with ``>``) are never returned as sequences, and
    the residue lines that follow a header are joined into a single sequence,
    so a record wrapped over several lines yields exactly one entry. Lines
    that appear before any header are treated as one sequence each, which
    keeps plain "one sequence per line" files working. Blank lines and
    surrounding whitespace (including the trailing newline) are dropped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of sequence strings, in file order.
    """
    sequences = []
    current = None  # residue chunks of the open FASTA record; None before the first header
    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('>'):
                # A new header closes the previous record (if it had residues).
                if current:
                    sequences.append(''.join(current))
                current = []
            elif current is None:
                # No header seen yet: plain one-sequence-per-line content.
                sequences.append(line)
            else:
                current.append(line)
    if current:
        sequences.append(''.join(current))
    return sequences

# Dataset pairing each raw sequence string with its label (no tokenization happens here)
class ProteinDataset(Dataset):
    """Map-style dataset exposing raw sequences alongside their labels.

    No tokenization is performed here; each item is a plain dict with the
    keys ``'sequence'`` and ``'label'``.
    """

    def __init__(self, sequences, labels):
        # Both collections are stored as given and indexed in parallel.
        self.sequences, self.labels = sequences, labels

    def __len__(self):
        # The dataset size follows the sequence collection.
        count = len(self.sequences)
        return count

    def __getitem__(self, idx):
        sample = dict(sequence=self.sequences[idx], label=self.labels[idx])
        return sample

# Load protein sequences and labels
def _to_protbert_input(sequence):
    """Return *sequence* formatted as the Rostlab/prot_bert model card requires.

    All whitespace (including a trailing newline left by line-based loading)
    is removed, the rare residues U, Z, O and B are mapped to X, and the
    remaining residues are joined with single spaces so the tokenizer sees
    one token per residue. Already-spaced input comes back unchanged.
    """
    residues = ''.join(sequence.split()).translate(str.maketrans('UZOB', 'XXXX'))
    return ' '.join(residues)


# Entries that are empty after formatting (e.g. blank lines) are not proteins
# and are dropped before labels are assigned, so counts stay consistent.
moonlight_sequences = [
    seq for seq in map(_to_protbert_input, load_data('moonlight.fasta')) if seq
]
non_moonlight_sequences = [
    seq for seq in map(_to_protbert_input, load_data('nonMP.fasta')) if seq
]

# Assign labels (1 for moonlight, 0 for non-moonlight)
moonlight_labels = [1] * len(moonlight_sequences)
non_moonlight_labels = [0] * len(non_moonlight_sequences)

# Concatenate sequences and labels
all_sequences = moonlight_sequences + non_moonlight_sequences
all_labels = moonlight_labels + non_moonlight_labels

# Split the data into training and testing sets
train_sequences, test_sequences, train_labels, test_labels = train_test_split(
    all_sequences, all_labels, test_size=0.2, random_state=42
)

# Tokenize sequences
tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert', do_lower_case=False)  # Example protein BERT model
# max_length is given explicitly: without it truncation=True is ignored for
# this tokenizer ("Default to no truncation") and every sequence is padded to
# the longest protein in the split.
# NOTE(review): these eager encodings are not used by the training/evaluation
# loops in this cell, which tokenize per batch; kept in case another cell reads them.
train_encodings = tokenizer(train_sequences, truncation=True, padding=True, return_tensors='pt', max_length=512)
test_encodings = tokenizer(test_sequences, truncation=True, padding=True, return_tensors='pt', max_length=512)

# Create datasets
train_dataset = ProteinDataset(train_sequences, train_labels)
test_dataset = ProteinDataset(test_sequences, test_labels)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Initialize and fine-tune the model
model = BertForSequenceClassification.from_pretrained('Rostlab/prot_bert', num_labels=2)  # Two classes: moonlight and non-moonlight
optimizer = AdamW(model.parameters(), lr=1e-5)

# Run on the GPU when one is available; inputs and labels are moved to the
# same device inside the loops below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fine-tune the model
model.train()
for epoch in range(10):  # You may need to adjust the number of epochs based on your data
    for batch in train_loader:
        # Tokenize per batch so padding only extends to the longest sequence in it.
        inputs = tokenizer(
            list(batch['sequence']), truncation=True, padding=True, return_tensors='pt', max_length=512
        ).to(device)
        # The DataLoader's default collate already turned the labels into a
        # tensor; re-wrapping it with torch.tensor() only triggers a warning.
        labels = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        inputs = tokenizer(
            list(batch['sequence']), truncation=True, padding=True, return_tensors='pt', max_length=512
        ).to(device)
        outputs = model(**inputs)
        logits = outputs.logits
        # Collect plain Python ints; test_loader is not shuffled, so the order
        # matches test_labels.
        predictions.extend(logits.argmax(dim=1).cpu().tolist())

# Calculate accuracy
accuracy = accuracy_score(test_labels, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_bert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  labels = torch.tensor(batch['label'])


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch

# Read the combined dataset and discard the 'Name' column
dataset = pd.read_csv('combined_dataset.csv').drop(columns='Name')

# Everything except 'Label' is a feature; 'Label' is the target
y = dataset['Label']
X = dataset.drop(columns='Label')

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Turn each row into one text string: every feature value is stringified and
# the values of a row are joined with single spaces.
# NOTE(review): presumably the feature columns hold amino-acid residues -
# verify against combined_dataset.csv.
X_train_text, X_test_text = (
    frame.astype(str).apply(' '.join, axis=1).values for frame in (X_train, X_test)
)

# Pre-trained ProtBERT-BFD tokenizer and classification model
_checkpoint = 'Rostlab/prot_bert_bfd'
tokenizer = BertTokenizer.from_pretrained(_checkpoint)
model = BertForSequenceClassification.from_pretrained(_checkpoint)

# Encode both splits with identical settings (pad, truncate to 512 tokens)
_encode_kwargs = dict(padding=True, truncation=True, return_tensors='pt', max_length=512)
X_train_tokens = tokenizer(list(X_train_text), **_encode_kwargs)
X_test_tokens = tokenizer(list(X_test_text), **_encode_kwargs)

# Labels as tensors
y_train_tensor = torch.tensor(y_train.values)
y_test_tensor = torch.tensor(y_test.values)

# Bundle (input_ids, attention_mask, label) triples and wrap them in loaders
train_dataset = TensorDataset(
    X_train_tokens['input_ids'], X_train_tokens['attention_mask'], y_train_tensor
)
test_dataset = TensorDataset(
    X_test_tokens['input_ids'], X_test_tokens['attention_mask'], y_test_tensor
)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=8)

# Optimizer, target device and model placement
optimizer = AdamW(model.parameters(), lr=1e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fine-tuning: one optimizer step per mini-batch, for a fixed number of epochs
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        # Each batch is an (input_ids, attention_mask, labels) triple; move all
        # three tensors to the model's device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Score the held-out split without tracking gradients
model.eval()
all_preds = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # The predicted class is the index of the largest logit in each row.
        preds = logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)

# Fraction of test rows whose predicted class equals the true label
accuracy = accuracy_score(y_test, all_preds)
print(f'Test Accuracy: {accuracy}')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_bert_bfd and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Accuracy: 0.5915492957746479


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load protein sequences from files
def load_data(file_path):
    """Read protein sequences from a FASTA or one-sequence-per-line file.

    Header lines (starting with ``>``) are never returned as sequences, and
    the residue lines that follow a header are joined into a single sequence,
    so a record wrapped over several lines yields exactly one entry. Lines
    that appear before any header are treated as one sequence each, which
    keeps plain "one sequence per line" files working. Blank lines and
    surrounding whitespace (including the trailing newline) are dropped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of sequence strings, in file order.
    """
    sequences = []
    current = None  # residue chunks of the open FASTA record; None before the first header
    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('>'):
                # A new header closes the previous record (if it had residues).
                if current:
                    sequences.append(''.join(current))
                current = []
            elif current is None:
                # No header seen yet: plain one-sequence-per-line content.
                sequences.append(line)
            else:
                current.append(line)
    if current:
        sequences.append(''.join(current))
    return sequences

# Dataset pairing each raw sequence string with its label (no tokenization happens here)
class ProteinDataset(Dataset):
    """Map-style dataset exposing raw sequences alongside their labels.

    No tokenization is performed here; each item is a plain dict with the
    keys ``'sequence'`` and ``'label'``.
    """

    def __init__(self, sequences, labels):
        # Both collections are stored as given and indexed in parallel.
        self.sequences, self.labels = sequences, labels

    def __len__(self):
        # The dataset size follows the sequence collection.
        count = len(self.sequences)
        return count

    def __getitem__(self, idx):
        sample = dict(sequence=self.sequences[idx], label=self.labels[idx])
        return sample

# Read both classes of sequences from disk
moonlight_sequences = load_data('moonlight.fasta')
non_moonlight_sequences = load_data('nonMP.fasta')

# One label per sequence: 1 marks moonlight, 0 marks non-moonlight
moonlight_labels = [1 for _ in moonlight_sequences]
non_moonlight_labels = [0 for _ in non_moonlight_sequences]

# Merge the two classes, keeping sequences and labels aligned
all_sequences = [*moonlight_sequences, *non_moonlight_sequences]
all_labels = [*moonlight_labels, *non_moonlight_labels]

# 80/20 train/test split with a fixed seed
train_sequences, test_sequences, train_labels, test_labels = train_test_split(
    all_sequences,
    all_labels,
    random_state=42,
    test_size=0.2,
)

# DistilBERT tokenizer, plus eager encodings of both splits
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
_eager_kwargs = dict(truncation=True, padding=True, return_tensors='pt')
train_encodings = tokenizer(train_sequences, **_eager_kwargs)
test_encodings = tokenizer(test_sequences, **_eager_kwargs)

# Datasets hold the raw sequences and labels
train_dataset = ProteinDataset(train_sequences, train_labels)
test_dataset = ProteinDataset(test_sequences, test_labels)

# Mini-batch loaders: shuffled for training, in order for testing
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=8)

# Initialize and fine-tune the DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
optimizer = AdamW(model.parameters(), lr=1e-5)

# Run on the GPU when one is available; inputs and labels are moved to the
# same device inside the loops below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fine-tune the model
model.train()
for epoch in range(3):  # You may need to adjust the number of epochs based on your data
    for batch in train_loader:
        # Tokenize per batch so padding only extends to the longest sequence in it.
        inputs = tokenizer(
            list(batch['sequence']), truncation=True, padding=True, return_tensors='pt', max_length=512
        ).to(device)
        # The DataLoader's default collate already turned the labels into a
        # tensor; re-wrapping it with torch.tensor() only triggers a warning.
        labels = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        inputs = tokenizer(
            list(batch['sequence']), truncation=True, padding=True, return_tensors='pt', max_length=512
        ).to(device)
        outputs = model(**inputs)
        logits = outputs.logits
        # Collect plain Python ints; test_loader is not shuffled, so the order
        # matches test_labels.
        predictions.extend(logits.argmax(dim=1).cpu().tolist())

# Calculate accuracy
accuracy = accuracy_score(test_labels, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")
