In [5]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from Bio import SeqIO
import random

# Select the compute device: CUDA when torch reports a usable GPU, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load and preprocess your dataset
class CustomDataset(Dataset):
    """Torch dataset that tokenizes sequences for sequence classification.

    Args:
        sequences: Indexable collection of records. Each item is either an
            object exposing a ``.seq`` attribute or a plain string.
        labels: Indexable collection of class labels, convertible with ``int``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping containing ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every encoded sequence is padded/truncated to.
        augment: When True (default, the historical behaviour) each item is
            randomly mutated on access. Pass False for validation/test data so
            evaluation runs on the unmodified sequences.
        mutation_rate: Per-position probability of replacing a residue when
            ``augment`` is True. Must lie in ``[0, 1]``.

    Raises:
        ValueError: If ``sequences`` and ``labels`` differ in length, or if
            ``mutation_rate`` is outside ``[0, 1]``.
    """

    # Alphabet a mutated position is drawn from.
    AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY'

    def __init__(self, sequences, labels, tokenizer, max_length, augment=True, mutation_rate=0.1):
        if len(sequences) != len(labels):
            raise ValueError(
                f"sequences and labels must have the same length "
                f"(got {len(sequences)} and {len(labels)})"
            )
        if not 0.0 <= mutation_rate <= 1.0:
            raise ValueError(f"mutation_rate must be in [0, 1], got {mutation_rate}")

        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augment = augment
        self.mutation_rate = mutation_rate

    def __len__(self):
        """Return the number of records in the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the encoded item at ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (1-D tensors of
            length ``max_length``) and ``'labels'`` (scalar long tensor).
        """
        record = self.sequences[idx]
        # Accept both record objects carrying `.seq` and plain strings.
        sequence = str(getattr(record, 'seq', record))
        label = int(self.labels[idx])

        # Data augmentation: only when enabled, so eval data can stay untouched.
        if self.augment:
            sequence = self.apply_random_mutations(sequence)

        encoding = self.tokenizer(
            sequence,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        return {
            # squeeze(0) drops only the batch axis added by return_tensors='pt';
            # a bare squeeze() would also collapse a length-1 sequence axis.
            'input_ids': encoding['input_ids'].squeeze(0),
            # The mask lets the model ignore the padding added up to max_length.
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

    def apply_random_mutations(self, sequence):
        """Return a copy of ``sequence`` with random point substitutions.

        Each position is independently replaced, with probability
        ``self.mutation_rate``, by a character drawn uniformly from
        ``AMINO_ACIDS`` (the draw may reproduce the original character).
        """
        rate = self.mutation_rate
        return ''.join(
            random.choice(self.AMINO_ACIDS) if random.random() < rate else residue
            for residue in sequence
        )

# Load the two FASTA files; SeqIO.parse yields records, materialised here as lists
moonlight_sequences = list(SeqIO.parse("moonlight.fasta", "fasta"))
non_moonlight_sequences = list(SeqIO.parse("nonMP.fasta", "fasta"))

# Label convention: 1 for records from moonlight.fasta, 0 for records from nonMP.fasta
moonlight_labels = [1] * len(moonlight_sequences)
non_moonlight_labels = [0] * len(non_moonlight_sequences)

# Positives first, negatives second; labels are built in the same order
all_sequences = moonlight_sequences + non_moonlight_sequences
all_labels = moonlight_labels + non_moonlight_labels

# Show a compact summary rather than dumping every record into the cell output
print(
    f"Loaded {len(moonlight_sequences)} moonlight and "
    f"{len(non_moonlight_sequences)} non-moonlight sequences "
    f"({len(all_sequences)} total)"
)

# train_sequences, test_sequences, train_labels, test_labels = train_test_split(
#     all_sequences, all_labels, test_size=0.2, random_state=42
# )
# 
# train_sequences, val_sequences, train_labels, val_labels = train_test_split(
#     train_sequences, train_labels, test_size=0.2, random_state=42
# )
# 
# # Create datasets
# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# 
# train_dataset = CustomDataset(train_sequences, train_labels, tokenizer, max_length=256)  # Increase max_length
# val_dataset = CustomDataset(val_sequences, val_labels, tokenizer, max_length=256)  # Increase max_length
# test_dataset = CustomDataset(test_sequences, test_labels, tokenizer, max_length=256)  # Increase max_length
# 
# # Calculate class weights for data balancing
# class_weights = torch.tensor([1.0 / sum(train_labels), 1.0 / (len(train_labels) - sum(train_labels))], dtype=torch.float32)
# class_weights = class_weights.to(device)
# 
# # Model
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
# model.to(device)
# 
# # Optimizer and scheduler
# optimizer = AdamW(model.parameters(), lr=5e-6, weight_decay=1e-2)  # Experiment with a lower learning rate
# total_steps = len(train_dataset) * 10  # Increase the number of epochs
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
# 
# # Training loop
# for epoch in range(10):  # Increase the number of epochs
#     model.train()
#     total_loss = 0
# 
#     for batch in DataLoader(train_dataset, batch_size=8, shuffle=True):
#         inputs = batch['input_ids'].to(device)
#         labels = batch['labels'].to(device)
# 
#         optimizer.zero_grad()
#         outputs = model(inputs, labels=labels)
#         loss = outputs.loss
# 
#         # Apply class weights for data balancing
#         loss = (loss * class_weights[labels]).mean()
# 
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# 
#         optimizer.step()
#         scheduler.step()
# 
#         total_loss += loss.item()
# 
#     average_loss = total_loss / len(train_dataset)
#     print(f'Epoch {epoch + 1}/10, Average Training Loss: {average_loss}')
# 
#     # Validation
#     model.eval()
#     val_predictions = []
#     val_true_labels = []
# 
#     for val_batch in DataLoader(val_dataset, batch_size=8, shuffle=False):
#         val_inputs = val_batch['input_ids'].to(device)
#         val_labels = val_batch['labels'].to(device)
# 
#         with torch.no_grad():
#             val_outputs = model(val_inputs)
# 
#         logits = val_outputs.logits
#         predictions = torch.argmax(logits, dim=1)
#         val_predictions.extend(predictions.cpu().numpy())
#         val_true_labels.extend(val_labels.cpu().numpy())
# 
#     val_accuracy = accuracy_score(val_true_labels, val_predictions)
#     print(f'Epoch {epoch + 1}/10, Validation Accuracy: {val_accuracy}')
# 
# # Testing
# model.eval()
# test_predictions = []
# test_true_labels = []
# 
# for test_batch in DataLoader(test_dataset, batch_size=8, shuffle=False):
#     test_inputs = test_batch['input_ids'].to(device)
#     test_labels = test_batch['labels'].to(device)
# 
#     with torch.no_grad():
#         test_outputs = model(test_inputs)
# 
#     logits = test_outputs.logits
#     predictions = torch.argmax(logits, dim=1)
#     test_predictions.extend(predictions.cpu().numpy())
#     test_true_labels.extend(test_labels.cpu().numpy())
# 
# test_accuracy = accuracy_score(test_true_labels, test_predictions)
# print(f'Test Accuracy: {test_accuracy}')


AttributeError: 'list' object has no attribute 'seq'

In [4]:
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from Bio import SeqIO


def load_and_preprocess_data(file_path):
    """Read a FASTA file and return its sequences as plain strings.

    Args:
        file_path: Path of the FASTA file to open in text mode.

    Returns:
        list[str]: ``str(record.seq)`` for every record, in file order.
    """
    with open(file_path, "r") as handle:
        # Each parsed record carries its sequence on the `.seq` attribute
        return [str(entry.seq) for entry in SeqIO.parse(handle, "fasta")]

# Tunable settings for this cell
SEED = 42          # shared by the split, the oversampler and torch
MAX_LENGTH = 512   # token budget per sequence (explicit so truncation really applies)
BATCH_SIZE = 8     # mini-batch size; the full set does not fit in one forward pass
torch.manual_seed(SEED)

# Load and preprocess your data
moonlight_data = load_and_preprocess_data("moonlight.fasta")
non_moonlight_data = load_and_preprocess_data("nonMP.fasta")

# Create labels for your data (1 for moonlight, 0 for non-moonlight)
labels = [1] * len(moonlight_data) + [0] * len(non_moonlight_data)

# Combine the data
all_data = moonlight_data + non_moonlight_data

# Split FIRST. Oversampling before the split would put copies of the same
# minority sequence in both train and test and inflate the test metrics.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    all_data, labels, test_size=0.2, random_state=SEED, stratify=labels
)

# Oversample the minority class in the TRAINING portion only
oversampler = RandomOverSampler(sampling_strategy="minority", random_state=SEED)
X_resampled, y_resampled = oversampler.fit_resample(
    np.array(X_train_raw).reshape(-1, 1),  # the sampler expects a 2-D array
    np.array(y_train_raw),
)
X_train = X_resampled.flatten().tolist()
y_train = y_resampled.tolist()

# Load the pre-trained model and tokenizer
model_name = "Rostlab/prot_bert_bfd"
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# ProtBERT's vocabulary is one token per residue: the model card requires
# upper-case, space-separated residues with the rare U/Z/O/B mapped to X.
_RARE_TO_X = str.maketrans("UZOB", "XXXX")


def _to_protbert_text(sequence):
    """Format a raw residue string the way the ProtBERT tokenizer expects."""
    return " ".join(sequence.upper().translate(_RARE_TO_X))


# Tokenize and encode the protein sequences
X_train_encoded = tokenizer(
    [_to_protbert_text(seq) for seq in X_train],
    padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt",
)
X_test_encoded = tokenizer(
    [_to_protbert_text(seq) for seq in X_test],
    padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt",
)

# Convert labels to PyTorch tensors
y_train_tensor = torch.tensor(y_train)
y_test_tensor = torch.tensor(y_test)

# Mini-batch loaders: one forward pass over the whole set would exhaust memory
train_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(
        X_train_encoded["input_ids"], X_train_encoded["attention_mask"], y_train_tensor
    ),
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(
        X_test_encoded["input_ids"], X_test_encoded["attention_mask"]
    ),
    batch_size=BATCH_SIZE,
    shuffle=False,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fine-tune the pre-trained model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
epochs = 10

for epoch in range(epochs):
    # from_pretrained returns the model in eval mode; enable dropout for training
    model.train()
    for input_ids, attention_mask, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            labels=batch_labels.to(device),
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluate the model on the test set
model.eval()
predictions = []
with torch.no_grad():
    for input_ids, attention_mask in test_loader:
        logits = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
        ).logits
        # Predictions are collected as a plain Python list across batches
        predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_report(y_test, predictions))


ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py)

In [2]:
# Report which device torch would run on: "GPU" if CUDA is available, else "CPU"
print("GPU" if torch.cuda.is_available() else "CPU")

CPU


In [6]:
import pandas as pd
import random

# Seed for the row shuffle: without it every run writes a differently ordered
# CSV, so downstream splits and metrics are not reproducible.
SHUFFLE_SEED = 42

# Load the datasets
moonlight_data = pd.read_csv('Moonlight.csv')
nonmoonlight_data = pd.read_csv('NonMoonLight.csv')

# Add a column indicating moonlight (1) or non-moonlight (0)
moonlight_data['Label'] = 1
nonmoonlight_data['Label'] = 0

# Concatenate the datasets
combined_data = pd.concat([moonlight_data, nonmoonlight_data])

# Randomize the order of rows (seeded) and renumber the index from 0
combined_data = combined_data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Save the combined and randomized dataset to a new CSV file
combined_data.to_csv('combined_dataset.csv', index=False)


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

# Read the combined table and discard the 'Name' column
dataset = pd.read_csv('combined_dataset.csv').drop(columns='Name')

# Target is the 'Label' column; every remaining column is a feature
y = dataset['Label']
X = dataset.drop(columns='Label')

# Hold out 20% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train an XGBoost classifier with default hyper-parameters
model = XGBClassifier().fit(X_train_scaled, y_train)

# Predict the held-out rows and score them
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy}')


Test Accuracy: 0.704225352112676


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch

# Seed torch so the classifier-head initialisation and batch order are repeatable
torch.manual_seed(42)

# Load the combined dataset
dataset = pd.read_csv('combined_dataset.csv')

# Drop the 'Name' column
dataset = dataset.drop('Name', axis=1)

# Define the features and labels
X = dataset.drop('Label', axis=1)
y = dataset['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build one text string per row by joining every column value with spaces.
# NOTE(review): ProtBERT tokenizes space-separated amino-acid letters. If these
# columns are numeric descriptors rather than residues, most tokens will map to
# [UNK] - confirm what combined_dataset.csv actually contains.
X_train_text = X_train.astype(str).apply(' '.join, axis=1).values
X_test_text = X_test.astype(str).apply(' '.join, axis=1).values

# Load pre-trained ProtBERT tokenizer and model (binary classification head)
tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert_bfd', do_lower_case=False)
model = BertForSequenceClassification.from_pretrained('Rostlab/prot_bert_bfd', num_labels=2)

# Tokenize and encode the training data
X_train_tokens = tokenizer(list(X_train_text), padding=True, truncation=True, return_tensors='pt', max_length=512)
y_train_tensor = torch.tensor(y_train.values)

# Tokenize and encode the testing data
X_test_tokens = tokenizer(list(X_test_text), padding=True, truncation=True, return_tensors='pt', max_length=512)
y_test_tensor = torch.tensor(y_test.values)

# Create DataLoader for training and testing data
train_dataset = TensorDataset(X_train_tokens['input_ids'], X_train_tokens['attention_mask'], y_train_tensor)
test_dataset = TensorDataset(X_test_tokens['input_ids'], X_test_tokens['attention_mask'], y_test_tensor)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Set up training parameters.
# torch.optim.AdamW replaces the deprecated transformers.AdamW implementation.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        # Move the three tensors of the batch to the training device
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
all_preds = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)

# Calculate accuracy (the test loader is unshuffled, so order matches y_test)
accuracy = accuracy_score(y_test, all_preds)
print(f'Test Accuracy: {accuracy}')


Downloading vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_bert_bfd and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
