In [None]:
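# get_epop_test_predictions: load the saved EPOP sequence-classification model,
# run it on the test split, and pickle the test labels together with the
# positive-class probabilities for later curve plotting.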

import pandas as pd
import numpy as np
import pickle
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from tqdm.auto import tqdm
import shutil
import os

from google.colab import drive
# Mount Google Drive; every path below lives under the mounted tree.
drive.mount('/content/drive')

# Project directory layout on Drive.
PROJECT_ROOT = '/content/drive/MyDrive/protein_classification'
DATA_DIR = f'{PROJECT_ROOT}/data'
MODELS_DIR = f'{PROJECT_ROOT}/models'
RESULTS_DIR = f'{PROJECT_ROOT}/results'

# Create any directory that does not exist yet (no error if it already does).
for dir_path in [PROJECT_ROOT, DATA_DIR, MODELS_DIR, RESULTS_DIR]:
    os.makedirs(dir_path, exist_ok=True)

# Make the project root the working directory and report it.
os.chdir(PROJECT_ROOT)
print(f"Working directory: {os.getcwd()}")

# Load the pickled data splits and pull out the test sequences and labels.
# The splits file is presumably produced by 01_data_preparation.py (that is
# the script the error message below points the user to).
splits_path = f'{RESULTS_DIR}/data_splits.pkl'
print(f"Loading data splits from: {splits_path}")

try:
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load a splits file that this project wrote itself.
    with open(splits_path, 'rb') as f:
        data_splits = pickle.load(f)

    test_seq = data_splits['test_seq']
    test_labels = data_splits['test_labels']
    print(f"Test set loaded with {len(test_seq)} samples.")

except FileNotFoundError as missing:
    print(f"Error: data_splits.pkl not found at {splits_path}.")
    # Chain explicitly so the traceback shows the missing file as the direct
    # cause of this higher-level "run the preparation script first" error.
    raise Exception("请先运行01_data_preparation.py") from missing
except Exception as e:
    # Anything else (corrupt pickle, missing key, ...) is reported and
    # re-raised unchanged.
    print(f"Error loading data splits: {e}")
    raise

# Load the saved tokenizer and sequence-classification model, move the model
# to GPU when one is available, and switch it to inference mode.
model_path = f"{MODELS_DIR}/esm2_ecm_model_enhanced"
print(f"Loading EPOP model from: {model_path}")

try:
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    # Disables autograd for the whole process from here on, not just for
    # this model; the script only runs inference afterwards.
    torch.set_grad_enabled(False)

    print(f"Model loaded successfully on {device}.")
    print(f"Model has {sum(p.numel() for p in model.parameters())} parameters")

except Exception as e:
    print(f"Error loading EPOP model: {e}")
    # Chain explicitly so the underlying failure (missing files, CUDA error,
    # ...) stays attached as the direct cause of the generic hint below.
    raise Exception("make sure 02_esm2_training.py is ready") from e


class ProteinDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one sequence per item.

    Args:
        sequences: Sized, indexable collection of sequences. Each item is
            converted with ``str()`` before being tokenized.
        labels: Indexable collection of integer class labels, aligned with
            ``sequences`` by position.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping containing ``input_ids`` and ``attention_mask``
            tensors.
        max_length: Length passed to the tokenizer for truncation/padding.
    """

    def __init__(self, sequences, labels, tokenizer, max_length=1024):
        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sequences."""
        return len(self.sequences)

    @staticmethod
    def _item_at(values, position):
        """Return the element of *values* at integer *position*.

        Plain ``values[position]`` is a label lookup on a pandas Series, so a
        Series that kept its original (non 0..n-1) index after a split would
        raise ``KeyError`` or return the wrong row. When the container offers
        ``.iloc`` it is used to force positional access; lists, tuples and
        arrays are indexed directly.
        """
        return getattr(values, 'iloc', values)[position]

    def __getitem__(self, idx):
        """Tokenize the sequence at position ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors (the
            tokenizer output flattened) and a scalar ``labels`` tensor of
            dtype ``torch.long``.
        """
        sequence = str(self._item_at(self.sequences, idx))
        label = self._item_at(self.labels, idx)

        encoding = self.tokenizer(
            sequence,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # The tokenizer output is flattened so each item is 1-D.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


# Wrap the test split in a ProteinDataset so each item is tokenized on access.
test_dataset = ProteinDataset(
    sequences=test_seq,
    labels=test_labels,
    tokenizer=tokenizer,
    max_length=1024,
)
print(f"Test dataset prepared with {len(test_dataset)} samples.")




# Run the model over the test dataset with a Hugging Face Trainer, convert
# the logits to positive-class probabilities, and pickle them with the labels.
print("\nRunning predictions on the test set...")


# Trainer needs an output directory even for prediction only; it is treated
# as temporary and removed in the `finally` clause below.
prediction_args = TrainingArguments(
    output_dir=f'{RESULTS_DIR}/temp_prediction_output',
    per_device_eval_batch_size=32,
    dataloader_num_workers=2,
    fp16=torch.cuda.is_available(),  # fp16 flag is on only when CUDA is present
    report_to="none",
    logging_steps=50,
)


try:
    prediction_trainer = Trainer(
        model=model,
        args=prediction_args,
    )

    print("Starting prediction...")
    predictions_output = prediction_trainer.predict(test_dataset)

    test_logits = predictions_output.predictions
    test_labels_np = predictions_output.label_ids

    print(f"Predictions completed. Shape: {test_logits.shape}")

    # Softmax over the last axis; column 1 is taken as the positive class.
    test_probs_positive_class = torch.softmax(torch.from_numpy(test_logits), dim=-1)[:, 1].cpu().numpy()

    # Plain Python lists keep the pickle independent of numpy/torch versions.
    predictions_data = {
        'test_labels': test_labels_np.tolist(),
        'test_probs': test_probs_positive_class.tolist()
    }

    predictions_path = f'{RESULTS_DIR}/epop_test_predictions_for_curves.pkl'
    with open(predictions_path, 'wb') as f:
        pickle.dump(predictions_data, f)

    print(f"\n✓ Successfully saved EPOP test predictions to: {predictions_path}")

finally:
    # Remove the temporary Trainer directory even when prediction or saving
    # fails, so a crashed run does not leave it behind on Drive.
    if os.path.exists(prediction_args.output_dir):
        shutil.rmtree(prediction_args.output_dir)
        print("✓ Cleaned up temporary directory")


Mounted at /content/drive
Working directory: /content/drive/MyDrive/protein_classification
Loading data splits from: /content/drive/MyDrive/protein_classification/results/data_splits.pkl
Test set loaded with 16000 samples.
Loading EPOP model from: /content/drive/MyDrive/protein_classification/models/esm2_ecm_model_enhanced
Model loaded successfully on cuda.
Model has 651043223 parameters
Test dataset prepared with 16000 samples.

Running predictions on the test set...
Starting prediction...


Predictions completed. Shape: (16000, 2)

✓ Successfully saved EPOP test predictions to: /content/drive/MyDrive/protein_classification/results/epop_test_predictions_for_curves.pkl
✓ Cleaned up temporary directory
