# Fine-tuning된 DNABERT 모델의 feature vector

## 기본 환경 로드

In [None]:
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import numpy as np
import torch

In [None]:
# Colab environment setup: mount Google Drive at /content/drive.
# The google.colab package is only importable inside a Colab runtime, so the
# import is guarded; elsewhere the mount is skipped instead of aborting the
# notebook with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Build the model configuration explicitly from the BERT config class
from transformers.models.bert.configuration_bert import BertConfig

# Hub identifier of the DNABERT-2 checkpoint whose configuration is loaded
DNABERT2_MODEL_ID = "zhihan1996/DNABERT-2-117M"
config = BertConfig.from_pretrained(DNABERT2_MODEL_ID)

## 가장 우수한 성능을 가진 DNABERT 로드

In [None]:
# Root directory shared by all inputs and outputs (user-specified path)
base_path = './MODEL'

# Directory holding the best-performing fine-tuned checkpoint
model_path = f"{base_path}/OUTPUT/checkpoint"

# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the model (with the explicit config) and move it to the chosen device,
# then load the matching tokenizer from the same checkpoint directory
model = AutoModel.from_pretrained(model_path, config=config, trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

In [None]:
# Per-split paths: the input CSV plus the .npy files where the extracted
# feature vectors and labels are written. The train, dev and test entries
# follow the same naming pattern, so they are generated from the split name.
datasets = {
    split: {
        "input_csv": f"{base_path}/INPUT/{split}.csv",
        "output_features": f"{base_path}/FeatureVector/{split}_features.npy",
        "output_labels": f"{base_path}/FeatureVector/{split}_labels.npy",
    }
    for split in ("train", "dev", "test")
}

## Feature vector 추출

In [None]:
def extract_and_save_features(input_csv, output_features, output_labels, batch_size=16):
    """
    Extract one pooled feature vector per sequence and save features and labels.

    Every entry of the CSV's "sequence" column is tokenized and passed through
    the module-level ``model``. The per-token hidden states are collapsed into a
    single vector by a softmax-weighted average, where each token's score is the
    mean of its own hidden-state values. (The weighting is derived from the
    hidden states; it does not read the model's internal attention maps.)
    The original motivation was to emphasise the mutation region of a sequence.

    Padding positions are excluded from the softmax, so the feature vector of a
    sequence does not change with the amount of padding its batch required.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        input_csv (str): Path of the input CSV; must have "sequence" and
            "label" columns.
        output_features (str): Path where the stacked feature vectors are saved
            with ``np.save``.
        output_labels (str): Path where the labels are saved with ``np.save``.
        batch_size (int): Number of sequences processed per forward pass.

    Raises:
        ValueError: If ``batch_size`` is not positive or the CSV has no rows.
    """
    import os

    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Load the CSV file
    data = pd.read_csv(input_csv)
    sequences = data["sequence"].tolist()
    labels = data["label"].tolist()

    if not sequences:
        # np.vstack would fail later with a less helpful message.
        raise ValueError(f"No sequences found in {input_csv}")

    # Create the output directories up front so a missing folder does not
    # discard the whole extraction when np.save is finally reached.
    for target in (output_features, output_labels):
        if isinstance(target, (str, os.PathLike)):
            parent_dir = os.path.dirname(os.fspath(target))
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

    # Feature vector extraction (no gradients are needed for inference)
    feature_vectors = []
    with torch.no_grad():
        for start in range(0, len(sequences), batch_size):
            batch_sequences = sequences[start:start + batch_size]

            # Tokenize the batch (padded to its longest sequence) and move it to the device
            inputs = tokenizer(batch_sequences, padding=True, truncation=True, return_tensors="pt").to(device)

            # Forward pass
            outputs = model(**inputs)

            # First model output: per-token hidden states,
            # expected shape (batch_size, seq_length, hidden_size)
            hidden_states = outputs[0]

            # One scalar score per token: the mean over the hidden dimension
            token_scores = hidden_states.mean(dim=-1)  # (batch_size, seq_length)

            # Give padding positions a score of -inf so they receive zero
            # weight after the softmax; otherwise padded tokens would dilute
            # the weights and make the result depend on batch composition.
            # Assumes the tokenizer returns the usual 1/0 "attention_mask".
            attention_mask = inputs.get("attention_mask")
            if attention_mask is not None:
                token_scores = token_scores.masked_fill(attention_mask == 0, float("-inf"))

            # Normalize the scores into weights that sum to 1 per sequence
            token_weights = torch.nn.functional.softmax(token_scores, dim=-1)

            # Weighted average of the hidden states over the sequence axis
            weighted_features = (hidden_states * token_weights.unsqueeze(-1)).sum(dim=1).cpu().numpy()
            feature_vectors.append(weighted_features)

    # Stack the per-batch results into (num_samples, hidden_size) and save
    feature_vectors = np.vstack(feature_vectors)
    np.save(output_features, feature_vectors)
    np.save(output_labels, np.array(labels))

    print(f"Feature Vectors saved to {output_features}")
    print(f"Labels saved to {output_labels}")

# Extract and save the feature vectors for each of the train, dev and test splits
for dataset_name in datasets:
    print(f"Processing {dataset_name} dataset...")
    split_paths = datasets[dataset_name]
    extract_and_save_features(
        split_paths["input_csv"],
        split_paths["output_features"],
        split_paths["output_labels"],
        batch_size=64,  # same batch size as was used when training DNABERT
    )
