# DNABERT-2 모델 불러오기
### (모델 사용 환경 설정 포함)

In [1]:
import torch

# If True is printed, a GPU is available for use.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)

True


In [2]:
# Fetch the DNABERT_2 GitHub repository, then make the fresh clone the
# notebook's working directory so later relative paths resolve inside it.
!git clone https://github.com/MAGICS-LAB/DNABERT_2.git
%cd DNABERT_2

Cloning into 'DNABERT_2'...
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 98 (delta 18), reused 24 (delta 15), pack-reused 64 (from 1)[K
Receiving objects: 100% (98/98), 870.88 KiB | 29.03 MiB/s, done.
Resolving deltas: 100% (40/40), done.
/content/DNABERT_2


In [None]:
# Install every package needed to run DNABERT-2 (required step).
# requirements.txt is read from the current working directory.

!python3 -m pip install -r requirements.txt

In [None]:
# Check which transformers version is installed.
import transformers

installed_version = transformers.__version__
print(installed_version)

4.29.2


In [None]:
# Show the installed packages whose names contain "transformers" or "torch".
!pip list | grep transformers
!pip list | grep torch

# A conflict error was shown, but the key packages were installed correctly,
# so we moved on.

sentence-transformers              3.2.1
transformers                       4.29.2
torch                              2.5.1+cu121
torchaudio                         2.5.1+cu121
torchsummary                       1.5.1
torchvision                        0.20.1+cu121


In [None]:
# Load the DNABERT-2 model and tokenizer.
# This is the variant to run when the transformers version is above 4.28.
from transformers import AutoModel, AutoTokenizer
from transformers.models.bert.configuration_bert import BertConfig

# Hub identifier shared by the config, tokenizer and model loads below.
MODEL_ID = "zhihan1996/DNABERT-2-117M"

# Load the DNABERT-2 configuration.
config = BertConfig.from_pretrained(MODEL_ID)

# Load the tokenizer and the model; the model is built with the config above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_ID, config=config, trust_remote_code=True)

In [None]:
# Example: check that the loaded tokenizer and model actually run by
# computing an embedding for a single DNA sequence.

import torch

# Example DNA sequence
dna = "ACGTAGCATCGACTATCATCAGACACTGGTATTGACGTACGAGCATCTGTTAGC"

# Tokenize; keep only the input-id tensor.
inputs = tokenizer(dna, return_tensors="pt")["input_ids"]

# Run the model. This is inference only, so autograd is switched off to avoid
# building a computation graph that is never used; the values are unchanged.
with torch.no_grad():
    hidden_states = model(inputs)[0]  # [batch_size, sequence_length, hidden_size]

# Mean pooling over the token dimension of the first (only) sequence
embedding_mean = torch.mean(hidden_states[0], dim=0)
print("Mean Pooling Shape:", embedding_mean.shape)  # (768,)

# Max pooling over the token dimension; torch.max returns (values, indices),
# so [0] keeps the values.
embedding_max = torch.max(hidden_states[0], dim=0)[0]
print("Max Pooling Shape:", embedding_max.shape)  # (768,)

Mean Pooling Shape: torch.Size([768])
Max Pooling Shape: torch.Size([768])


# Fine-tuning

In [5]:
# Move into the repository's finetune directory; train.py is launched from
# here later in the notebook.
%cd finetune

/content/DNABERT_2/finetune


In [6]:
# Colab environment setup: mount Google Drive at /content/drive.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
!ls "./INPUT"  # check that train, dev and test all exist under the configured input data path

dev.csv  test.csv  train.csv


In [7]:
# Fine-tuning settings.
# Both path strings contain literal single quotes (presumably so the value
# stays quoted when substituted into a shell command — confirm).
# OUTPUT_DIR is where checkpoint models are saved.
DATA_PATH, OUTPUT_DIR = "'./INPUT'", "'./OUTPUT'"
LR = 3e-5  # learning rate
MAX_LENGTH = 102  # maximum DNA sequence length * 0.25

### 5 epoch 학습

In [None]:
# Training with DataParallel
# Launches train.py from the current directory as a shell command, fine-tuning
# zhihan1996/DNABERT-2-117M for 5 epochs with fp16 enabled.
# $DATA_PATH, $MAX_LENGTH, $LR and $OUTPUT_DIR are filled in by IPython from
# the notebook variables of the same names before the shell runs the command.
# Evaluation is step-based (evaluation_strategy=steps, eval_steps=250) and
# save_steps is also 250, so both use the same 250-step interval.
# NOTE(review): DATA_PATH holds "'./INPUT'", so --run_name presumably expands
# to DNABERT2_./INPUT once the shell strips the quotes — confirm that a run
# name containing a path is intended.
!DISABLE_TQDM=1 python train.py \
    --model_name_or_path zhihan1996/DNABERT-2-117M \
    --data_path $DATA_PATH \
    --kmer -1 \
    --run_name DNABERT2_$DATA_PATH \
    --model_max_length $MAX_LENGTH \
    --per_device_train_batch_size 64 \
    --per_device_eval_batch_size 128 \
    --gradient_accumulation_steps 2 \
    --learning_rate $LR \
    --num_train_epochs 5 \
    --fp16 \
    --save_steps 250 \
    --output_dir $OUTPUT_DIR \
    --evaluation_strategy steps \
    --eval_steps 250 \
    --warmup_steps 50 \
    --logging_steps 500 \
    --overwrite_output_dir True \
    --log_level warning \
    --find_unused_parameters False


### Fine-tuning 관련 자세한 사항은 DNABERT-2 깃허브(https://github.com/MAGICS-LAB/DNABERT_2)를 참조하세요.