In [1]:
import os
import json
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer
)
from torch.utils.data import Dataset
import multimolecule
# --- IMPORT YOUR OWN MODULE (project-local metrics.py) ---
from metrics import compute_metrics

# --- PATH CONFIGURATION (edit BASE_PATH here) ---
# Project root. Every folder below is derived from it, so this is the only
# path that has to be edited when the project is moved.
BASE_PATH = r"D:\Study\5-FA25\AiTa_Lab_Research\Code\Inference_Model\SpliceBERT"
DATA_FOLDER = os.path.join(BASE_PATH, "data")                    # input CSV files
RESULT_FOLDER = os.path.join(BASE_PATH, "results")               # JSON metric reports
OUTPUT_MODEL_DIR = os.path.join(BASE_PATH, "pretrained_model")   # fine-tuned models written by run_training

# Identifier of the base checkpoint that gets fine-tuned (edit to switch models).
BASE_MODEL_NAME = "multimolecule/splicebert-human.510"

# Training Config
BATCH_SIZE = 8            # per-device batch size for both training and evaluation
GRAD_ACCUMULATION = 2     # passed to TrainingArguments as gradient_accumulation_steps
EPOCHS = 3
LEARNING_RATE = 2e-5

# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üöÄ Training on: {DEVICE}")

# Helper class so that json.dump does not fail on NumPy values
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types.

    Pass it as ``cls=NpEncoder`` to ``json.dump`` / ``json.dumps`` so that
    metric dictionaries containing NumPy values can be serialized.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Handles NumPy booleans, integers, floats and arrays; any other type is
        delegated to the base encoder, which raises ``TypeError``.
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own branch;
        # without it json raises "Object of type bool_ is not JSON serializable".
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

üöÄ Training on: cuda


In [2]:
class SpliceTrainDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is any mapping whose values can be indexed by sample
    position (e.g. the object returned by a tokenizer call); ``labels`` is a
    sequence with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines the number of samples.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def prepare_dataset(file_path, tokenizer, test_size=0.2, random_state=42, max_length=512):
    """Read a CSV file, split it into train/validation sets and tokenize both.

    Args:
        file_path: Path of the CSV file. It must contain a ``sequence`` column
            (model inputs) and a ``Splicing_types`` column (labels).
        tokenizer: Callable tokenizer applied to the list of sequences.
        test_size: Fraction of rows held out for validation (default 0.2,
            i.e. an 80% / 20% split).
        random_state: Seed for the split, so repeated runs get the same split.
        max_length: Maximum tokenized length; longer inputs are truncated.

    Returns:
        ``(train_dataset, val_dataset)`` as two ``SpliceTrainDataset`` objects.

    Raises:
        KeyError: If a required column is missing from the CSV.
    """
    df = pd.read_csv(file_path)

    # Fail with a message that names the file and the columns, instead of the
    # bare KeyError pandas raises on the first missing column.
    missing = [col for col in ('sequence', 'Splicing_types') if col not in df.columns]
    if missing:
        raise KeyError(f"{file_path} is missing required column(s): {missing}")

    sequences = df['sequence'].tolist()
    labels = df['Splicing_types'].tolist()

    # Stratify on the labels so both splits keep the class proportions.
    train_seqs, val_seqs, train_labels, val_labels = train_test_split(
        sequences, labels, test_size=test_size, random_state=random_state, stratify=labels
    )

    train_encodings = tokenizer(train_seqs, truncation=True, padding=True, max_length=max_length)
    val_encodings = tokenizer(val_seqs, truncation=True, padding=True, max_length=max_length)

    train_dataset = SpliceTrainDataset(train_encodings, train_labels)
    val_dataset = SpliceTrainDataset(val_encodings, val_labels)
    return train_dataset, val_dataset

In [3]:
def compute_metrics_wrapper(eval_pred):
    """Adapt the Trainer's evaluation output (logits) to metrics.py (probabilities).

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` may itself be a
            tuple, in which case its first element is used.

    Returns:
        Whatever ``compute_metrics(labels, preds, probs)`` returns; per the
        original author's note this is a dict such as
        ``{'accuracy': ..., 'f1': ..., 'auc': ...}`` (defined in metrics.py,
        not verified here).
    """
    logits, labels = eval_pred

    # The model output can be a tuple; the logits are its first element.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Training enables fp16 when CUDA is available, so the logits may arrive
    # as float16. Up-cast them first: softmax in half precision on CPU loses
    # accuracy and is not supported by every PyTorch build.
    if isinstance(logits, np.ndarray) and logits.dtype == np.float16:
        logits = logits.astype(np.float32)

    # logits -> class probabilities (softmax over the last axis) -> predicted class
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    preds = np.argmax(probs, axis=-1)

    # Delegate the actual metric computation to the project's metrics.py.
    return compute_metrics(labels, preds, probs)

In [4]:
def run_training(data_filename, output_subfolder_name, num_labels=3):
    """Fine-tune BASE_MODEL_NAME on one CSV file, then save the model and its metrics.

    Steps: load tokenizer and model, build train/validation datasets, train
    with the Hugging Face ``Trainer``, save model + tokenizer under
    OUTPUT_MODEL_DIR, run a final evaluation and write it as JSON into
    RESULT_FOLDER.

    Args:
        data_filename: Name of the CSV file inside DATA_FOLDER.
        output_subfolder_name: Name used for the checkpoint subfolder, the
            saved-model subfolder and the JSON report file name.
        num_labels: Number of output classes of the classification head
            (default 3, the value that used to be hard-coded).

    Returns:
        The metrics dict of the final evaluation, or ``None`` when the data
        file is missing or when loading the model / preparing the data fails
        (those failures are printed, not raised).
    """
    print(f"\n{'='*20} TRAINING: {output_subfolder_name} {'='*20}")

    input_file = os.path.join(DATA_FOLDER, data_filename)
    save_model_path = os.path.join(OUTPUT_MODEL_DIR, output_subfolder_name)
    output_json_path = os.path.join(RESULT_FOLDER, f"train_result_{output_subfolder_name}.json")

    if not os.path.exists(input_file):
        print(f"‚ùå Kh√¥ng t√¨m th·∫•y data: {input_file}")
        return None

    # Create the report folder before any expensive work, so an unusable
    # RESULT_FOLDER is detected now rather than after training has finished.
    os.makedirs(RESULT_FOLDER, exist_ok=True)

    # 1. Load Tokenizer & Model
    print(f"‚è≥ Loading {BASE_MODEL_NAME}...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

        # NOTE(review): trust_remote_code=True permits running model code
        # fetched with the checkpoint - confirm it is actually required.
        # NOTE(review): the notebook's run log warns that `problem_type` is not
        # set and asks for it to be set explicitly - confirm the accepted
        # values in the multimolecule documentation and pass it here.
        model = AutoModelForSequenceClassification.from_pretrained(
            BASE_MODEL_NAME,
            num_labels=num_labels,
            trust_remote_code=True
        )
        model.to(DEVICE)
    except Exception as e:
        print(f"‚ùå L·ªói t·∫£i model: {e}")
        return None

    # 2. Prepare Data
    print("‚è≥ ƒêang chu·∫©n b·ªã d·ªØ li·ªáu...")
    try:
        train_dataset, val_dataset = prepare_dataset(input_file, tokenizer)
    except Exception as e:
        print(f"‚ùå L·ªói x·ª≠ l√Ω d·ªØ li·ªáu: {e}")
        return None

    # 3. Setup Trainer
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument (this was the fix the original author applied here).
    # NOTE(review): warmup_steps is a fixed count; if a run has fewer total
    # optimizer steps than this, the learning rate never reaches
    # LEARNING_RATE - verify against the dataset sizes.
    training_args = TrainingArguments(
        output_dir=f"./checkpoints/{output_subfolder_name}",
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUMULATION,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        learning_rate=LEARNING_RATE,
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=0
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_wrapper,
    )

    # 4. Start Training
    print("üöÄ B·∫Øt ƒë·∫ßu Train...")
    trainer.train()

    # 5. Save the final model together with its tokenizer
    print(f"üíæ ƒêang l∆∞u model v√†o: {save_model_path}")
    model.save_pretrained(save_model_path)
    tokenizer.save_pretrained(save_model_path)

    # 6. Final evaluation on the validation set, written out as JSON
    print("üìä ƒêang t√≠nh to√°n metrics cu·ªëi c√πng...")
    final_metrics = trainer.evaluate()

    final_output = {
        "meta_data": {
            "task": "training_validation",
            "model_name": output_subfolder_name,
            "source_data": data_filename,
            "base_model": BASE_MODEL_NAME
        },
        "metrics": final_metrics
    }

    # NpEncoder converts NumPy values in the metrics to plain JSON types.
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(final_output, f, cls=NpEncoder, indent=4)

    print(f"‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ metrics v√†o: {output_json_path}")
    return final_metrics

In [5]:
if __name__ == "__main__":
    # Each entry pairs an input CSV (looked up in DATA_FOLDER) with the name
    # under which the resulting model, checkpoints and JSON report are stored.
    # Format: ("data_file_name.csv", "model_folder_name")
    # NOTE(review): the files are named test_*.csv and the banner below talks
    # about testing, yet run_training fine-tunes on them - confirm that these
    # are the intended training inputs.
    tasks = [
        ("test_1_1_1.csv", "SpliceBERT_ratio_1_1_1"),
        ("test_2_1_1.csv", "SpliceBERT_ratio_2_1_1"),
        ("test_4_1_1.csv", "SpliceBERT_ratio_4_1_1"),
        ("test_10_1_1.csv", "SpliceBERT_ratio_10_1_1"),
    ]

    print("üöÄ B·∫ÆT ƒê·∫¶U QU√Å TR√åNH KI·ªÇM TH·ª¨...")

    # Run the jobs one after another, in list order.
    for task in tasks:
        data_file, model_name = task
        run_training(data_file, model_name)

    print("\nüèÅ HO√ÄN T·∫§T TO√ÄN B·ªò!")

üöÄ B·∫ÆT ƒê·∫¶U QU√Å TR√åNH KI·ªÇM TH·ª¨...

‚è≥ Loading multimolecule/splicebert-human.510...


Loading weights:   0%|          | 0/101 [00:00<?, ?it/s]

[1mSpliceBertForSequencePrediction LOAD REPORT[0m from: multimolecule/splicebert-human.510
Key                                 | Status     | 
------------------------------------+------------+-
lm_head.bias                        | UNEXPECTED | 
lm_head.transform.layer_norm.weight | UNEXPECTED | 
lm_head.transform.dense.weight      | UNEXPECTED | 
lm_head.transform.layer_norm.bias   | UNEXPECTED | 
lm_head.decoder.weight              | UNEXPECTED | 
lm_head.transform.dense.bias        | UNEXPECTED | 
model.pooler.dense.bias             | MISSING    | 
sequence_head.decoder.weight        | MISSING    | 
sequence_head.decoder.bias          | MISSING    | 
model.pooler.dense.weight           | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


‚è≥ ƒêang chu·∫©n b·ªã d·ªØ li·ªáu...


`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


üöÄ B·∫Øt ƒë·∫ßu Train...


This can lead to unexpected behavior. Please set `problem_type` explicitly.
  warn(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 