In [1]:
import os
import json
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer
)
from torch.utils.data import Dataset
import multimolecule
# --- IMPORT YOUR OWN MODULE ---
from metrics import compute_metrics

# --- PATH CONFIGURATION (edit these for your machine) ---
# NOTE(review): BASE_PATH is not referenced by any of the visible cells; the three
# folders below repeat the same prefix literally instead of deriving from it.
BASE_PATH = r"D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceBERT"
DATA_FOLDER = r"D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceBERT\data"      # Folder containing the input data
RESULT_FOLDER = r"D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceBERT\result"                      # Folder where the JSON results are saved
OUTPUT_MODEL_DIR = r"D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceBERT\pretrained_model"             # Folder for the trained models (the original note says ".h5"; run_training writes here with save_pretrained)

# Edit this line to change the base checkpoint:
BASE_MODEL_NAME = "multimolecule/splicebert-human.510"

# Training Config
BATCH_SIZE = 16       
GRAD_ACCUMULATION = 1 
EPOCHS = 3           
LEARNING_RATE = 2e-5

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üöÄ Training on: {DEVICE}")

# Helper class so that json can serialize NumPy values without raising
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types."""

    # (NumPy type, converter) pairs, checked in order.
    _CONVERTERS = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, np.ndarray.tolist),
    )

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        NumPy integers become ``int``, NumPy floats become ``float`` and
        arrays become (nested) lists. Any other object is handed to the base
        encoder, which raises ``TypeError``.
        """
        for numpy_type, convert in self._CONVERTERS:
            if isinstance(obj, numpy_type):
                return convert(obj)
        return super().default(obj)

üöÄ Training on: cuda


In [2]:
class SpliceTrainDataset(Dataset):
    """Map-style dataset pairing pre-tokenized encodings with their labels.

    ``encodings`` must expose ``.items()`` yielding ``(name, values)`` pairs
    where ``values`` is indexable per sample; ``labels`` must be indexable and
    support ``len()``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, including a ``'labels'`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def prepare_dataset(file_path, tokenizer):
    """Read a CSV file, split it into train/validation parts and tokenize both.

    The CSV must provide a ``sequence`` column and a ``Splicing_types`` column.
    The split is 80% train / 20% validation, stratified on the labels, with a
    fixed ``random_state`` of 42.

    Args:
        file_path: path of the CSV file to read.
        tokenizer: callable applied to a list of sequences with
            ``truncation=True, padding=True, max_length=512``.

    Returns:
        A ``(train_dataset, val_dataset)`` pair of ``SpliceTrainDataset``.
    """
    frame = pd.read_csv(file_path)
    labels = frame['Splicing_types'].tolist()
    sequences = frame['sequence'].tolist()

    # 80% train - 20% validation
    train_seqs, val_seqs, train_labels, val_labels = train_test_split(
        sequences,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )

    # Both parts are tokenized with the same settings.
    tokenizer_options = dict(truncation=True, padding=True, max_length=512)
    train_dataset = SpliceTrainDataset(tokenizer(train_seqs, **tokenizer_options), train_labels)
    val_dataset = SpliceTrainDataset(tokenizer(val_seqs, **tokenizer_options), val_labels)
    return train_dataset, val_dataset

In [3]:
def compute_metrics_wrapper(eval_pred):
    """Adapt the Trainer's evaluation output (logits) to ``compute_metrics``.

    Args:
        eval_pred: unpacks into ``(logits, labels)``; ``logits`` may itself be
            a tuple, in which case its first element is used.

    Returns:
        Whatever ``compute_metrics(labels, preds, probs)`` from metrics.py
        returns (the original author's note describes a dict with keys such
        as 'accuracy', 'f1' and 'auc' -- not verifiable from this file).
    """
    raw_logits, labels = eval_pred

    # When several outputs are returned as a tuple, the logits are the first one.
    logits = raw_logits[0] if isinstance(raw_logits, tuple) else raw_logits

    # Logits -> class probabilities (softmax over the last axis), back to NumPy.
    probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
    preds = probs.argmax(axis=-1)

    return compute_metrics(labels, preds, probs)

In [4]:
def run_training(data_filename, output_subfolder_name):
    """Fine-tune BASE_MODEL_NAME on one CSV file and save the model and metrics.

    Steps: load the tokenizer and a 3-label sequence-classification model,
    build the train/validation datasets with ``prepare_dataset``, train with
    the Hugging Face ``Trainer``, save model + tokenizer, run a final
    evaluation and write it to a JSON file in RESULT_FOLDER.

    Args:
        data_filename: name of the CSV file inside DATA_FOLDER.
        output_subfolder_name: name used for the model sub-folder under
            OUTPUT_MODEL_DIR, the checkpoint sub-folder under ./checkpoints
            and the result file ``train_result_<name>.json``.

    Returns:
        None. The function prints a message and returns early when the data
        file does not exist, or when loading the model or preparing the data
        raises an exception.
    """
    print(f"\n{'='*20} TRAINING: {output_subfolder_name} {'='*20}")

    input_file = os.path.join(DATA_FOLDER, data_filename)
    save_model_path = os.path.join(OUTPUT_MODEL_DIR, output_subfolder_name)

    if not os.path.exists(input_file):
        print(f"‚ùå Kh√¥ng t√¨m th·∫•y data: {input_file}")
        return

    # 1. Load tokenizer & model
    print(f"‚è≥ Loading {BASE_MODEL_NAME}...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

        model = AutoModelForSequenceClassification.from_pretrained(
            BASE_MODEL_NAME,
            num_labels=3,
            trust_remote_code=True
        )
        model.to(DEVICE)
    except Exception as e:
        print(f"‚ùå L·ªói t·∫£i model: {e}")
        return

    # 2. Prepare data
    print("‚è≥ ƒêang chu·∫©n b·ªã d·ªØ li·ªáu...")
    try:
        train_dataset, val_dataset = prepare_dataset(input_file, tokenizer)
    except Exception as e:
        print(f"‚ùå L·ªói x·ª≠ l√Ω d·ªØ li·ªáu: {e}")
        return

    # 3. Set up the Trainer
    # bf16/tf32 are only requested when the hardware supports them.
    # TrainingArguments raises a ValueError for tf32=True without an
    # Ampere-or-newer CUDA GPU, which would defeat the CPU fallback of DEVICE.
    cuda_available = torch.cuda.is_available()
    use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
    use_tf32 = cuda_available and torch.cuda.get_device_capability()[0] >= 8

    training_args = TrainingArguments(
        output_dir=f"./checkpoints/{output_subfolder_name}",
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUMULATION,
        fp16=False,
        bf16=use_bf16,
        tf32=use_tf32,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=50,
        eval_strategy="epoch",        # newer transformers versions use eval_strategy (formerly evaluation_strategy)
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        learning_rate=LEARNING_RATE,
        dataloader_num_workers=0
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_wrapper,
    )

    # 4. Start training
    print("üöÄ B·∫Øt ƒë·∫ßu Train...")
    trainer.train()

    # 5. Save the final model and tokenizer
    print(f"üíæ ƒêang l∆∞u model v√†o: {save_model_path}")
    model.save_pretrained(save_model_path)
    tokenizer.save_pretrained(save_model_path)

    # 6. Evaluate on the validation split and write the JSON report
    print("üìä ƒêang t√≠nh to√°n metrics cu·ªëi c√πng...")
    final_metrics = trainer.evaluate()

    output_json_path = os.path.join(RESULT_FOLDER, f"train_result_{output_subfolder_name}.json")

    final_output = {
        "meta_data": {
            "task": "training_validation",
            "model_name": output_subfolder_name,
            "source_data": data_filename,
            "base_model": BASE_MODEL_NAME
        },
        "metrics": final_metrics
    }

    os.makedirs(RESULT_FOLDER, exist_ok=True)
    with open(output_json_path, 'w') as f:
        # NpEncoder converts any NumPy scalars/arrays in the metrics.
        json.dump(final_output, f, cls=NpEncoder, indent=4)

    print(f"‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ metrics v√†o: {output_json_path}")

In [5]:
if __name__ == "__main__":
    # Each entry pairs a data file (looked up in DATA_FOLDER) with the name of
    # the model folder created for it under the pretrained_model directory.
    # NOTE(review): every file listed is named "test_*" and the banner printed
    # below announces a testing run, yet run_training fine-tunes on these
    # files -- confirm that these are the intended training sets.
    tasks = [
        # ("data_file_name.csv", "model_folder_name")
        ("test_1_1_1.csv", "SpliceBERT_ratio_1_1_1"),
        ("test_2_1_1.csv", "SpliceBERT_ratio_2_1_1"),
        ("test_4_1_1.csv", "SpliceBERT_ratio_4_1_1"),
        ("test_10_1_1.csv", "SpliceBERT_ratio_10_1_1"),
        ("test_data.csv", "SpliceBERT_ratio_100_1_1"),
    ]

    print("üöÄ B·∫ÆT ƒê·∫¶U QU√Å TR√åNH KI·ªÇM TH·ª¨...")

    # One independent training run per (data file, model name) pair.
    for data_file, model_name in tasks:
        run_training(data_filename=data_file, output_subfolder_name=model_name)

    print("\nüèÅ HO√ÄN T·∫§T TO√ÄN B·ªò!")

üöÄ B·∫ÆT ƒê·∫¶U QU√Å TR√åNH KI·ªÇM TH·ª¨...

‚è≥ Loading multimolecule/splicebert-human.510...


Loading weights:   0%|          | 0/101 [00:00<?, ?it/s]

[1mSpliceBertForSequencePrediction LOAD REPORT[0m from: multimolecule/splicebert-human.510
Key                                 | Status     | 
------------------------------------+------------+-
lm_head.transform.dense.weight      | UNEXPECTED | 
lm_head.transform.layer_norm.weight | UNEXPECTED | 
lm_head.decoder.weight              | UNEXPECTED | 
lm_head.transform.dense.bias        | UNEXPECTED | 
lm_head.transform.layer_norm.bias   | UNEXPECTED | 
lm_head.bias                        | UNEXPECTED | 
sequence_head.decoder.weight        | MISSING    | 
model.pooler.dense.bias             | MISSING    | 
model.pooler.dense.weight           | MISSING    | 
sequence_head.decoder.bias          | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


‚è≥ ƒêang chu·∫©n b·ªã d·ªØ li·ªáu...


`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


üöÄ B·∫Øt ƒë·∫ßu Train...


This can lead to unexpected behavior. Please set `problem_type` explicitly.
  warn(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [1]:
import os
import json
import time
import pandas as pd
import torch
import numpy as np
import multimolecule  # Import ƒë·ªÉ nh·∫≠n di·ªán SpliceBERT
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.notebook import tqdm
from sklearn.metrics import confusion_matrix

# --- Import your metrics file ---
# Make sure metrics.py is in the same directory as this notebook
from metrics import compute_metrics

# ==============================================================================
# 1. CONFIGURATION (this is the only section you need to edit)
# ==============================================================================

# Folder containing the preprocessed CSV files (already cropped to 510bp)
DATA_FOLDER = r"D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceBERT\data"

# Folder where the JSON results are saved
RESULT_FOLDER = r"D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceBERT\result"

# Model to evaluate
# - Model from the hub: give its name (e.g. "multimolecule/splicebert-human.510")
# - Local model: give the folder path (e.g. r"D:\Study\...\pretrained_model\SpliceBERT_ratio_1_1_1")
# NOTE(review): the load report printed for this base checkpoint lists
# sequence_head.* and model.pooler.* as MISSING (newly initialized), so the
# classification head is untrained when this value is used -- point MODEL_PATH
# at a fine-tuned folder to evaluate a trained classifier.
MODEL_PATH = "multimolecule/splicebert-human.510"

# Only recorded in the result JSON by run_inference ("bias_applied").
BIAS_ADJUSTMENT = None

# Hardware settings (tuned for an RTX 5080)
BATCH_SIZE = 256        # Inference is light; a 5080 can handle 256-512 samples per batch
USE_BF16 = True         # Use BFloat16 autocast when the GPU supports it
COMPILE_MODEL = False    # Set to True to run torch.compile on the model (PyTorch 2.0+)

# ==============================================================================

# Device setup: GPU when CUDA is available, otherwise CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if DEVICE.type == "cuda":
    print(f"üöÄ Hardware: {torch.cuda.get_device_name(0)} | CUDA: {torch.version.cuda}")
else:
    # torch.cuda.get_device_name raises when no CUDA device exists, so the
    # GPU name is only queried on the CUDA path.
    print("üöÄ Hardware: CPU | CUDA: not available")

# Dataset class (simplified for inference)
class SpliceInferenceDataset(Dataset):
    """Map-style dataset that reads a CSV file and tokenizes one row per item.

    The CSV must provide a ``sequence`` column and a ``Splicing_types`` column.
    """

    def __init__(self, csv_file, tokenizer, max_len=512):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.df = pd.read_csv(csv_file)

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return flattened ids, mask and the label."""
        # The data is assumed to be already cropped to 510bp (original author's note).
        row = self.df.iloc[idx]
        sequence = str(row['sequence'])
        target = int(row['Splicing_types'])

        tokens = self.tokenizer(
            sequence,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch axis; flatten() removes it.
        sample = {name: tokens[name].flatten() for name in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Helper class for writing JSON that contains NumPy values
class NpEncoder(json.JSONEncoder):
    """JSON encoder that accepts NumPy integers, floats and arrays."""

    def default(self, obj):
        """Convert NumPy values to built-in types; defer everything else.

        Objects that are not NumPy integers, floats or arrays go to the base
        encoder, which raises ``TypeError``.
        """
        if isinstance(obj, np.ndarray):
            converted = obj.tolist()
        elif isinstance(obj, np.floating):
            converted = float(obj)
        elif isinstance(obj, np.integer):
            converted = int(obj)
        else:
            converted = super().default(obj)
        return converted

def run_inference(data_filename):
    """Evaluate MODEL_PATH on one CSV file and write the metrics to a JSON file.

    Steps: load the tokenizer and a 3-label sequence-classification model,
    run batched inference over the file, compute metrics (``compute_metrics``
    from metrics.py) and a confusion matrix, then save everything to
    RESULT_FOLDER as ``result_<file>_on_<model>.json``.

    Labels and predictions are passed to the metric functions as plain
    integer class ids, and probabilities as a float32 array with one row per
    sample.

    Args:
        data_filename: name of the CSV file inside DATA_FOLDER.

    Returns:
        None. The function prints a message and returns early when the file
        is missing, when the model cannot be loaded, or when computing the
        metrics raises an exception.
    """
    print(f"\n{'='*20} PROCESSING: {data_filename} {'='*20}")

    file_path = os.path.join(DATA_FOLDER, data_filename)
    if not os.path.exists(file_path):
        print(f"‚ùå Kh√¥ng t√¨m th·∫•y file: {file_path}")
        return

    # 1. Load model & tokenizer
    print(f"‚è≥ Loading Model: {MODEL_PATH}")
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_PATH,
            num_labels=3,
            trust_remote_code=True
        )
        model.to(DEVICE)
        model.eval()  # evaluation mode (no training behaviour such as dropout)

        # Optional model optimization
        if COMPILE_MODEL:
            print("üöÄ Compiling model for RTX 5080...")
            try:
                model = torch.compile(model)
            except Exception as e:
                # Best effort: fall back to the uncompiled model.
                print(f"‚ö†Ô∏è Compile th·∫•t b·∫°i (v·∫´n ch·∫°y b√¨nh th∆∞·ªùng): {e}")

    except Exception as e:
        print(f"‚ùå L·ªói load model: {e}")
        return

    # 2. Prepare data
    dataset = SpliceInferenceDataset(file_path, tokenizer)
    # num_workers=0 to avoid multiprocessing problems on Windows
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

    # 3. Inference loop
    all_preds = []       # predicted class ids (Python ints)
    all_labels = []      # true class ids (Python ints)
    prob_batches = []    # one float32 probability array per batch

    print(f"üöÄ ƒêang ch·∫°y d·ª± ƒëo√°n tr√™n {len(dataset)} m·∫´u...")
    start_time = time.time()

    # BF16 autocast only on a CUDA device that supports it. The CUDA query is
    # skipped entirely on CPU so that it cannot fail there.
    use_bf16 = USE_BF16 and DEVICE.type == "cuda" and torch.cuda.is_bf16_supported()

    with torch.no_grad():  # no gradients needed for inference
        for batch in tqdm(loader):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)

            with torch.autocast(device_type=DEVICE.type, dtype=torch.bfloat16, enabled=use_bf16):
                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits

            # The logits leave the autocast region in reduced precision;
            # upcast so the probabilities are computed in float32.
            probs = torch.softmax(logits.float(), dim=1)
            preds = torch.argmax(probs, dim=1)

            all_preds.extend(preds.cpu().tolist())
            # Labels are only needed on the host, so they are not moved to the device.
            all_labels.extend(batch['labels'].tolist())
            prob_batches.append(probs.cpu().numpy())

    inference_time = time.time() - start_time
    # Guard against a zero elapsed time (e.g. an empty file on a coarse timer).
    throughput = len(dataset) / inference_time if inference_time > 0 else 0.0
    print(f"‚è±Ô∏è Th·ªùi gian: {inference_time:.2f}s ({throughput:.0f} seq/s)")

    # 4. Metrics & confusion matrix
    print("üìä ƒêang t√≠nh metrics...")
    try:
        all_probs = np.concatenate(prob_batches, axis=0)

        # Basic metrics come from the project's metrics.py
        metric_results = compute_metrics(all_labels, all_preds, probs=all_probs)

        cm = confusion_matrix(all_labels, all_preds)

        # Ratio tag derived from the file name (e.g. test_1_1_1.csv -> 1-1-1):
        # drop ".csv", drop the "test_" prefix, replace "_" with "-".
        ratio_str = data_filename.replace(".csv", "").replace("test_", "").replace("_", "-")

        # Flat JSON layout: metrics at the top level, plus extras
        final_output = metric_results.copy()

        final_output["confusion_matrix"] = cm.tolist()
        # NOTE(review): BIAS_ADJUSTMENT is only recorded here; nothing in this
        # function applies it to the logits -- confirm the intended behaviour.
        final_output["meta"] = {
            "bias_applied": BIAS_ADJUSTMENT if BIAS_ADJUSTMENT else [0.0, 0.0, 0.0],
            "ratio": ratio_str
        }

    except Exception as e:
        print(f"‚ö†Ô∏è L·ªói t√≠nh metrics: {e}")
        return

    # 5. Save the results
    model_name_clean = os.path.basename(MODEL_PATH).replace(":", "_").replace("/", "_")
    output_filename = f"result_{data_filename.replace('.csv', '')}_on_{model_name_clean}.json"
    output_path = os.path.join(RESULT_FOLDER, output_filename)

    os.makedirs(RESULT_FOLDER, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(final_output, f, cls=NpEncoder, indent=4)

    print(f"‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ chu·∫©n format t·∫°i: {output_path}")
    print(f"   Accuracy: {final_output.get('accuracy', 0):.4f}")

# ==============================================================================
# CH·∫†Y CH∆Ø∆†NG TR√åNH
# ==============================================================================
if __name__ == "__main__":
    # CSV files (inside DATA_FOLDER) to evaluate, in order
    files_to_test = [
        "test_1_1_1.csv",
        "test_2_1_1.csv",
        "test_4_1_1.csv",
        "test_10_1_1.csv",
        "test_data.csv",  # the most recently added file
    ]

    print("üé¨ B·∫ÆT ƒê·∫¶U CH·∫†Y INFERENCE H√ÄNG LO·∫†T...")
    # Each file is processed independently by run_inference.
    for data_file in files_to_test:
        run_inference(data_file)
    print("\nüèÅ HO√ÄN T·∫§T!")

üöÄ Hardware: NVIDIA GeForce RTX 5080 | CUDA: 12.8
üé¨ B·∫ÆT ƒê·∫¶U CH·∫†Y INFERENCE H√ÄNG LO·∫†T...

‚è≥ Loading Model: multimolecule/splicebert-human.510


Loading weights:   0%|          | 0/101 [00:00<?, ?it/s]

[1mSpliceBertForSequencePrediction LOAD REPORT[0m from: multimolecule/splicebert-human.510
Key                                 | Status     | 
------------------------------------+------------+-
lm_head.transform.layer_norm.bias   | UNEXPECTED | 
lm_head.transform.layer_norm.weight | UNEXPECTED | 
lm_head.decoder.weight              | UNEXPECTED | 
lm_head.bias                        | UNEXPECTED | 
lm_head.transform.dense.bias        | UNEXPECTED | 
lm_head.transform.dense.weight      | UNEXPECTED | 
model.pooler.dense.weight           | MISSING    | 
model.pooler.dense.bias             | MISSING    | 
sequence_head.decoder.weight        | MISSING    | 
sequence_head.decoder.bias          | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


üöÄ ƒêang ch·∫°y d·ª± ƒëo√°n tr√™n 26310 m·∫´u...


  0%|          | 0/103 [00:00<?, ?it/s]

‚è±Ô∏è Th·ªùi gian: 89.62s (294 seq/s)
üìä ƒêang t√≠nh metrics...
‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ chu·∫©n format t·∫°i: D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceBERT\result\result_test_1_1_1_on_splicebert-human.510.json
   Accuracy: 0.3423

‚è≥ Loading Model: multimolecule/splicebert-human.510


Loading weights:   0%|          | 0/101 [00:00<?, ?it/s]

[1mSpliceBertForSequencePrediction LOAD REPORT[0m from: multimolecule/splicebert-human.510
Key                                 | Status     | 
------------------------------------+------------+-
lm_head.transform.layer_norm.bias   | UNEXPECTED | 
lm_head.transform.layer_norm.weight | UNEXPECTED | 
lm_head.decoder.weight              | UNEXPECTED | 
lm_head.bias                        | UNEXPECTED | 
lm_head.transform.dense.bias        | UNEXPECTED | 
lm_head.transform.dense.weight      | UNEXPECTED | 
model.pooler.dense.weight           | MISSING    | 
model.pooler.dense.bias             | MISSING    | 
sequence_head.decoder.weight        | MISSING    | 
sequence_head.decoder.bias          | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


üöÄ ƒêang ch·∫°y d·ª± ƒëo√°n tr√™n 35132 m·∫´u...


  0%|          | 0/138 [00:00<?, ?it/s]

‚è±Ô∏è Th·ªùi gian: 113.20s (310 seq/s)
üìä ƒêang t√≠nh metrics...
‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ chu·∫©n format t·∫°i: D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceBERT\result\result_test_2_1_1_on_splicebert-human.510.json
   Accuracy: 0.3165

‚è≥ Loading Model: multimolecule/splicebert-human.510


Loading weights:   0%|          | 0/101 [00:00<?, ?it/s]

[1mSpliceBertForSequencePrediction LOAD REPORT[0m from: multimolecule/splicebert-human.510
Key                                 | Status     | 
------------------------------------+------------+-
lm_head.transform.layer_norm.bias   | UNEXPECTED | 
lm_head.transform.layer_norm.weight | UNEXPECTED | 
lm_head.decoder.weight              | UNEXPECTED | 
lm_head.bias                        | UNEXPECTED | 
lm_head.transform.dense.bias        | UNEXPECTED | 
lm_head.transform.dense.weight      | UNEXPECTED | 
model.pooler.dense.weight           | MISSING    | 
model.pooler.dense.bias             | MISSING    | 
sequence_head.decoder.weight        | MISSING    | 
sequence_head.decoder.bias          | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


üöÄ ƒêang ch·∫°y d·ª± ƒëo√°n tr√™n 52776 m·∫´u...


  0%|          | 0/207 [00:00<?, ?it/s]

‚è±Ô∏è Th·ªùi gian: 165.15s (320 seq/s)
üìä ƒêang t√≠nh metrics...
‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ chu·∫©n format t·∫°i: D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceBERT\result\result_test_4_1_1_on_splicebert-human.510.json
   Accuracy: 0.2049

‚è≥ Loading Model: multimolecule/splicebert-human.510


Loading weights:   0%|          | 0/101 [00:00<?, ?it/s]

[1mSpliceBertForSequencePrediction LOAD REPORT[0m from: multimolecule/splicebert-human.510
Key                                 | Status     | 
------------------------------------+------------+-
lm_head.transform.layer_norm.bias   | UNEXPECTED | 
lm_head.transform.layer_norm.weight | UNEXPECTED | 
lm_head.decoder.weight              | UNEXPECTED | 
lm_head.bias                        | UNEXPECTED | 
lm_head.transform.dense.bias        | UNEXPECTED | 
lm_head.transform.dense.weight      | UNEXPECTED | 
model.pooler.dense.weight           | MISSING    | 
model.pooler.dense.bias             | MISSING    | 
sequence_head.decoder.weight        | MISSING    | 
sequence_head.decoder.bias          | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


üöÄ ƒêang ch·∫°y d·ª± ƒëo√°n tr√™n 105708 m·∫´u...


  0%|          | 0/413 [00:00<?, ?it/s]

‚è±Ô∏è Th·ªùi gian: 339.60s (311 seq/s)
üìä ƒêang t√≠nh metrics...
‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ chu·∫©n format t·∫°i: D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceBERT\result\result_test_10_1_1_on_splicebert-human.510.json
   Accuracy: 0.1552

‚è≥ Loading Model: multimolecule/splicebert-human.510


Loading weights:   0%|          | 0/101 [00:00<?, ?it/s]

[1mSpliceBertForSequencePrediction LOAD REPORT[0m from: multimolecule/splicebert-human.510
Key                                 | Status     | 
------------------------------------+------------+-
lm_head.transform.layer_norm.bias   | UNEXPECTED | 
lm_head.transform.layer_norm.weight | UNEXPECTED | 
lm_head.decoder.weight              | UNEXPECTED | 
lm_head.bias                        | UNEXPECTED | 
lm_head.transform.dense.bias        | UNEXPECTED | 
lm_head.transform.dense.weight      | UNEXPECTED | 
model.pooler.dense.weight           | MISSING    | 
model.pooler.dense.bias             | MISSING    | 
sequence_head.decoder.weight        | MISSING    | 
sequence_head.decoder.bias          | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


üöÄ ƒêang ch·∫°y d·ª± ƒëo√°n tr√™n 938297 m·∫´u...


  0%|          | 0/3666 [00:00<?, ?it/s]

‚è±Ô∏è Th·ªùi gian: 3047.05s (308 seq/s)
üìä ƒêang t√≠nh metrics...
‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ chu·∫©n format t·∫°i: D:\Bio_sequence_Research_AITALAB\benchmark\task1_splicing_prediction\SpliceBERT\result\result_test_data_on_splicebert-human.510.json
   Accuracy: 0.0120

üèÅ HO√ÄN T·∫§T!
