In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import numpy as np
# Import the tokenizer and the model
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-500m-multi-species", trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-500m-multi-species", trust_remote_code=True).to(device)

# Choose the length to which the input sequences are padded. By default, the 
# model max length is chosen, but feel free to decrease it as the time taken to 
# obtain the embeddings increases significantly with it.
max_length = tokenizer.model_max_length

# Create a dummy dna sequence and tokenize it
sequences = ["ATTCCGATTCCGATTCCG", "ATTTCTCTCTCTCTCTGAGATCGATCGATCGAT"]
tokens_ids = tokenizer.batch_encode_plus(sequences, return_tensors="pt", padding="max_length", max_length = max_length)["input_ids"].to(device)

# Compute the embeddings
attention_mask = (tokens_ids != tokenizer.pad_token_id).to(device)
torch_outs = model(
    tokens_ids,
    attention_mask=attention_mask,
    encoder_attention_mask=attention_mask,
    output_hidden_states=True
)

# Compute sequences embeddings
embeddings = torch_outs['hidden_states'][-1].detach().cpu().numpy()
print(f"Embeddings shape: {embeddings.shape}")
print(f"Embeddings per token: {embeddings}")

# Add embed dimension axis
attention_mask = torch.unsqueeze(attention_mask, dim=-1).cpu().numpy()

# Compute mean embeddings per sequence
mean_sequence_embeddings = np.sum(attention_mask*embeddings, axis=-2)/np.sum(attention_mask, axis=1)
print(f"Mean sequence embeddings: {mean_sequence_embeddings}")



  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Embeddings shape: (2, 2048, 1024)
Embeddings per token: [[[ 0.5063792   0.14925258  0.5267063  ... -0.601356    0.7671192
   -0.24135408]
  [ 0.3073924   0.01497287  0.31069636 ... -0.86925024  0.70535207
   -0.04934708]
  [ 0.27530265  0.23837054  0.16277047 ... -0.95569     0.6267978
   -0.07544966]
  ...
  [ 0.19590834 -0.07999256  0.1046586  ...  0.00263992  0.46054572
   -0.035153  ]
  [ 0.337595   -0.2000117   0.04658737 ...  0.02611889  0.44137904
   -0.21832097]
  [ 0.07733975 -0.34249914  0.07812422 ...  0.20739084  0.28529444
   -0.15824527]]

 [[-0.0262093  -0.5077783   0.35803428 ... -0.23095986  0.786347
   -1.0120146 ]
  [ 0.49354613 -0.59404874  0.2465686  ... -0.33584282  0.06687836
    0.25064915]
  [ 0.01741505  0.09177446  0.43233347 ... -0.2013575   0.3341403
    0.0131175 ]
  ...
  [ 0.20584804 -0.7662893   0.5937591  ... -0.5139867   0.6144493
   -1.1537112 ]
  [-0.07695537 -0.36022472 -0.3743491  ... -0.6763517   0.42619184
   -1.0293761 ]
  [-0.68401927 -0.58734

In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import numpy as np

# Import the tokenizer and the model
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-2.5b-multi-species")
model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-2.5b-multi-species").to(device)

# Choose the length to which the input sequences are padded. By default, the 
# model max length is chosen, but feel free to decrease it as the time taken to 
# obtain the embeddings increases significantly with it.
max_length = tokenizer.model_max_length

# Create a dummy dna sequence and tokenize it
sequences = ["ATTCCGATTCCGATTCCG", "ATTTCTCTCTCTCTCTGAGATCGATCGATCGAT"]
tokens_ids = tokenizer.batch_encode_plus(sequences, return_tensors="pt", padding="max_length", max_length = max_length)["input_ids"].to(device)

# Compute the embeddings
attention_mask = (tokens_ids != tokenizer.pad_token_id).to(device)
torch_outs = model(
    tokens_ids,
    attention_mask=attention_mask,
    encoder_attention_mask=attention_mask,
    output_hidden_states=True
)

# Compute sequences embeddings
embeddings = torch_outs['hidden_states'][-1].detach().cpu().numpy()
print(f"Embeddings shape: {embeddings.shape}")
print(f"Embeddings per token: {embeddings}")

# Add embed dimension axis
attention_mask = torch.unsqueeze(attention_mask, dim=-1).cpu().numpy()

# Compute mean embeddings per sequence
mean_sequence_embeddings = np.sum(attention_mask*embeddings, axis=-2)/np.sum(attention_mask, axis=1)
print(f"Mean sequence embeddings: {mean_sequence_embeddings}")


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:03<00:00,  1.74s/it]


Embeddings shape: (2, 1000, 2560)
Embeddings per token: [[[-0.00180363  0.07674704  0.01244043 ...  0.00112825  0.03381748
    0.17844304]
  [-0.09584967  0.2575793  -0.0365719  ... -0.44206974  0.06707363
    0.22186989]
  [-0.25077254  0.07775501 -0.05562799 ... -0.06127694  0.16998267
    0.2654602 ]
  ...
  [-0.06766268  0.0765079   0.00185914 ...  0.02911628  0.15405582
    0.14926068]
  [-0.06766268  0.0765079   0.00185914 ...  0.02911628  0.15405582
    0.14926068]
  [-0.06766268  0.0765079   0.00185914 ...  0.02911628  0.15405582
    0.14926068]]

 [[ 0.01480382  0.09288551  0.02611541 ... -0.0242852   0.02126851
    0.20392066]
  [-0.24921286  0.5893351   0.440742   ... -0.39238933  0.5610121
    0.41510585]
  [-0.36850718  0.32135677  0.3190047  ... -0.4604549   0.36477548
    0.5401475 ]
  ...
  [ 0.01522855  0.08277307 -0.08055211 ... -0.10329894  0.16529028
    0.21183467]
  [ 0.01522855  0.08277307 -0.08055211 ... -0.10329894  0.16529028
    0.21183467]
  [ 0.01522855  0.

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# ==== Load model and tokenizer ====
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(model_name, trust_remote_code=True).to(device).eval()

# ==== Input DNA sequences ====
ref_seq = "ATGCGTACGATCGTACGATCGTACG"
mut_seq = "ATGCGTACGATTGTACGATCGTACG"   # M·ªôt bi·∫øn d·ªã nh·ªè so v·ªõi ref_seq

# ==== Tokenize & encode ====
inputs = tokenizer([ref_seq, mut_seq], return_tensors="pt", padding=True).to(device)

# ==== Forward pass ====
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
    embeds = outputs.hidden_states[-1].mean(dim=1)  # Mean pooling to√†n chu·ªói

# ==== Compute variant effect ====
# Vector hi·ªáu gi·ªØa embedding c·ªßa chu·ªói ƒë·ªôt bi·∫øn v√† tham chi·∫øu
delta = embeds[1] - embeds[0]
score = torch.norm(delta, p=2).item()

print(f"Variant effect score: {score:.4f}")




Variant effect score: 2.7199


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from datasets import load_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import matthews_corrcoef, f1_score, accuracy_score
from tqdm import tqdm
import numpy as np
import pandas as pd

# ==============================
# 1Ô∏è‚É£ Setup model & tokenizer
# ==============================
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(model_name, trust_remote_code=True).to(device).eval()

# ==============================
# 2Ô∏è‚É£ Load benchmark dataset
# ==============================
dataset = load_dataset("InstaDeepAI/nucleotide_transformer_downstream_tasks_revised")
task_names = list(set(dataset["train"]["task"]))  # L·∫•y danh s√°ch 18 task

print(f"‚úÖ Found {len(task_names)} downstream tasks:")
print(task_names)

# ==============================
# 3Ô∏è‚É£ H√†m tr√≠ch xu·∫•t embedding
# ==============================
@torch.no_grad()
def get_embeddings(seqs, batch_size=4):
    """Tr√≠ch embedding mean-pooling cho list chu·ªói DNA"""
    all_embs = []
    for i in tqdm(range(0, len(seqs), batch_size), desc="Embedding batch"):
        batch = seqs[i:i+batch_size]
        tokens = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)
        outs = model(**tokens, output_hidden_states=True)
        emb = outs.hidden_states[-1].mean(dim=1).cpu()  # mean pooling to√†n chu·ªói
        all_embs.append(emb)
    return torch.cat(all_embs, dim=0)

# ==============================
# 4Ô∏è‚É£ Pipeline benchmark
# ==============================
results = {}

for task in task_names:
    print(f"\nüöÄ Benchmarking task: {task}")

    # --- L·ªçc d·ªØ li·ªáu theo task ---
    data = dataset["train"].filter(lambda x: x["task"] == task)
    seqs = data["sequence"]
    labels = np.array(data["label"])

    # --- Tr√≠ch xu·∫•t embedding ---
    embs = get_embeddings(seqs, batch_size=8).numpy()

    # --- Cross-validation 10-fold ---
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = {"MCC": [], "F1": [], "ACC": []}

    for train_idx, test_idx in kf.split(embs):
        X_train, X_test = embs[train_idx], embs[test_idx]
        y_train, y_test = labels[train_idx], labels[test_idx]

        clf = LogisticRegression(max_iter=1000, solver="lbfgs")
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)

        scores["MCC"].append(matthews_corrcoef(y_test, preds))
        scores["F1"].append(f1_score(y_test, preds, average="macro"))
        scores["ACC"].append(accuracy_score(y_test, preds))

    # --- Trung b√¨nh ƒëi·ªÉm ---
    results[task] = {m: np.mean(v) for m, v in scores.items()}
    print(f"‚úÖ {task}: MCC={results[task]['MCC']:.3f}, F1={results[task]['F1']:.3f}, ACC={results[task]['ACC']:.3f}")

# ==============================
# 5Ô∏è‚É£ Xu·∫•t k·∫øt qu·∫£ t·ªïng h·ª£p
# ==============================
df = pd.DataFrame(results).T
#df.to_csv("nt_benchmark_results.csv")
print("\nüéØ Benchmark ho√†n t·∫•t! K·∫øt qu·∫£ l∆∞u t·∫°i: nt_benchmark_results.csv")
print(df)


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


‚úÖ Found 18 downstream tasks:
['H3K4me3', 'enhancers_types', 'H3K4me1', 'splice_sites_acceptors', 'promoter_tata', 'H2AFZ', 'H3K27ac', 'H3K9ac', 'promoter_no_tata', 'H3K36me3', 'enhancers', 'splice_sites_all', 'promoter_all', 'H3K27me3', 'H3K4me2', 'splice_sites_donors', 'H3K9me3', 'H4K20me1']

üöÄ Benchmarking task: H3K4me3


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 319384.53 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2184/2184 [02:20<00:00, 15.55it/s]


‚úÖ H3K4me3: MCC=0.644, F1=0.821, ACC=0.821

üöÄ Benchmarking task: enhancers_types


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 341643.00 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3750/3750 [01:52<00:00, 33.45it/s]


‚úÖ enhancers_types: MCC=0.416, F1=0.500, ACC=0.695

üöÄ Benchmarking task: H3K4me1


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 357895.66 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3750/3750 [04:00<00:00, 15.59it/s]


‚úÖ H3K4me1: MCC=0.407, F1=0.703, ACC=0.703

üöÄ Benchmarking task: splice_sites_acceptors


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 328779.45 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3750/3750 [02:19<00:00, 26.88it/s]


‚úÖ splice_sites_acceptors: MCC=0.464, F1=0.732, ACC=0.732

üöÄ Benchmarking task: promoter_tata


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 338114.98 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 633/633 [00:16<00:00, 37.44it/s]


‚úÖ promoter_tata: MCC=0.633, F1=0.816, ACC=0.816

üöÄ Benchmarking task: H2AFZ


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 350564.43 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3750/3750 [03:59<00:00, 15.67it/s]


‚úÖ H2AFZ: MCC=0.406, F1=0.703, ACC=0.703

üöÄ Benchmarking task: H3K27ac


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 335879.52 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3750/3750 [03:59<00:00, 15.66it/s]


‚úÖ H3K27ac: MCC=0.432, F1=0.716, ACC=0.716

üöÄ Benchmarking task: H3K9ac


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 334937.84 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2910/2910 [03:05<00:00, 15.67it/s]


‚úÖ H3K9ac: MCC=0.492, F1=0.746, ACC=0.746

üöÄ Benchmarking task: promoter_no_tata


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 333924.72 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3750/3750 [01:39<00:00, 37.69it/s]


‚úÖ promoter_no_tata: MCC=0.738, F1=0.867, ACC=0.868

üöÄ Benchmarking task: H3K36me3


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 336985.06 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3750/3750 [03:57<00:00, 15.78it/s]


‚úÖ H3K36me3: MCC=0.513, F1=0.756, ACC=0.756

üöÄ Benchmarking task: enhancers


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 347638.11 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3750/3750 [01:49<00:00, 34.22it/s]


‚úÖ enhancers: MCC=0.451, F1=0.725, ACC=0.725

üöÄ Benchmarking task: splice_sites_all


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 347855.20 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3750/3750 [02:17<00:00, 27.25it/s]


‚úÖ splice_sites_all: MCC=0.366, F1=0.576, ACC=0.577

üöÄ Benchmarking task: promoter_all


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 342281.44 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3750/3750 [01:39<00:00, 37.79it/s]


‚úÖ promoter_all: MCC=0.717, F1=0.857, ACC=0.857

üöÄ Benchmarking task: H3K27me3


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 344383.53 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3750/3750 [03:58<00:00, 15.71it/s]


‚úÖ H3K27me3: MCC=0.529, F1=0.764, ACC=0.764

üöÄ Benchmarking task: H3K4me2


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 349341.84 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3750/3750 [03:58<00:00, 15.72it/s]


‚úÖ H3K4me2: MCC=0.482, F1=0.741, ACC=0.741

üöÄ Benchmarking task: splice_sites_donors


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 345751.79 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3750/3750 [02:17<00:00, 27.30it/s]


‚úÖ splice_sites_donors: MCC=0.524, F1=0.762, ACC=0.762

üöÄ Benchmarking task: H3K9me3


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 346728.56 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3430/3430 [03:37<00:00, 15.73it/s]


‚úÖ H3K9me3: MCC=0.266, F1=0.632, ACC=0.633

üöÄ Benchmarking task: H4K20me1


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493242/493242 [00:01<00:00, 344962.99 examples/s]
Embedding batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3750/3750 [03:58<00:00, 15.71it/s]


‚úÖ H4K20me1: MCC=0.585, F1=0.792, ACC=0.792

üéØ Benchmark ho√†n t·∫•t! K·∫øt qu·∫£ l∆∞u t·∫°i: nt_benchmark_results.csv
                             MCC        F1       ACC
H3K4me3                 0.643520  0.820750  0.821102
enhancers_types         0.415666  0.500398  0.695400
H3K4me1                 0.406888  0.702627  0.702967
splice_sites_acceptors  0.464088  0.731991  0.732067
promoter_tata           0.632878  0.815886  0.816278
H2AFZ                   0.406415  0.703112  0.703167
H3K27ac                 0.432353  0.716110  0.716167
H3K9ac                  0.492102  0.745833  0.745940
promoter_no_tata        0.738042  0.867467  0.867733
H3K36me3                0.512624  0.756067  0.756133
enhancers               0.450917  0.724940  0.725167
splice_sites_all        0.366398  0.576175  0.577267
promoter_all            0.716736  0.856815  0.857067
H3K27me3                0.529424  0.764217  0.764333
H3K4me2                 0.481797  0.740797  0.740833
splice_sites_donors     0.524

In [None]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments
)
from datasets import load_dataset
import torch

# =========================
# 1Ô∏è‚É£ Setup model & tokenizer
# =========================
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # ƒëi·ªÅu ch·ªânh theo task (binary ho·∫∑c multi-class)
    trust_remote_code=True
).to(device)

# =========================
# 2Ô∏è‚É£ Load 1 downstream task
# =========================
dataset = load_dataset("InstaDeepAI/nucleotide_transformer_downstream_tasks_revised")
task_name = "promoter_all"  # v√≠ d·ª•
task_ds = dataset["train"].filter(lambda x: x["task"] == task_name)

# T√°ch train/test
train_size = int(0.8 * len(task_ds))
train_ds = task_ds.select(range(train_size))
test_ds = task_ds.select(range(train_size, len(task_ds)))

# =========================
# 3Ô∏è‚É£ Tokenization
# =========================
def tokenize_fn(batch):
    return tokenizer(batch["sequence"], padding="max_length", truncation=True)

train_ds = train_ds.map(tokenize_fn, batched=True)
test_ds = test_ds.map(tokenize_fn, batched=True)

train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# =========================
# 4Ô∏è‚É£ Define training arguments
# =========================
'''training_args = TrainingArguments(
    output_dir="./nt_finetune_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    fp16=True,  # b·∫≠t mixed precision n·∫øu c√≥ GPU h·ªó tr·ª£
    save_total_limit=2
)'''
training_args = TrainingArguments(
    output_dir="./nt_finetune_results",
    do_eval=True,
    eval_steps=500,
    save_steps=500,
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
)
# =========================
# 5Ô∏è‚É£ Define metrics
# =========================
from sklearn.metrics import matthews_corrcoef, f1_score, accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="macro"),
        "mcc": matthews_corrcoef(labels, preds),
    }

# =========================
# 6Ô∏è‚É£ Fine-tune model
# =========================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

trainer.train()

# =========================
# 7Ô∏è‚É£ Evaluate
# =========================
results = trainer.evaluate()
print(f"‚úÖ Fine-tune done for task {task_name}")
print(results)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at InstaDeepAI/nucleotide-transformer-v2-500m-multi-species were not used when initializing EsmForSequenceClassification: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing EsmForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-v2-500m-multi-species and

Step,Training Loss
