In [None]:
# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-1_8B", trust_remote_code=True, torch_dtype="auto")

In [None]:
import sys
print(sys.executable)
print(sys.path)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import torch

MODEL_PATH = ""

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",        
    torch_dtype=torch.float32,      
    trust_remote_code=True,
)

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
from datasets import load_from_disk


MODEL_PATH = ""

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)


if tokenizer.pad_token is None:
    if tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token
    elif tokenizer.bos_token is not None:
        tokenizer.pad_token = tokenizer.bos_token
    else:
        tokenizer.pad_token = tokenizer.unk_token or "<|endoftext|>"

pad_id_list = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])
tokenizer.pad_token_id = pad_id_list[0] if isinstance(pad_id_list, list) else pad_id_list

print("PAD token fixed:", tokenizer.pad_token, tokenizer.pad_token_id)


PAD token fixed: <|endoftext|> 151643


In [None]:
from peft import LoraConfig, get_peft_model, TaskType


lora_config = LoraConfig(
    r=8,                   
    lora_alpha=16,         
    lora_dropout=0.1,     
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        "c_attn",          
        "c_proj",         
        "w1", "w2", "w3"  
    ]
)


model = get_peft_model(model, lora_config)


model.print_trainable_parameters()


def safe_forward(self, *args, **kwargs):
    with torch.cuda.amp.autocast(enabled=False):  
        return self._old_forward(*args, **kwargs)

for name, module in model.named_modules():
    if "attn" in name.lower() and hasattr(module, "forward"):
        module._old_forward = module.forward
        module.forward = safe_forward.__get__(module, module.__class__)

In [None]:
from datasets import load_dataset

dataset = load_dataset("json", data_files=r"tiki_training_dataset.json")

def format_example(example):
    messages = example["messages"]
    user_message = next((m["content"] for m in messages if m["role"] == "user"), "")
    assistant_message = next((m["content"] for m in messages if m["role"] == "assistant"), "")
    text = f"### Khách hỏi:\n{user_message}\n\n### Trả lời:\n{assistant_message}"
    tokenized = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=512
    )
    
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


tokenized_dataset = dataset["train"].map(format_example)

split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]


train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

print("Train size:", len(train_dataset))
print("Eval size:", len(eval_dataset))
print("Columns:", train_dataset.column_names)


print("Dataset tokenized:", tokenized_dataset)


Train size: 8152
Eval size: 906
Columns: ['messages', 'input_ids', 'token_type_ids', 'attention_mask', 'labels']
Dataset tokenized: Dataset({
    features: ['messages', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 9058
})


In [5]:
from transformers import TrainerCallback
from tqdm.auto import tqdm
import math
import time

class ProgressCallbackCustom(TrainerCallback):
    def __init__(self):
        self.start_time = time.time()

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is None:
            return

        step = state.global_step or 0
        total_steps = state.max_steps or 1
        epoch = state.epoch or 0

       
        loss = logs.get("loss", 0) or 0
        lr = logs.get("learning_rate", 0) or 0

        
        elapsed = time.time() - self.start_time
        speed = step / elapsed if elapsed > 0 else 0

        
        eta = (total_steps - step) / speed if speed > 0 else float("inf")

        
        tqdm.write(
            f"\rStep {step}/{total_steps} | loss={loss:.4f} | lr={lr:.2e} | epoch={epoch:.2f} | "
            f"{speed:.2f} it/s | ETA={eta/60:.1f} min",
            end=""
        )


In [6]:
training_args = TrainingArguments(
    output_dir=r"C:\train_ouput",
    per_device_train_batch_size=1,
    overwrite_output_dir=True,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    bf16=False,
    fp16=False,
    logging_steps=800,
    report_to="none",
    
    
    resume_from_checkpoint=False,
    evaluation_strategy="steps",
    eval_steps=800,
    save_strategy="no",      
    save_safetensors=True,   
)


In [6]:
with torch.no_grad():
    model(input_ids=torch.randint(0, tokenizer.vocab_size, (1, 10)).cuda())


  attn_output = F.scaled_dot_product_attention(


In [11]:
import shutil, os

output_dir = r"C:\train_ouput"
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)  
print("Đã xóa thư mục output cũ, sẵn sàng train mới!")


Đã xóa thư mục output cũ, sẵn sàng train mới!


In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=eval_dataset,
    callbacks=[ProgressCallbackCustom]  
)

trainer.train()


 12%|█▏        | 800/6792 [12:57<1:37:03,  1.03it/s]

Step 800/6792 | loss=0.0788 | lr=1.76e-04 | epoch=0.35 | 1.03 it/s | ETA=97.1 min{'loss': 0.0788, 'learning_rate': 0.00017644287396937575, 'epoch': 0.35}



 12%|█▏        | 800/6792 [16:29<1:37:03,  1.03it/s]
 12%|█▏        | 800/6792 [16:29<1:37:03,  1.03it/s]

Step 800/6792 | loss=0.0000 | lr=0.00e+00 | epoch=0.35 | 0.81 it/s | ETA=123.6 min{'eval_loss': 0.06640625, 'eval_runtime': 212.1745, 'eval_samples_per_second': 4.27, 'eval_steps_per_second': 0.537, 'epoch': 0.35}


 24%|██▎       | 1600/6792 [29:27<1:24:14,  1.03it/s] 

Step 1600/6792 | loss=0.0674 | lr=1.53e-04 | epoch=0.71 | 0.91 it/s | ETA=95.6 min{'loss': 0.0674, 'learning_rate': 0.00015288574793875147, 'epoch': 0.71}



 24%|██▎       | 1600/6792 [32:57<1:24:14,  1.03it/s]
 24%|██▎       | 1600/6792 [32:57<1:24:14,  1.03it/s]

Step 1600/6792 | loss=0.0000 | lr=0.00e+00 | epoch=0.71 | 0.81 it/s | ETA=107.0 min{'eval_loss': 0.05069555714726448, 'eval_runtime': 210.7599, 'eval_samples_per_second': 4.299, 'eval_steps_per_second': 0.541, 'epoch': 0.71}


 35%|███▌      | 2400/6792 [45:55<1:11:07,  1.03it/s] 

Step 2400/6792 | loss=0.0525 | lr=1.29e-04 | epoch=1.06 | 0.87 it/s | ETA=84.1 min{'loss': 0.0525, 'learning_rate': 0.00012932862190812722, 'epoch': 1.06}



 35%|███▌      | 2400/6792 [49:26<1:11:07,  1.03it/s]
 35%|███▌      | 2400/6792 [49:26<1:11:07,  1.03it/s]

Step 2400/6792 | loss=0.0000 | lr=0.00e+00 | epoch=1.06 | 0.81 it/s | ETA=90.5 min{'eval_loss': 0.0417838878929615, 'eval_runtime': 210.8874, 'eval_samples_per_second': 4.296, 'eval_steps_per_second': 0.541, 'epoch': 1.06}


 47%|████▋     | 3200/6792 [1:02:24<58:08,  1.03it/s]  

Step 3200/6792 | loss=0.0413 | lr=1.06e-04 | epoch=1.41 | 0.85 it/s | ETA=70.1 min{'loss': 0.0413, 'learning_rate': 0.00010577149587750295, 'epoch': 1.41}



 47%|████▋     | 3200/6792 [1:05:55<58:08,  1.03it/s]
 47%|████▋     | 3200/6792 [1:05:55<58:08,  1.03it/s]

Step 3200/6792 | loss=0.0000 | lr=0.00e+00 | epoch=1.41 | 0.81 it/s | ETA=74.0 min{'eval_loss': 0.03741064295172691, 'eval_runtime': 210.7703, 'eval_samples_per_second': 4.299, 'eval_steps_per_second': 0.541, 'epoch': 1.41}


 59%|█████▉    | 4000/6792 [1:18:53<45:16,  1.03it/s]   

Step 4000/6792 | loss=0.0390 | lr=8.22e-05 | epoch=1.77 | 0.84 it/s | ETA=55.1 min{'loss': 0.039, 'learning_rate': 8.221436984687868e-05, 'epoch': 1.77}



 59%|█████▉    | 4000/6792 [1:22:23<45:16,  1.03it/s]
 59%|█████▉    | 4000/6792 [1:22:23<45:16,  1.03it/s]

Step 4000/6792 | loss=0.0000 | lr=0.00e+00 | epoch=1.77 | 0.81 it/s | ETA=57.5 min{'eval_loss': 0.03368871286511421, 'eval_runtime': 210.8388, 'eval_samples_per_second': 4.297, 'eval_steps_per_second': 0.541, 'epoch': 1.77}


 71%|███████   | 4800/6792 [1:35:21<32:21,  1.03it/s]   

Step 4800/6792 | loss=0.0378 | lr=5.87e-05 | epoch=2.12 | 0.84 it/s | ETA=39.6 min{'loss': 0.0378, 'learning_rate': 5.8657243816254415e-05, 'epoch': 2.12}



 71%|███████   | 4800/6792 [1:38:52<32:21,  1.03it/s]
 71%|███████   | 4800/6792 [1:38:52<32:21,  1.03it/s]

Step 4800/6792 | loss=0.0000 | lr=0.00e+00 | epoch=2.12 | 0.81 it/s | ETA=41.0 min{'eval_loss': 0.031917210668325424, 'eval_runtime': 210.5355, 'eval_samples_per_second': 4.303, 'eval_steps_per_second': 0.541, 'epoch': 2.12}


 82%|████████▏ | 5600/6792 [1:51:49<19:17,  1.03it/s]   

Step 5600/6792 | loss=0.0335 | lr=3.51e-05 | epoch=2.47 | 0.83 it/s | ETA=23.8 min{'loss': 0.0335, 'learning_rate': 3.510011778563015e-05, 'epoch': 2.47}



 82%|████████▏ | 5600/6792 [1:55:20<19:17,  1.03it/s]
 82%|████████▏ | 5600/6792 [1:55:20<19:17,  1.03it/s]

Step 5600/6792 | loss=0.0000 | lr=0.00e+00 | epoch=2.47 | 0.81 it/s | ETA=24.6 min{'eval_loss': 0.03123464062809944, 'eval_runtime': 210.5136, 'eval_samples_per_second': 4.304, 'eval_steps_per_second': 0.542, 'epoch': 2.47}


 94%|█████████▍| 6400/6792 [2:08:17<06:20,  1.03it/s]   

Step 6400/6792 | loss=0.0328 | lr=1.15e-05 | epoch=2.83 | 0.83 it/s | ETA=7.9 min{'loss': 0.0328, 'learning_rate': 1.154299175500589e-05, 'epoch': 2.83}



 94%|█████████▍| 6400/6792 [2:11:47<06:20,  1.03it/s]
 94%|█████████▍| 6400/6792 [2:11:47<06:20,  1.03it/s]

Step 6400/6792 | loss=0.0000 | lr=0.00e+00 | epoch=2.83 | 0.81 it/s | ETA=8.1 min{'eval_loss': 0.030994271859526634, 'eval_runtime': 210.4773, 'eval_samples_per_second': 4.305, 'eval_steps_per_second': 0.542, 'epoch': 2.83}


100%|██████████| 6792/6792 [2:18:08<00:00,  1.22s/it]  

Step 6792/6792 | loss=0.0000 | lr=0.00e+00 | epoch=3.00 | 0.82 it/s | ETA=0.0 min{'train_runtime': 8288.7461, 'train_samples_per_second': 3.278, 'train_steps_per_second': 0.819, 'train_loss': 0.0469944979754157, 'epoch': 3.0}





TrainOutput(global_step=6792, training_loss=0.0469944979754157, metrics={'train_runtime': 8288.7461, 'train_samples_per_second': 3.278, 'train_steps_per_second': 0.819, 'train_loss': 0.0469944979754157, 'epoch': 3.0})

In [None]:
model.save_pretrained("./qwen-lora-finetuned")
tokenizer.save_pretrained("./qwen-lora-finetuned")

In [14]:
metrics = trainer.evaluate()
print(metrics)


100%|██████████| 114/114 [03:30<00:00,  1.84s/it]

Step 6792/6792 | loss=0.0000 | lr=0.00e+00 | epoch=3.00 | 0.59 it/s | ETA=0.0 min{'eval_loss': 0.03098025918006897, 'eval_runtime': 210.9383, 'eval_samples_per_second': 4.295, 'eval_steps_per_second': 0.54, 'epoch': 3.0}



