<a href="https://colab.research.google.com/github/DELEnomore/LLM/blob/colab/LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install datasets



In [15]:
import os.path
from abc import abstractmethod

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer
import os
from google.colab import drive
from transformers import pipeline
import logging


In [4]:
os.environ["WANDB_DISABLED"] = "true"

In [5]:
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
GOOGLE_DRIVE_WORKSPACE_DIR = 'drive/MyDrive/colab_workspace/LLM'
_CACHE_DIR = GOOGLE_DRIVE_WORKSPACE_DIR + '/cache'
MODEL_CACHE_DIR = _CACHE_DIR + '/model'
DATASET_CACHE_DIR = _CACHE_DIR + '/dataset'

MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
MODEL_CHECKPOINT_DIR = GOOGLE_DRIVE_WORKSPACE_DIR + '/model_output/' + MODEL_NAME
MODEL_OUTPUT_DIR = MODEL_CHECKPOINT_DIR + '/best_model'

In [7]:
def format_chatml(batch_input, batch_output):
    batch_message = []
    for instruction, response in zip(batch_input, batch_output):
        messages = [
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": response}
        ]
        batch_message.append(messages)

    return batch_message

In [8]:
class DatasetInterface:
    PATH = ''

    NAME = None

    def __init__(self, tokenizer):
        self.data = load_dataset(self.PATH, self.NAME, cache_dir=DATASET_CACHE_DIR)
        self.tokenizer = tokenizer

    def get_data(self):
        return self.data

    @abstractmethod
    def _get_input_and_output(self, example):
        pass

    def tokenize_function_4_auto_ml(self, example):
        input, output = self._get_input_and_output(example)
        chatml = format_chatml(input, output)
        formated_input = tokenizer.apply_chat_template(
            chatml,
            tokenize=False,
        )
        model_input = tokenizer(formated_input, padding="max_length", truncation=True, max_length=128)
        model_input["labels"] = model_input["input_ids"].copy()
        return model_input

In [9]:
class TestDataset(DatasetInterface):

    PATH = 'FuYuwen117/test'

    def _get_input_and_output(self, example):
        input = example['input']
        target = example['output']
        return input, target


In [11]:

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME, cache_dir=MODEL_CACHE_DIR)
tokenizer.pad_token = tokenizer.eos_token

dataset_instance = TestDataset(tokenizer)
data = dataset_instance.get_data()

tokenized_dataset = data.map(dataset_instance.tokenize_function_4_auto_ml, num_proc=4, batched=True)
splited_dataset = tokenized_dataset['train'].train_test_split(test_size=0.2)
train_dataset = splited_dataset['train']
val_dataset = splited_dataset['test']
# Ê£ÄÊü•Âπ∂Ê∑ªÂä†Â°´ÂÖÖÊ†áËÆ∞

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # Ê∑∑ÂêàÁ≤æÂ∫¶
    device_map="auto",  # Ëá™Âä®ÂàÜÈÖçÂà∞ GPU
    cache_dir=MODEL_CACHE_DIR
)




Repo card metadata block was not found. Setting CardData to empty.


Map (num_proc=4):   0%|          | 0/5 [00:00<?, ? examples/s]

In [None]:
lora_config = LoraConfig(
    r=8,  # LoRA ÁöÑÁß©
    lora_alpha=32,  # LoRA ÁöÑÁº©ÊîæÂõ†Â≠ê
    lora_dropout=0.05,  # Dropout Ê¶ÇÁéá
    bias="none",  # LoRA bias ËÆæÁΩÆ
    task_type="CAUSAL_LM",  # ‰ªªÂä°Á±ªÂûãÔºöËá™ÂõûÂΩíÊñáÊú¨ÁîüÊàê
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    # Â¶ÇÊûúÈúÄË¶ÅÊ†πÊçÆÂÖ∑‰ΩìÊ®°ÂûãÁªìÊûÑÔºåË∞ÉÊï¥ target_modules
)

peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()  # Êü•ÁúãÂèØËÆ≠ÁªÉÂèÇÊï∞Èáè

training_args = TrainingArguments(
    output_dir=MODEL_CHECKPOINT_DIR,
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=10000,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    report_to="none",
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)


trainer.train()
trainer.save_model(MODEL_OUTPUT_DIR)

trainable params: 2,179,072 || all params: 1,779,267,072 || trainable%: 0.1225


  trainer = Trainer(


Step,Training Loss,Validation Loss


In [29]:
def format_input(input_text):
    return f'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n'

In [31]:
# ‰ΩøÁî®ÂéüÊúâÊ®°ÂûãËøõË°åÂØπËØù
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token
original_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # Ê∑∑ÂêàÁ≤æÂ∫¶
    device_map="auto",  # Ëá™Âä®ÂàÜÈÖçÂà∞ GPU
    cache_dir=MODEL_CACHE_DIR
)
original_model.eval()
pipe = pipeline(
    "text-generation",
    model=original_model,
    tokenizer=tokenizer,
)

# ‰∏éÁî®Êà∑ËøõË°åÂØπËØù
print("ÂºÄÂßãÂØπËØùÔºÅËæìÂÖ• 'exit' ÁªìÊùüÂØπËØù„ÄÇ")
while True:
    user_input = input("‰Ω†: ")
    if user_input.lower() == "exit":
        print("ÂØπËØùÁªìÊùü„ÄÇ")
        break
    formatted_input = format_input(user_input)
    print(formatted_input)
    print('-----')
    # ÁîüÊàêÊ®°ÂûãÂõûÂ§ç
    response = pipe(formatted_input)
    print('-----')
    print(f"Ê®°Âûã: {response[0]['generated_text']}")

Device set to use cuda:0


ÂºÄÂßãÂØπËØùÔºÅËæìÂÖ• 'exit' ÁªìÊùüÂØπËØù„ÄÇ
‰Ω†: ‰Ω†Â•Ω
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
‰Ω†Â•Ω<|im_end|>
<|im_start|>assistant

-----
-----
Ê®°Âûã: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
‰Ω†Â•Ω<|im_end|>
<|im_start|>assistant
ÊàëÂàöÂàöÁúã‰∫Ü‰Ω†Êèê‰æõÁöÑ‰ø°ÊÅØÔºåÁé∞Âú®ÊàëË¶ÅÂàÜÊûê‰∏Ä‰∏ãËøô‰∏™ÂØπËØùÔºåÁúãÁúãÊúâÊ≤°Êúâ‰ªÄ‰πàÈóÆÈ¢òÊàñËÄÖÈúÄË¶Å
‰Ω†: ‰Ω†Â•Ω
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
‰Ω†Â•Ω<|im_end|>
<|im_start|>assistant

-----
-----
Ê®°Âûã: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
‰Ω†Â•Ω<|im_end|>
<|im_start|>assistant
ÊÇ®Â•ΩÔºåÊúâ‰ªÄ‰πàÊàëÂèØ‰ª•Â∏ÆÂä©‰Ω†ÁöÑÂêóÔºü
<|im_end|>
<|im_start|>


KeyboardInterrupt: Interrupted by user

In [None]:
# ‰ΩøÁî®ÂæÆË∞ÉÂêéÁöÑÊ®°ÂûãËøõË°åÂØπËØù
tokenizer = AutoTokenizer.from_pretrained(MODEL_OUTPUT_DIR)
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token
finetuned_model = AutoModelForCausalLM.from_pretrained(MODEL_OUTPUT_DIR)
finetuned_model.eval()


# ‰∏éÁî®Êà∑ËøõË°åÂØπËØù
print("ÂºÄÂßãÂØπËØùÔºÅËæìÂÖ• 'exit' ÁªìÊùüÂØπËØù„ÄÇ")
while True:
    user_input = input("‰Ω†: ")
    if user_input.lower() == "exit":
        print("ÂØπËØùÁªìÊùü„ÄÇ")
        break
    formatted_input = format_input(user_input)
    logging.info('formatted_input:%s', formatted_input)
    # ÁîüÊàêÊ®°ÂûãÂõûÂ§ç
    response = model.generate(formatted_input, max_new_tokens=200)
    print(f"Ê®°Âûã: {response[0]['generated_text']}")