In [6]:
import os
import platform
import random
import json
from abc import ABC, abstractmethod
# True when the notebook runs on macOS (platform.system() reports "Darwin").
IS_MAC = platform.system() == "Darwin"

# `torch` is not imported at the top of this cell, so referencing it directly
# raised NameError on every non-macOS machine. Import it here, only on the
# branch that needs it, and treat a missing install as "no CUDA".
if IS_MAC:
    HAS_CUDA = False
else:
    try:
        import torch
    except ImportError:
        HAS_CUDA = False
    else:
        HAS_CUDA = torch.cuda.is_available()

class ModelLoader(ABC):
    """Abstract interface shared by the backend-specific model loaders.

    Subclasses must implement both `load_model` and `generate`; the class
    cannot be instantiated until they do.
    """

    @abstractmethod
    def load_model(self, model_name: str, adapter_path: str = None):
        """Load the model identified by `model_name`.

        Args:
            model_name: Identifier of the model to load.
            adapter_path: Optional location of adapter weights to apply.
        """
        ...

    @abstractmethod
    def generate(self, model, tokenizer, prompt: str, max_tokens: int = 500):
        """Generate text for `prompt` using `model` and `tokenizer`.

        Args:
            model: Model object produced by `load_model`.
            tokenizer: Tokenizer that accompanies `model`.
            prompt: Prompt to complete.
            max_tokens: Upper bound on the number of generated tokens.
        """
        ...

class MLXLoader(ModelLoader):
    """Loader that delegates to the `mlx_lm` package."""

    def load_model(self, model_name, adapter_path=None):
        """Return whatever `mlx_lm.load` produces for `model_name`.

        `adapter_path` is forwarded unchanged; callers in this notebook
        unpack the result as `(model, tokenizer)`.
        """
        # Imported lazily so that merely defining this class does not
        # require mlx_lm to be importable.
        import mlx_lm

        loaded = mlx_lm.load(model_name, adapter_path=adapter_path)
        return loaded

    def generate(self, model, tokenizer, prompt, max_tokens=500):
        """Return the result of `mlx_lm.generate` for `prompt`."""
        import mlx_lm

        completion = mlx_lm.generate(
            model,
            tokenizer,
            prompt=prompt,
            max_tokens=max_tokens,
        )
        return completion

class CUDALoader(ModelLoader):
    """Loader built on Hugging Face `transformers`, with optional `peft` adapters."""

    def load_model(self, model_name, adapter_path=None):
        """Load a causal LM and its tokenizer, optionally wrapping it in an adapter.

        Args:
            model_name: Identifier passed to `from_pretrained` for both the
                model and the tokenizer.
            adapter_path: When truthy, the model is wrapped with
                `PeftModel.from_pretrained(model, adapter_path)`.

        Returns:
            A `(model, tokenizer)` tuple.
        """
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        if adapter_path:
            from peft import PeftModel
            model = PeftModel.from_pretrained(model, adapter_path)

        return model, tokenizer

    def generate(self, model, tokenizer, prompt, max_tokens=500):
        """Sample a completion for `prompt` and return only the newly generated text.

        Args:
            model: Model returned by `load_model`.
            tokenizer: Tokenizer returned by `load_model`.
            prompt: Either a string, or a sequence of token ids such as the
                list `tokenizer.apply_chat_template(...)` returns when it
                tokenizes (which is how this notebook builds its prompts).
            max_tokens: Maximum number of new tokens to generate.

        Returns:
            The decoded completion, without the prompt and without special
            tokens, matching what `MLXLoader.generate` hands back.
        """
        # `torch` was previously only a local name inside load_model, so the
        # `torch.no_grad()` below raised NameError. Import it here as well.
        import torch

        if isinstance(prompt, str):
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        else:
            # Already-tokenized prompt: build a batch of one directly.
            input_ids = torch.tensor(
                [list(prompt)], dtype=torch.long, device=model.device
            )
            inputs = {
                "input_ids": input_ids,
                "attention_mask": torch.ones_like(input_ids),
            }

        prompt_length = inputs["input_ids"].shape[-1]
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )

        # A causal LM returns prompt + continuation; slice the prompt off so
        # the caller sees just the answer.
        completion_ids = outputs[0][prompt_length:]
        return tokenizer.decode(completion_ids, skip_special_tokens=True)

def get_model_loader():
    """Return a fresh loader: `MLXLoader` when `IS_MAC` is true, else `CUDALoader`."""
    loader_cls = MLXLoader if IS_MAC else CUDALoader
    return loader_cls()

In [7]:
def read_first_message(path='../data/poker-postflop/test.jsonl'):
    """Return the `messages` value of the first record in a JSONL file.

    Args:
        path: JSONL file to read. Defaults to the post-flop test set this
            notebook originally hard-coded.

    Returns:
        The value stored under the `'messages'` key of the first line.

    Raises:
        json.JSONDecodeError: If the first line is empty or not valid JSON.
        KeyError: If the first record has no `'messages'` key.
    """
    # Read as UTF-8 explicitly rather than relying on the platform's
    # default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        first_line = f.readline()
    return json.loads(first_line)['messages']


def read_random_message(path='../data/poker-preflop/test.jsonl'):
    """Return the `messages` value of a randomly chosen record in a JSONL file.

    Args:
        path: JSONL file to sample from. Defaults to the pre-flop test set
            this notebook originally hard-coded.
            NOTE(review): `read_first_message` defaults to the post-flop
            file instead — confirm the difference is intentional.

    Returns:
        The value stored under the `'messages'` key of the chosen line.

    Raises:
        IndexError: If the file contains no non-blank lines.
        json.JSONDecodeError: If the chosen line is not valid JSON.
        KeyError: If the chosen record has no `'messages'` key.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Skip blank lines (e.g. trailing newlines) so that random.choice
        # can never select a line that json.loads would reject.
        lines = [line for line in f if line.strip()]
    return json.loads(random.choice(lines))['messages']

# Sample one conversation from the test set; it becomes the prompt for the
# generation cells below.
message = read_random_message()
# print(message[-1])
# Remove the final message so it is not part of the prompt. Because this is
# the cell's last expression, the removed message is what the cell displays.
message.pop()

{'role': 'assistant', 'content': 'check'}

In [8]:
# Automatically select the appropriate loader for this machine
loader = get_model_loader()

# Load the model together with the adapter weights
model, tokenizer = loader.load_model(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    adapter_path="../adapters/total-llama-3.1-8B-Instruct"
)

# Generate text: build the prompt from `message` via the tokenizer's chat
# template, then run it through the loader and show the result
prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True)
response = loader.generate(model, tokenizer, prompt)
print(response)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 159094.29it/s]


check


In [25]:
# Sample a second conversation, print its reference answer, then drop that
# answer so it is not part of the prompt.
message2 = read_random_message()
print(message2[-1]['content'])
message2.pop()
prompt2 = tokenizer.apply_chat_template(message2, add_generation_prompt=True)

# Generate from prompt2. This cell previously passed the stale `prompt` from
# the earlier cell, so the printed response did not correspond to the
# reference answer printed above.
response = loader.generate(model, tokenizer, prompt2)
print(response)

call
check


## original model


In [38]:
# `load` and `generate` are not imported by any earlier cell, so bring them
# in here; otherwise this cell fails with NameError on a fresh kernel.
from mlx_lm import load, generate

# Load the base model only (no adapter_path) as the baseline to compare
# the adapter against.
model, tokenizer = load("meta-llama/Meta-Llama-3.1-8B-Instruct")

# Build the prompt from `message` using the tokenizer's chat template.
prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True)

# Generate text using the original model (no adapter is loaded in this cell).
response = generate(model, tokenizer, prompt=prompt,
                    max_tokens=500, verbose=True)

Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 132578.57it/s]


Call.
Prompt: 264 tokens, 457.508 tokens-per-sec
Generation: 3 tokens, 13.068 tokens-per-sec
Peak memory: 32.135 GB


# Fine-tuned version

In [39]:
# Load the base model and apply the adapter weights found under adapter_path.
# NOTE(review): `load` and `generate` are not imported in this cell — confirm
# they are imported (presumably from mlx_lm) before this cell runs.
# This rebinds the global `model` and `tokenizer` used by later cells.
model, tokenizer = load("meta-llama/Meta-Llama-3.1-8B-Instruct",
                        adapter_path="../adapters/total-llama-3.1-8B-Instruct")
prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True)

# Generate text using the model loaded above, i.e. with the adapter applied.
response = generate(model, tokenizer, prompt=prompt,
                    max_tokens=500, verbose=True)

Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 137313.52it/s]


fold
Prompt: 264 tokens, 444.232 tokens-per-sec
Generation: 2 tokens, 3.733 tokens-per-sec
Peak memory: 32.135 GB


In [70]:
# Sample another conversation, print its reference answer, then drop that
# answer so it is not part of the prompt.
message2 = read_random_message()
print(message2[-1]['content'])
message2.pop()
prompt2 = tokenizer.apply_chat_template(message2, add_generation_prompt=True)

# Generate from prompt2 with whichever `model`/`tokenizer` are currently bound.
# NOTE(review): `generate` is not imported in this cell — confirm it is
# imported (presumably from mlx_lm) before this cell runs.
response = generate(model, tokenizer, prompt=prompt2,
                    max_tokens=500, verbose=True)

fold
fold
Prompt: 269 tokens, 462.610 tokens-per-sec
Generation: 2 tokens, 34.122 tokens-per-sec
Peak memory: 32.135 GB
