# test Gemma from Kaggle

In [None]:
# # Setup the environment
# !pip install -q -U immutabledict sentencepiece 
# !git clone https://github.com/google/gemma_pytorch.git
# !mkdir /kaggle/working/gemma/
# !mv /kaggle/working/gemma_pytorch/gemma/* /kaggle/working/gemma/

In [1]:
import sys 
sys.path.append("gemma_pytorch-main") 
from gemma.config import GemmaConfig, get_model_config
from gemma.model import GemmaForCausalLM
from gemma.tokenizer import Tokenizer
import contextlib
import os
import torch

In [2]:
# Load the model
VARIANT = "9b" 
MACHINE_TYPE = "cuda" 
weights_dir = '/root/autodl-tmp/gemma-2-pytorch-gemma-2-9b-pt-v1' 

@contextlib.contextmanager
def _set_default_tensor_type(dtype: torch.dtype):
  """Sets the default torch dtype to the given dtype."""
  torch.set_default_dtype(dtype)
  yield
  torch.set_default_dtype(torch.float)

model_config = get_model_config(VARIANT)
model_config.tokenizer = os.path.join(weights_dir, "tokenizer.model")

device = torch.device(MACHINE_TYPE)
with _set_default_tensor_type(model_config.get_dtype()):
  model = GemmaForCausalLM(model_config)
  model.load_weights(weights_dir)
  model = model.to(device).eval()

In [9]:
# Use the model

USER_CHAT_TEMPLATE = "<start_of_turn>user\n{prompt}<end_of_turn><eos>\n"
MODEL_CHAT_TEMPLATE = "<start_of_turn>model\n{prompt}<end_of_turn><eos>\n"

prompt = (
    USER_CHAT_TEMPLATE.format(prompt="What is a good place for travel in the US?")
    + MODEL_CHAT_TEMPLATE.format(prompt="California.")
    # 开启新的对话
    + USER_CHAT_TEMPLATE.format(prompt="What can I do in California?")
    + "<start_of_turn>model\n" # 等待模型生成回答
)

model.generate(
    USER_CHAT_TEMPLATE.format(prompt=prompt),
    device=device,
    output_len=100,
)

ValueError: The following `model_kwargs` are not used by the model: ['device', 'output_len'] (note: typos in the generate arguments will also show up in this list)

# tokenizer

In [None]:
from transformers import AutoTokenizer

# 初始化Gemma tokenizer
tokenizer_path = "gemma_pytorch-main/tokenizer/tokenizer.model"
gemma_tokenizer = Tokenizer(tokenizer_path)

# 初始化HuggingFace tokenizer
# hf_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")

# 示例使用方法
def tokenize_text(text, use_hf=False):
    if use_hf:
        # 使用HuggingFace tokenizer
        tokens = hf_tokenizer(text, return_tensors="pt")
        return tokens
    else:
        # 使用Gemma原生tokenizer
        tokens = gemma_tokenizer.encode(text)
        return tokens

In [None]:
tokenize_text("Hello, how are you?", use_hf=False)

# load dataset

In [1]:
# AutoDL官方学术资源加速
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [None]:
from datasets import load_dataset

ds = load_dataset("SetFit/mrpc")

# download model

In [4]:
from huggingface_hub import login
# 替换 YOUR_TOKEN 为你的 Hugging Face token
# 可以从 https://huggingface.co/settings/tokens 获取
login(token="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", write_permission=False)

In [2]:
# import torch
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM

# # 指定自定义下载路径
# cache_dir = "/root/autodl-tmp/gemma"  # 替换为你想要的路径

# tokenizer = AutoTokenizer.from_pretrained(
#     "google/gemma-2-9b",
#     cache_dir=cache_dir
# )
# # model = AutoModelForCausalLM.from_pretrained(
# #     "google/gemma-2-9b",
# #     cache_dir=cache_dir
# # )
# # 使用 4-bit 量化加载模型
# model = AutoModelForCausalLM.from_pretrained(
#     "google/gemma-2-9b",
#     cache_dir=cache_dir,
#     device_map="auto",
#     load_in_4bit=True,  # 4-bit 量化可以将显存需求降至约 8GB
#     torch_dtype=torch.float16
# )

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig  # 导入 BitsAndBytesConfig

# 清理 GPU 缓存
torch.cuda.empty_cache()

# 配置4-bit量化参数
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"  # 使用 normal float 4 量化类型
)

# 指定自定义下载路径
cache_dir = "/root/autodl-tmp/gemma"

tokenizer = AutoTokenizer.from_pretrained(
    "google/gemma-2-9b",
    cache_dir=cache_dir
)

# 使用新的量化配置方式加载模型
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b",
    cache_dir=cache_dir,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quantization_config  # 使用量化配置
)

In [14]:
def chat_with_gemma(prompt, max_length=256):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        # 添加以下参数来更好地控制生成
        max_new_tokens=50,  # 限制新生成的token数量
        eos_token_id=tokenizer.eos_token_id,  # 设置结束标记
        early_stopping=True  # 启用早停
    )
    
    # 只返回模型的回答部分
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # 如果回答中包含了用户输入，只保留模型回答部分
    if "<start_of_turn>user" in response:
        response = response.split("<start_of_turn>user")[0]
    
    return response.strip()

# 测试对话
# prompt = """<start_of_turn>user
# 你好，请介绍一下自己。
# <end_of_turn>
# <start_of_turn>model"""

response = chat_with_gemma(prompt)
print("模型回答:", response)

Both `max_new_tokens` (=50) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


模型回答: user
What can I do in California?
model
What can I do in California?
What can I do in California?
What can I do in California?
What can I do in California?
What can I do in California?
What can I do in California?
What can


In [13]:
# Use the model

USER_CHAT_TEMPLATE = "<start_of_turn>user\n{prompt}<end_of_turn><eos>\n"
MODEL_CHAT_TEMPLATE = "<start_of_turn>model\n{prompt}<end_of_turn><eos>\n"

prompt = (
    USER_CHAT_TEMPLATE.format(prompt="What is a good place for travel in the US?")
    + MODEL_CHAT_TEMPLATE.format(prompt="California.")
    # 开启新的对话
    + USER_CHAT_TEMPLATE.format(prompt="What can I do in California?")
    + "<start_of_turn>model\n" # 等待模型生成回答
)

# model.generate(
#     USER_CHAT_TEMPLATE.format(prompt=prompt),
# )

# p-tuning

In [None]:
from peft import PromptEncoderConfig, get_peft_model

# 将 GemmaConfig 转换为字典格式
model_config_dict = {
    "num_hidden_layers": model_config.num_hidden_layers,
    "hidden_size": model_config.hidden_size,
    "num_attention_heads": model_config.num_attention_heads
}

peft_config = PromptEncoderConfig(
    task_type="CAUSAL_LM",  # 改为因果语言模型任务
    num_virtual_tokens=20,
    encoder_hidden_size=model_config.hidden_size,  # 使用与基础模型相同的隐藏层大小
    encoder_num_layers=2
)

# 使用转换后的配置
model = get_peft_model(model, peft_config, model_config_dict)
model.print_trainable_parameters()

In [None]:
help(model_config)