<a href="https://colab.research.google.com/github/CHL-edu/backup/blob/main/deepseek1_5b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# 挂载 Google Drive
drive.mount('/content/drive')

# 定义模型保存路径
model_save_path = '/content/drive/MyDrive/Colab Notebooks/deepseek1.5b'
os.makedirs(model_save_path, exist_ok=True)

# 检查 GPU
if not torch.cuda.is_available():
    raise RuntimeError("未检测到 GPU，请启用 GPU 运行时！")
print(f"GPU: {torch.cuda.get_device_name(0)}")

# 安装 bitsandbytes
print("安装 bitsandbytes...")
!pip install bitsandbytes

Mounted at /content/drive


In [None]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from huggingface_hub import snapshot_download
import torch

# Fetch the model snapshot into model_save_path unless a config.json is
# already present there (used as the "already downloaded" marker).
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
config_marker = os.path.join(model_save_path, "config.json")
if os.path.exists(config_marker):
    print(f"模型已存在于 {model_save_path}。")
else:
    print(f"下载模型到 {model_save_path}...")
    snapshot_download(
        repo_id=model_name,
        local_dir=model_save_path,
        local_dir_use_symlinks=False,
    )
    print("下载完成！")

# Load the tokenizer from the local model directory.
print("加载分词器...")
tokenizer = AutoTokenizer.from_pretrained(model_save_path)

# When the tokenizer defines no pad token, reuse the EOS token id for padding.
if tokenizer.pad_token_id is None:
    eos_id = tokenizer.eos_token_id
    tokenizer.pad_token_id = eos_id
    print(f"pad_token_id 设置为: {eos_id}")

# 8-bit quantization settings.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the model from the local directory with fp16 dtype, automatic device
# placement, the 8-bit quantization config and the "sdpa" attention backend.
print("加载模型到 GPU...")
load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
    "trust_remote_code": True,
    "quantization_config": quant_config,
    "attn_implementation": "sdpa",
}
model = AutoModelForCausalLM.from_pretrained(model_save_path, **load_options)

model_config = model.config

# Clear the sliding-window setting when the config exposes one.
if hasattr(model_config, "sliding_window"):
    model_config.sliding_window = None
    print("已禁用 Sliding Window Attention。")

# Keep the model's pad token id in sync with the tokenizer's.
model_config.pad_token_id = tokenizer.pad_token_id
print("模型加载成功！")

# Interactive loop: read a question, generate a continuation, print it.
# Typing '退出' ends the loop.
print("\n欢迎使用 DeepSeek-1.5B！输入 '退出' 结束。")
while True:
    try:
        prompt = input("问题：")
    except (EOFError, KeyboardInterrupt):
        # Treat Ctrl-C / closed stdin at the prompt like an explicit exit so
        # that whatever follows the loop still gets to run.
        print("退出程序。")
        break

    if prompt.strip().lower() == "退出":
        print("退出程序。")
        break
    if not prompt.strip():
        # Nothing to answer; ask again instead of generating from blank input.
        continue

    # Encode the prompt and move the tensors to the device the model lives
    # on (the model is placed with device_map="auto", so do not hard-code it).
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        return_attention_mask=True
    ).to(model.device)

    # Generate. `max_new_tokens` bounds only the generated part; the previous
    # `max_length=100` also counted the prompt tokens, so long prompts left
    # little or no room for an answer.
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=100,  # keep answers short to avoid rambling
        temperature=0.6,
        top_p=0.95,
        do_sample=True,
        repetition_penalty=1.2,  # penalize repeated content
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    # The returned sequence starts with the prompt tokens; drop them so only
    # the newly generated text is shown as the answer.
    prompt_length = inputs.input_ids.shape[1]
    response = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True
    )
    print("\n回答：")
    print(response)
    print("-" * 50)

# Release GPU memory held by the model.
import gc

del model
# Collect garbage first: if the model is still reachable only through
# reference cycles, its tensors are not freed by `del` alone and
# empty_cache() would have nothing to return to the driver.
gc.collect()
torch.cuda.empty_cache()
print("已清理显存。")