# Run train_base.py (LoRA / QLoRA) from Drive
本 Notebook 包含以下步骤：
1. 挂载 Google Drive（读取你已保存的 train_base.py / Configs / Data）
2. 安装依赖（transformers, datasets, peft, bitsandbytes, accelerate, huggingface_hub）
3. 设置 HF_TOKEN（从 Colab Secrets / 交互输入）
4. 做一个小规模 debug 子集并运行 train_base.py 进行快速 smoke-test
5.（可选）运行完整训练

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Set this to your own project root (it must match the location where train_base.py was saved).
BASE_DIR = "/content/drive/MyDrive/AIAA3102/Final_Project"   # <- change this if your path differs
# Sub-folders of the project root that the later cells read from.
SCRIPTS_DIR = f"{BASE_DIR}/Scripts"
CONFIGS_DIR = f"{BASE_DIR}/Configs"
DATA_DIR = f"{BASE_DIR}/Data"
MODELS_DIR = f"{BASE_DIR}/Models"

# List the project root so a wrong BASE_DIR is obvious right away.
print("BASE_DIR =", BASE_DIR)
!ls -la "{BASE_DIR}"


# Install dependencies and import libraries

In [None]:
# Install the libraries needed to run train_base.py (LoRA / QLoRA).
!pip install transformers datasets peft bitsandbytes accelerate huggingface_hub
# This takes a little while.

# Check CUDA

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
import bitsandbytes as bnb

# Prefer the GPU when the runtime exposes one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


Using device: cuda


# Check that the script, config, and data files exist

In [None]:
import os
# Paths this notebook expects to find on the mounted Drive, keyed by a short label.
paths = {
    "script": f"{SCRIPTS_DIR}/train_base.py",
    "configs": CONFIGS_DIR,
    "train": f"{DATA_DIR}/train.jsonl",
    "valid": f"{DATA_DIR}/valid.jsonl",
    "unknown": f"{DATA_DIR}/unknown_test.jsonl",
}
# Report, for every entry, whether the path exists.
for k, p in paths.items():
    print(k, "exists:", os.path.exists(p), p)

# Print the first few lines as a sanity check
print("\n--- train.jsonl (first 3 lines) ---")
!head -n 3 "{paths['train']}"
print("\n--- configs (list) ---")
!ls -la "{CONFIGS_DIR}"


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

<generator object Module.named_modules at 0x783262f65f10>


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rot

# Debug Dataset Generation

In [None]:
# Create tiny debug subsets to run a quick smoke-test (avoid long runs)
import shutil
from pathlib import Path
# The debug subsets are written into DATA_DIR, next to the full train/valid files.
p_data = Path(DATA_DIR)
debug_train = p_data / "train_debug.jsonl"
debug_valid = p_data / "valid_debug.jsonl"

def subset(src, dst, n=20):
    """Copy at most the first ``n`` lines of ``src`` into ``dst``.

    Args:
        src: Path of the UTF-8 text file to read from.
        dst: Path of the file to write; it is created or truncated.
        n: Maximum number of leading lines to copy (an integer line count).
    """
    with open(src, 'r', encoding='utf-8') as reader:
        with open(dst, 'w', encoding='utf-8') as writer:
            # range(n) comes first so zip stops as soon as n lines were taken;
            # the generator keeps the copy streaming instead of buffering it.
            writer.writelines(row for _, row in zip(range(n), reader))

# Keep the first 20 training lines and the first 10 validation lines for the smoke-test.
subset(paths["train"], debug_train, n=20)
subset(paths["valid"], debug_valid, n=10)
print("Debug subsets created:", debug_train, debug_valid)
# Show the line count of each debug file.
!wc -l "{debug_train}" "{debug_valid}"


**Training**

In [None]:
# Debug (smoke-test) command: trains on the tiny debug subsets.
# Note: --config_dir points to the Configs folder on your Drive.
!python "{SCRIPTS_DIR}/train_base.py" \
  --config_dir "{CONFIGS_DIR}" \
  --train_file "{DATA_DIR}/train_debug.jsonl" \
  --valid_file "{DATA_DIR}/valid_debug.jsonl" \
  --overwrite_output_dir

# Full-run command: uncomment the lines below to launch it.
# Full training (uses the hyperparameters specified in the configs).
# !python "{SCRIPTS_DIR}/train_base.py" \
#   --config_dir "{CONFIGS_DIR}" \
#   --train_file "{DATA_DIR}/train.jsonl" \
#   --valid_file "{DATA_DIR}/valid.jsonl" \
#   --overwrite_output_dir


# Quick Assessment

In [None]:
import json
import os
from itertools import islice

import yaml
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Locate the fine-tuned model: use output_dir from training_args.yaml when it
# is set, otherwise fall back to a folder under MODELS_DIR.
with open(f"{CONFIGS_DIR}/training_args.yaml", "r", encoding="utf-8") as cfg_file:
    # An empty YAML file parses to None, so normalise to a dict before .get().
    cfg = yaml.safe_load(cfg_file) or {}
# NOTE(review): the fallback now uses MODELS_DIR (".../Models") instead of the
# lowercase ".../models" spelling, to agree with the folder declared at the top
# of the notebook -- confirm this is where train_base.py saves by default.
outdir = cfg.get("output_dir", f"{MODELS_DIR}/finetuned_model")
print("Expecting model at:", outdir)

tokenizer = AutoTokenizer.from_pretrained(outdir)
model = AutoModelForCausalLM.from_pretrained(outdir, device_map="auto")
# The model is already placed by device_map="auto"; passing device= to the
# pipeline as well conflicts with that placement, so it is omitted here.
gen = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Try a few validation examples: parse only the first 5 non-blank lines
# instead of decoding the whole file and slicing afterwards.
with open(f"{DATA_DIR}/valid.jsonl", 'r', encoding='utf-8') as f:
    non_blank = (row for row in f if row.strip())
    lines = [json.loads(row) for row in islice(non_blank, 5)]

for ex in lines:
    # Prompt template; presumably the same format used during training -- TODO confirm.
    prompt = f"### 问：\n{ex['prompt']}\n### 答：\n"
    # Greedy decoding (do_sample=False) keeps the check reproducible.
    out = gen(prompt, max_new_tokens=128, do_sample=False)[0]["generated_text"]
    print("="*40)
    print("PROMPT:", prompt)
    print("OUTPUT:", out)
