# Run train_base.py (LoRA / QLoRA) from Drive
本 Notebook 包含以下步骤：
1. 挂载 Google Drive（读取你已保存的 train_base.py / Configs / Data）
2. 安装依赖（transformers, datasets, peft, bitsandbytes, accelerate, huggingface_hub）
3. 设置 HF_TOKEN（从 Colab Secrets / 交互输入）
4. 做一个小规模 debug 子集并运行 train_base.py 进行快速 smoke-test
5.（可选）运行完整训练

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# 修改为你自己的项目根目录（与 train_base.py 写入位置一致）
BASE_DIR = "/content/drive/MyDrive/Final_Project"   # <- 如果各位的路径不同，请修改
SCRIPTS_DIR = f"{BASE_DIR}/Scripts"
CONFIGS_DIR = f"{BASE_DIR}/Configs"
DATA_DIR = f"{BASE_DIR}/Data"
MODELS_DIR = f"{BASE_DIR}/Models"

print("BASE_DIR =", BASE_DIR)
!ls -la "{BASE_DIR}"


Mounted at /content/drive
BASE_DIR = /content/drive/MyDrive/Final_Project
total 94
-rw------- 1 root root 75085 Nov  9 11:45 AIAA3102-FinalProject_Awareness_Ignorance.ipynb
drwx------ 2 root root  4096 Nov 13 10:19 Configs
drwx------ 2 root root  4096 Nov 13 10:19 Data
-rw------- 1 root root  8196 Nov 13 08:01 .DS_Store
drwx------ 2 root root  4096 Nov 13 10:19 Scripts


# Install dependencies and import libraries

In [None]:
!pip install transformers datasets peft bitsandbytes accelerate huggingface_hub
# 需要一乃乃时间

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


# Check CUDA

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
import bitsandbytes as bnb

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


Using device: cuda


# Verify that the script, config, and data files exist

In [None]:
import os
paths = {
    "script": f"{SCRIPTS_DIR}/train_base.py",
    "configs": CONFIGS_DIR,
    "train": f"{DATA_DIR}/train.jsonl",
    "valid": f"{DATA_DIR}/valid.jsonl",
    "unknown": f"{DATA_DIR}/unknown_test.jsonl",
}
for k, p in paths.items():
    print(k, "exists:", os.path.exists(p), p)

# 打印前几行检查
print("\n--- train.jsonl (first 3 lines) ---")
!head -n 3 "{paths['train']}"
print("\n--- configs (list) ---")
!ls -la "{CONFIGS_DIR}"


script exists: True /content/drive/MyDrive/Final_Project/Scripts/train_base.py
configs exists: True /content/drive/MyDrive/Final_Project/Configs
train exists: True /content/drive/MyDrive/Final_Project/Data/train.jsonl
valid exists: True /content/drive/MyDrive/Final_Project/Data/valid.jsonl
unknown exists: True /content/drive/MyDrive/Final_Project/Data/unknown_test.jsonl

--- train.jsonl (first 3 lines) ---
{"prompt": "Help debug this code snippet by adding comments with the appropriate error messages.\ndef greetings(name):\nnum = 5\nmessage = \"Hello, \" + name + \"!\"\nreturn message", "response": "def greetings(name):\n    # assert name is a string\n    assert isinstance(name, str), \"name must be a string\"\n    num = 5\n    # \"message\" variable should be defined before using it \n    message = \"Hello, \" + name + \"!\"\n    return message"}
{"prompt": "Create a JavaScript function which takes a string and returns the longest word in it.\n\"This is an example string\"", "response

# Debug Dataset Generation

In [None]:
# Create tiny debug subsets to run a quick smoke-test (avoid long runs)
import shutil
from pathlib import Path
# The debug subsets are written next to the full dataset files in DATA_DIR.
p_data = Path(DATA_DIR)
debug_train = p_data.joinpath("train_debug.jsonl")
debug_valid = p_data.joinpath("valid_debug.jsonl")

def subset(src, dst, n=20):
    """Copy at most the first ``n`` lines of ``src`` into ``dst``.

    Both files are handled as UTF-8 text. ``dst`` is opened in write mode, so
    any existing content is replaced. Lines are copied verbatim, including
    their line endings.

    Args:
        src: Path of the file to read from.
        dst: Path of the file to (over)write.
        n: Maximum number of lines to copy.

    Returns:
        None.
    """
    with open(src, 'r', encoding='utf-8') as reader:
        with open(dst, 'w', encoding='utf-8') as writer:
            copied = 0
            for row in reader:
                # Stop as soon as the requested number of lines is written.
                if copied >= n:
                    break
                writer.write(row)
                copied += 1

subset(paths["train"], debug_train, n=20)
subset(paths["valid"], debug_valid, n=10)
print("Debug subsets created:", debug_train, debug_valid)
!wc -l "{debug_train}" "{debug_valid}"


Debug subsets created: /content/drive/MyDrive/Final_Project/Data/train_debug.jsonl /content/drive/MyDrive/Final_Project/Data/valid_debug.jsonl
  20 /content/drive/MyDrive/Final_Project/Data/train_debug.jsonl
  10 /content/drive/MyDrive/Final_Project/Data/valid_debug.jsonl
  30 total


# Training

In [None]:
# Debug (smoke-test) command, run against the tiny *_debug.jsonl subsets.
# Note: --config_dir points at the Configs folder on your Drive.
# The command is wrapped in a triple-quoted string, so it is kept for reference
# and is not meant to execute; remove the quotes to run it.
'''
!python "{SCRIPTS_DIR}/train_base.py" \
  --config_dir "{CONFIGS_DIR}" \
  --train_file "{DATA_DIR}/train_debug.jsonl" \
  --valid_file "{DATA_DIR}/valid_debug.jsonl" \
  --overwrite_output_dir \
  --num_train_epochs 5 \
  --learning_rate 3e-4 \
  --per_device_train_batch_size 4 \
  --gradient_accumulation_steps 2
  '''

# Full training command, run against the complete train/valid files.
# Hyperparameters come from the config files in CONFIGS_DIR; the flags below
# pass explicit values on the command line.
# NOTE(review): presumably these CLI values override the config files — confirm
# against train_base.py.
!python "{SCRIPTS_DIR}/train_base.py" \
  --config_dir "{CONFIGS_DIR}" \
  --train_file "{DATA_DIR}/train.jsonl" \
  --valid_file "{DATA_DIR}/valid.jsonl" \
  --overwrite_output_dir \
  --num_train_epochs 1 \
  --learning_rate 1e-3 \
  --per_device_train_batch_size 8 \
  --metric_for_best_model "eval_loss"

2025-11-14 08:15:35.382384: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763108135.404204    8069 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763108135.410689    8069 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1763108135.427498    8069 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763108135.427525    8069 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763108135.427531    8069 computation_placer.cc:177] computation placer alr

In [12]:
!cp -r /content/models/tinyllama_ai_finetuned /content/drive/MyDrive/Final_Project/Models

# Quick Assessment

In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import os, json

# Resolve the model directory from output_dir in training_args.yaml (or change
# the fallback below to wherever your model lives).
import yaml
with open(f"{CONFIGS_DIR}/training_args.yaml", 'r', encoding='utf-8') as cfg_file:
    # An empty YAML file parses to None; fall back to {} so .get() still works.
    cfg = yaml.safe_load(cfg_file) or {}
outdir = cfg.get("output_dir", f"{BASE_DIR}/models/finetuned_model")
# NOTE: a relative output_dir is resolved against the current working directory.
print("Expecting model at:", outdir)

# Load tokenizer and model from the training output and wrap them in a
# text-generation pipeline.
tokenizer = AutoTokenizer.from_pretrained(outdir)
model = AutoModelForCausalLM.from_pretrained(outdir, device_map="auto")
gen = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Read only the first few validation examples instead of parsing the whole
# file; blank lines are skipped so they cannot crash json.loads.
lines = []
with open(f"{DATA_DIR}/valid.jsonl",'r',encoding='utf-8') as f:
    for raw_line in f:
        if not raw_line.strip():
            continue
        lines.append(json.loads(raw_line))
        if len(lines) >= 5:
            break

for ex in lines:
    # NOTE(review): this template should match the prompt format used during
    # training — confirm against train_base.py.
    prompt = f"### 问：\n{ex['prompt']}\n### 答：\n"
    # Greedy decoding (do_sample=False) keeps the output deterministic.
    out = gen(prompt, max_new_tokens=128, do_sample=False)[0]["generated_text"]
    print("="*40)
    print("PROMPT:", prompt)
    print("OUTPUT:", out)


Expecting model at: models/tinyllama_ai_finetuned


Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


PROMPT: ### 问：
Create a function in C to check whether a given string contains any punctuations or not.
### 答：

OUTPUT: ### 问：
Create a function in C to check whether a given string contains any punctuations or not.
### 答：

### 答swers

### 答swers

### 答swers

### 答swers

### 答swers

### 答swers

### 答swers

### 答swers

### 答swers

### 答swers

### 答swers

### 答swers

### 答swers

### 答swers

##
PROMPT: ### 问：
Design a function in PHP that takes two strings as input and return true if the strings are an anagram of each other.
string1 = “listen”
string2 = “silent”
### 答：

OUTPUT: ### 问：
Design a function in PHP that takes two strings as input and return true if the strings are an anagram of each other.
string1 = “listen”
string2 = “silent”
### 答：
string1 = string2
string2 = string1
### 答而非
string1 = string2
string2 = string1
### 答而非
string1 = string2
string2 = string1
### 答而非
string1 = string2
string2 = string1
### 答而非
string1 = string2
string2 = string1
### 答而非
string1 = string2
string2 = 

In [15]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import os, json
import torch

# Resolve the adapter directory from output_dir in training_args.yaml.
import yaml
with open(f"{CONFIGS_DIR}/training_args.yaml", 'r', encoding='utf-8') as cfg_file:
    # An empty YAML file parses to None; fall back to {} so .get() still works.
    cfg = yaml.safe_load(cfg_file) or {}
outdir = cfg.get("output_dir", f"{BASE_DIR}/models/finetuned_model")
# NOTE: a relative output_dir is resolved against the current working directory.
print("Expecting model at:", outdir)

# Load the base model in half precision.
# `torch_dtype` triggers a deprecation warning on recent transformers releases
# (which prefer `dtype`); it is kept so the cell also runs on older releases.
base_model_name = cfg.get("model_name_or_path", "TinyLlama/TinyLlama_v1.1")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.float16
)

# Attach the LoRA adapter weights saved by training on top of the base model.
model = PeftModel.from_pretrained(model, outdir)

# Read only the first few validation examples instead of parsing the whole
# file; blank lines are skipped so they cannot crash json.loads.
lines = []
with open(f"{DATA_DIR}/valid.jsonl",'r',encoding='utf-8') as f:
    for raw_line in f:
        if not raw_line.strip():
            continue
        lines.append(json.loads(raw_line))
        if len(lines) >= 5:
            break

for ex in lines:
    # NOTE(review): this template should match the prompt format used during
    # training — confirm against train_base.py.
    prompt = f"问：\n{ex['prompt']}\n 答：\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens; everything after this index is newly generated.
    prompt_token_count = inputs["input_ids"].shape[1]

    with torch.no_grad():
        # Greedy decoding. `temperature` is not passed: it only applies when
        # sampling, and combining it with do_sample=False just raises a warning.
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Decode only the new token ids. Slicing the decoded string by len(prompt)
    # is fragile because decoding may not reproduce the prompt text exactly.
    generated_only = tokenizer.decode(
        outputs[0][prompt_token_count:], skip_special_tokens=True
    )
    print("="*40)
    print("PROMPT:", prompt)
    print("OUTPUT:", generated_text)
    print("Generated part only:", generated_only)

Expecting model at: models/tinyllama_ai_finetuned


`torch_dtype` is deprecated! Use `dtype` instead!


PROMPT: 问：
Create a function in C to check whether a given string contains any punctuations or not.
 答：

OUTPUT: 问：
Create a function in C to check whether a given string contains any punctuations or not.
 答：


A: You can use the following code:
#include <stdio.h>
#include <string.h>

int main()
{
    char str[100];
    printf("Enter a string: ");
    scanf("%s", str);
    if (str[0] == '.' || str[0] == '.' || str[0] == '.' || str[0] == '.' || str[0] == '.' || str[0] == '.' || str[0] == '.' || str[0] == '.'
Generated part only: 

A: You can use the following code:
#include <stdio.h>
#include <string.h>

int main()
{
    char str[100];
    printf("Enter a string: ");
    scanf("%s", str);
    if (str[0] == '.' || str[0] == '.' || str[0] == '.' || str[0] == '.' || str[0] == '.' || str[0] == '.' || str[0] == '.' || str[0] == '.'
PROMPT: 问：
Design a function in PHP that takes two strings as input and return true if the strings are an anagram of each other.
string1 = “listen”
string2 = “sil