# 下载训练数据 + 预处理

In [1]:
!pip install -q huggingface_hub

In [2]:
from huggingface_hub import hf_hub_download
import zipfile
import os
import shutil

def get_train_zip(name):
  # 下载数据集并且解压到databases文件夹
  dataset_path = hf_hub_download(repo_id="Follow-Lang/set.mm", repo_type="dataset", filename=f"datasets/train/{name}.zip")
  with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
    zip_ref.extractall("databases/train/")

def get_folder_size(folder_path):
    total_size = 0
    # os.walk() generates the file names in a directory tree
    for dirpath, dirnames, filenames in os.walk(folder_path):
        for filename in filenames:
            # Join the directory path with the filename to get full file path
            file_path = os.path.join(dirpath, filename)
            # Only add file size if it's a file (skip if it's a symbolic link, etc.)
            if os.path.isfile(file_path):
                total_size += os.path.getsize(file_path) / (1024 * 1024) # Convert bytes to MB
    return total_size

In [3]:

# 删除旧文件夹
if os.path.exists('databases/train'):
  shutil.rmtree('databases/train')

# 下载数据集并且解压到databases文件夹
train_files = ['train_0', 'train_1']

for file in train_files:
  get_train_zip(file)

train_size = get_folder_size("databases/train")

print(f"train data size = {train_size} MB")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train_0.zip:   0%|          | 0.00/95.1M [00:00<?, ?B/s]

train_1.zip:   0%|          | 0.00/120M [00:00<?, ?B/s]

train data size = 3120.514115333557 MB


In [4]:
import os

def get_train_lines(folder_path, cut_length = 1024+2):
    total_lines = 0
    max_length = 0
    n_not_cut = 0
    # os.walk() generates the file names in a directory tree
    for dirpath, dirnames, filenames in os.walk(folder_path):
        for filename in filenames:
            # Join the directory path with the filename to get full file path
            file_path = os.path.join(dirpath, filename)
            # Only count lines if it's a file (skip symbolic links, etc.)
            if os.path.isfile(file_path):
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line in f:
                      total_lines += 1  # Count the number of lines
                      n = len(line.strip().split(' '))
                      if n <= cut_length:
                        n_not_cut += 1
                      max_length = max(max_length, n)

    return total_lines, max_length, n_not_cut


In [5]:
train_lines, max_length, n_not_cut = get_train_lines("databases/train")
print(f"Total Train Lines: {train_lines}")
print(f"Max Length: {max_length}")
print(f"Number of Not Cut: {n_not_cut}")

Total Train Lines: 4194368
Max Length: 2050
Number of Not Cut: 4156985


In [6]:
from huggingface_hub import hf_hub_download
words_path = hf_hub_download(repo_id="Follow-Lang/set.mm", repo_type="dataset", filename=f"datasets/train/words.txt")
with open(words_path, 'r') as f:
  words = ['<bos>', '<eos>', '<pad>'] + [line.strip() for line in f.readlines()]
word_map = {word: idx for idx, word in enumerate(words)}
print(len(words))
bos = word_map['<bos>']
eos = word_map['<eos>']
pad = word_map['<pad>']
end_of_state = word_map['</state>']
end_of_action = word_map['</action>']
ignore_id = -100

def encode_ids(s: str) -> list[int]:
    return [word_map.get(word, eos) for word in s.split(' ')]

def decode_ids(ids: list[int]) -> str:
    return ' '.join([words[i] if i >= 0 else "<pad>" for i in ids])

datasets/train/words.txt:   0%|          | 0.00/14.4k [00:00<?, ?B/s]

2578


# Train a Model

In [7]:
!pip install -q --upgrade transformers datasets torch

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.4/906.4 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
import random
import numpy as np
import torch

# 生成一个随机种子
seed = random.randint(0, 2**32 - 1)

# 设置各个随机数生成器的种子
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# 输出种子，便于复现
print(f"Random seed set to: {seed}")

Random seed set to: 2864798369


In [9]:

from datasets import IterableDataset
import os
import random

max_length = 1024
files_per_batch = 100

# 生成器函数，用于逐行读取每个 .txt 文件
def generate_data_from_files(folder_path):
    txt_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".txt")]
    # 随机打乱文件顺序
    random.shuffle(txt_files)

    cumulate_lines = 0
    file_index = 0
    file_line_index = 0
    file_lines = []

    def load_next_file():
        nonlocal txt_files, file_lines, file_index
        if file_index >= len(txt_files):
            # 循环
            random.shuffle(txt_files)
            file_index = 0
        file_name = txt_files[file_index]
        with open(file_name, 'r', encoding='utf-8') as f:
            file_lines.extend(f.readlines())
        file_index += 1

    def load_next_batch_file():
        nonlocal file_lines, file_line_index
        file_lines = []
        file_line_index = 0
        for _ in range(files_per_batch):
            load_next_file()
        random.shuffle(file_lines)

    load_next_batch_file()

    def handle_line(line):
        if len(line.strip()) == 0:
            return None, None, None
        costs, words = line.strip().split('\t')
        # action_index = words.index('</action>') + len('</action>')
        # words = words[:action_index]
        toks = encode_ids(words)
        n = len(toks)
        if n < max_length:
            toks += [eos] + [pad] * (max_length - n - 1)
        elif n > max_length:
            # toks = toks[:max_length]
            return None, None, None
        # 填充 eos 到指定的 max_length 长度
        index_of_end_of_state = toks.index(end_of_state)
        # 使用 AutoModelForCausalLM 的时候，labels不需要shift，模型内部会shift一下。
        # 但是我使用自定义的loss，需要平移label
        if n < max_length:
            labels = [ignore_id] * (index_of_end_of_state) + toks[index_of_end_of_state+1:n+1] + [ignore_id] * (max_length-n)
            attention_mask = [1] * n + [0] * (max_length-n)
        else:
            labels = [ignore_id] * (index_of_end_of_state) + toks[index_of_end_of_state+1:] + [ignore_id]
            attention_mask = [1] * max_length
        assert len(toks) == len(labels)
        assert len(toks) == len(attention_mask)
        return toks, labels, attention_mask

    def data_generator():
        nonlocal file_line_index, file_lines  # 声明为非局部变量，以便在生成器中修改

        while file_line_index < len(file_lines):
            line = file_lines[file_line_index]
            file_line_index += 1
            toks, labels, attention_mask = handle_line(line)
            if toks is None or len(toks) == 0:
              continue
            yield {'input_ids': toks, 'labels':labels, 'attention_mask':attention_mask}
            if file_line_index >= len(file_lines):
                load_next_batch_file()

    return IterableDataset.from_generator(data_generator)

In [10]:
train_dataset = generate_data_from_files("databases/train")
num_samples = 10  # 定义采样次数
for i, example in enumerate(train_dataset):
    index = example['input_ids'].index(end_of_state)
    print(f"{i} input : {decode_ids(example['input_ids'][:index+1])}")
    print(f"{i} target: {decode_ids(example['input_ids'][index+1:])}")
    print(f"{i} label : {decode_ids(example['labels'][index:])}")
    if i >= num_samples:
        break

0 input : <state> |- wcel cv gs0 cab gs0 wtru |- wceq cvv cab gs0 wtru </state>
0 target: <action> |- wceq cvv cab gs0 wtru </action> <state> |- wcel cv gs0 cab gs0 wtru </state> <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

In [11]:
from google.colab import drive
drive.mount('/content/drive')

ValueError: mount failed

In [None]:
from transformers import AutoConfig, AutoModelForCausalLM, TrainingArguments, Trainer
import torch
import os
from datasets import Dataset
import wandb


# 判断是否有可用的 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 指定模型保存的路径
model_name_or_path = "gpt2"
output_dir = f"/content/drive/MyDrive/{model_name_or_path}"

# 初始化 wandb，恢复之前的 run
wandb_project = "gpt2"
wandb_run_id = "full-test-01"
wandb.init(project=wandb_project, resume="allow", id=wandb_run_id)

# 确保 output_dir 文件夹存在
os.makedirs(output_dir, exist_ok=True)

# 查找是否有保存的检查点
checkpoints = [os.path.join(output_dir, d) for d in os.listdir(output_dir) if d.startswith("checkpoint")]
if checkpoints:
    latest_checkpoint = max(checkpoints, key=os.path.getctime)  # 按创建时间排序选择最新的 checkpoint
    print(f"Using checkpoint: {latest_checkpoint}")
    model = AutoModelForCausalLM.from_pretrained(latest_checkpoint, ignore_mismatched_sizes=True)
else:
    latest_checkpoint = None
    print(f"No checkpoint found, using {model_name_or_path}")
    config = AutoConfig.from_pretrained(model_name_or_path, bos_token_id=eos, eos_token_id=eos, pad_token_id=eos, vocab_size=4096)
    print(config)
    model = AutoModelForCausalLM.from_config(config=config)

batch_size = 8
gradient_accumulation_steps = 32
N = train_lines # 2 * 1024 * 1024  # 总数据量
steps_per_epoch = N // (batch_size * gradient_accumulation_steps)
num_epochs = 1  # 训练轮数
# max_steps = num_epochs * steps_per_epoch
max_steps = 1000
print(f"max_steps: {max_steps}")

training_args = TrainingArguments(
    output_dir=output_dir,  # 保存模型到指定路径
    per_device_train_batch_size=batch_size,  # 单个设备的batch size
    logging_dir="./logs",  # 日志保存路径
    logging_steps=10,  # 每 100 步记录一次loss
    evaluation_strategy="no",  # 禁用验证
    save_strategy="steps",  # 每隔一段时间保存模型
    save_steps=10,  # 每 10 步保存一次
    max_steps=max_steps,  # 设置训练总步数为整数
    save_total_limit=3,  # 最多保留3个保存点
    load_best_model_at_end=False,  # 不需要在训练结束时加载最好的模型
    learning_rate=1e-4,  # 初始学习率
    lr_scheduler_type="cosine",  # 使用余弦学习率调度器
    warmup_steps=100,  # warmup 步数为 100
    fp16=True,  # 启用混合精度训练
    resume_from_checkpoint=latest_checkpoint if latest_checkpoint else None,  # 从检查点恢复
    report_to="wandb",  # 禁用wandb和其他报告工具
    gradient_accumulation_steps=gradient_accumulation_steps,  # 累计 16 个batch 的梯度
    weight_decay=0.1,  # 使用 weight decay 防止过拟合
    adam_beta1=0.9,  # Adam 优化器的 beta1 参数
    adam_beta2=0.95,  # Adam 优化器的 beta2 参数
    max_grad_norm=1.0,  # 梯度裁剪，防止梯度爆炸
    run_name=wandb_project
)

class CustomTrainer(Trainer):
    def __init__(self, entropy_weight=0.1, label_pad_token_id=-100, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.entropy_weight = entropy_weight  # 保存熵的权重
        self.label_pad_token_id = label_pad_token_id
        self.loss_fct = torch.nn.CrossEntropyLoss(ignore_index=label_pad_token_id,reduction="sum")
        self.loss_scale = 1.0 / (self.args.gradient_accumulation_steps * self.args.per_device_train_batch_size)

    def compute_loss(self, model, inputs, return_outputs=False, *args, **kwargs):
        # 计算模型输出
        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        logits = outputs.logits
        # 计算标准损失
        loss = self.loss_fct(logits.view(-1, logits.size(-1)), inputs['labels'].view(-1)) * self.loss_scale
        # 计算熵
        if self.entropy_weight > 0:
            softmax_logits = torch.nn.functional.softmax(logits, dim=-1)
            log_softmax_logits = torch.nn.functional.log_softmax(logits, dim=-1)
            # 计算每个 token 的熵
            token_entropy = -torch.sum(softmax_logits * log_softmax_logits, dim=-1)
            # 使用 attention_mask 屏蔽无效的熵值
            masked_entropy = token_entropy * inputs['attention_mask']
            entropy = masked_entropy.sum() * self.loss_scale
        else:
            entropy = torch.tensor(0.0).to(logits.device)
        # 结合标准损失和熵
        total_loss = loss - self.entropy_weight * entropy
        return (total_loss, outputs) if return_outputs else total_loss

train_dataset = generate_data_from_files("databases/train")
test_dataset = generate_data_from_files("databases/train")
# 定义Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    entropy_weight=0.001,  # 传递熵权重
    label_pad_token_id=ignore_id
)

In [None]:
import torch

# 计算模型中包含 NaN 值的参数数量
nan_params = sum(torch.isnan(p).sum().item() for p in model.parameters())

print(f"Number of NaN values in model parameters: {nan_params}")

# 原始代码，仍保留参数数量的计算
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total parameters: {total_params / (1000 * 1000)} M")
print(f"Trainable parameters: {trainable_params / (1000 * 1000)} M")


In [None]:
# 继续训练
# trainer.train(resume_from_checkpoint=True if latest_checkpoint else False)

In [None]:
import torch
from transformers import AutoConfig, AutoModelForCausalLM

# 指定模型保存的路径
model_name_or_path = "gpt2"
output_dir = f"/content/drive/MyDrive/{model_name_or_path}"

# 确保 output_dir 文件夹存在
os.makedirs(output_dir, exist_ok=True)

# 查找是否有保存的检查点
checkpoints = [os.path.join(output_dir, d) for d in os.listdir(output_dir) if d.startswith("checkpoint")]
latest_checkpoint = max(checkpoints, key=os.path.getctime)  # 按创建时间排序选择最新的 checkpoint
print(f"Using checkpoint: {latest_checkpoint}")
config = AutoConfig.from_pretrained(latest_checkpoint)
model = AutoModelForCausalLM.from_pretrained(latest_checkpoint, config=config, ignore_mismatched_sizes=True)
# 将模型设置为评估模式
model.eval()

test_dataset = generate_data_from_files("databases/train")

In [None]:
import torch
from torch.nn import functional as F

num_samples = 10  # 定义采样次数
total_loss = 0.0
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction="sum")  # 忽略 pad 标记的部分

for i, example in enumerate(test_dataset):
    input_ids = example['input_ids']
    labels = example['labels']
    # attention_mask = example['attention_mask']

    # 将数据转换为 Tensor 格式
    idx = input_ids.index(end_of_state)
    # 防止token穿越，完全不传后面的数据
    # input_tensor = torch.tensor(input_ids[:idx+1] + [eos] * (len(input_ids) - idx - 1)).unsqueeze(0)
    input_tensor = torch.tensor(input_ids).unsqueeze(0)
    labels_tensor = torch.tensor(labels).unsqueeze(0)
    attention_mask_tensor = torch.tensor(attention_mask).unsqueeze(0)

    # 使用模型预测
    outputs = model(input_tensor, attention_mask=attention_mask_tensor)
    logits = outputs.logits
    entropy = -torch.sum(
                torch.nn.functional.softmax(logits, dim=-1) *
                torch.nn.functional.log_softmax(logits, dim=-1), dim=-1
            ).sum()

    # 计算损失
    loss = loss_fn(logits.view(-1, logits.size(-1)), labels_tensor.view(-1))
    total_loss += loss.item()

    idx = input_ids.index(end_of_state)
    print(f"Input  {i}: {decode_ids(input_ids[:idx+1])}")
    print(f"Target {i}: {decode_ids(input_ids[idx+1:])}")
    output_ids = torch.argmax(logits, dim=-1).squeeze().tolist()
    print(f"Result {i}: {decode_ids(output_ids[idx+1:])}")
    print(f"Loss {i}: {loss.item()}, Entropy {i}: {entropy.item()}")

    if i >= num_samples:
        break

average_loss = total_loss / (i + 1)
print(f"Average Loss: {average_loss}")

# 从某个模型采样数据


In [None]:
!pip install -q --upgrade transformers datasets torch

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer

model_name = "Salesforce/codegen-350M-multi"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set your input text
input_text = "void usertrapret(void) {"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
print(input_ids)
print(tokenizer.decode(input_ids[0], skip_special_tokens=False))

# Generate text
output = model.generate(
    input_ids,
    max_length=100,   # set to desired length
    num_return_sequences=1,
    do_sample=True,   # enables sampling
    temperature=0.7   # controls creativity
)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=False)
print(generated_text)

In [None]:
input_text = "<state> |- wi ( wn ( vw1 ) , wi ( wceq ( vc3 , cif ( vw1 , vc2 , vc3 ) ) , wb ( vw5 , vw0 ) ) ) -| wi ( wceq ( vc3 , cif ( vw1 , vc2 , vc3 ) ) , wb ( vw5 , vw0 ) ) -| wi ( wi ( wceq ( vc3 , cif ( vw1 , vc2 , vc3 ) ) , wb ( vw5 , vw0 ) ) , wi ( wn ( vw1 ) , wi ( wceq ( vc3 , cif ( vw1 , vc2 , vc3 ) ) , wb ( vw5 , vw0 ) ) ) ) </state> <action> |- wi ( wn ( vw1 ) , wi ( wceq ( vc3 , cif ( vw1 , vc2 , vc3 ) ) , wb ( vw5 , vw0 ) ) ) -| wi ( wceq ( vc3 , cif ( vw1 , vc2 , vc3 ) ) , wb ( vw5 , vw0 ) ) -| wi ( wi ( wceq ( vc3 , cif ( vw1 , vc2 , vc3 ) ) , wb ( vw5 , vw0 ) ) , wi ( wn ( vw1 ) , wi ( wceq ( vc3 , cif ( vw1 , vc2 , vc3 ) ) , wb ( vw5 , vw0 ) ) ) ) </action> <qed> <eos>"

input_ids = tokenizer(input_text, return_tensors="pt").input_ids
print(input_ids)
print(' ; '.join(tokenizer.decode(input_ids[0], skip_special_tokens=False)))
print(len(input_text.split(' ')), input_ids.shape)