In [1]:
!pip install trl



In [5]:
!pip install datasets

Collecting datasets
  Using cached datasets-2.20.0-py3-none-any.whl (547 kB)
Collecting multiprocess
  Using cached multiprocess-0.70.16-py38-none-any.whl (132 kB)
Collecting pyarrow-hotfix
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting xxhash
  Using cached xxhash-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
Collecting dill<0.3.9,>=0.3.0
  Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Collecting requests>=2.32.2
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 1.1 MB/s eta 0:00:01
[?25hCollecting pandas
  Using cached pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
Collecting pyarrow>=15.0.0
  Using cached pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.0 MB)
Collecting tzdata>=2022.1
  Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Installing collected packages: dill, multiprocess, pyarrow-hotfix, xxhash, 

In [6]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 972 kB/s eta 0:00:01
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [7]:
!pip install peft

Collecting peft
  Using cached peft-0.11.1-py3-none-any.whl (251 kB)
Installing collected packages: peft
Successfully installed peft-0.11.1


In [8]:
!pip install tensorboardX

Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 1.3 MB/s ta 0:00:01
Collecting protobuf>=3.20
  Downloading protobuf-5.27.2-cp38-abi3-manylinux2014_x86_64.whl (309 kB)
[K     |████████████████████████████████| 309 kB 5.4 MB/s eta 0:00:01
[?25hInstalling collected packages: protobuf, tensorboardX
Successfully installed protobuf-5.27.2 tensorboardX-2.6.2.2


### Preliminary Preparation

In [None]:
import json

# Load the JSON file
with open('/root/TRL2/dev.jsonl', 'r', encoding='utf-8-sig') as file:
    # Open the output file in write mode
    with open('/root/TRL2/dev_mod.jsonl', 'w', encoding='utf-8-sig') as outfile:
        for line in file:
            item = json.loads(line)
            for i in range(10):
                # Create a new dictionary for each item
                new_item = {'weibo':item['weibo'], 'resp':item['resp'][i]}
                # Dump the new item directly into the file
                json.dump(new_item, outfile, ensure_ascii=False)
                # Write a newline character after each item
                outfile.write('\n')

In [1]:
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

## SFT Phase

In [1]:
from datasets import load_dataset

dataset = load_dataset("gen_sft_dataset.py", trust_remote_code=True)
dataset

DatasetDict({
    train: Dataset({
        features: ['weibo', 'resp'],
        num_rows: 25140
    })
    validation: Dataset({
        features: ['weibo', 'resp'],
        num_rows: 8670
    })
})

In [2]:
print(dataset["train"][0])

{'weibo': '#WTT冠军赛布达佩斯站#\xa0男单1/4决赛林高远3-0宇田幸矢11-3，11-4，11-7', 'resp': '别把我帅死林高远一直这么坚定下去吧！！！！别有太大压力，战胜自己就够了！！！！我永远相信小林将军'}


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).half().cuda()

Setting eos_token is not supported, use the default one.
Setting pad_token is not supported, use the default one.
Setting unk_token is not supported, use the default one.


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [4]:
from peft import LoraConfig, get_peft_model

# Apply LoRA
peft_config = LoraConfig(
    task_type="CAUSAL_LM", 
    r=8,
    lora_alpha=32, 
    lora_dropout=0.1
)
model = get_peft_model(model, peft_config)

In [5]:
tokenizer("冠军赛布达佩斯站")

{'input_ids': [64790, 64792, 30910, 33232, 55001, 54901, 54880, 56313, 54927, 54957], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'position_ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}

In [7]:
# max_input_length = 128
# max_target_length = 32

# def preprocess_function(examples):
#     inputs = examples["weibo"]
#     model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

#     # Setup the tokenizer for targets
#     labels = tokenizer(text_target=examples["resp"], max_length=max_target_length, truncation=True)

#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

import torch

max_input_length = 64
max_target_length = 64

def preprocess_function(examples):
    inputs = [doc for doc in examples["weibo"]]
    # print(inputs)
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding='max_length') # Very severe bug!

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=examples["resp"], max_length=max_target_length, truncation=True, padding='max_length')

    model_inputs["labels"] = labels["input_ids"]

    # Print out the shapes of your input and labels tensors
    print(f'Input shape: {torch.tensor(model_inputs["input_ids"]).shape}')
    print(f'Labels shape: {torch.tensor(model_inputs["labels"]).shape}')
    
    return model_inputs

In [8]:
preprocess_function(dataset['train'][:2])

Input shape: torch.Size([2, 64])
Labels shape: torch.Size([2, 64])


{'input_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64790, 64792, 1126, 30959, 16937, 33232, 55001, 54901, 54880, 56313, 54927, 54957, 31010, 31045, 55033, 54838, 30939, 30967, 30972, 34054, 54973, 54589, 55151, 30966, 30941, 30940, 55773, 55416, 55397, 57953, 30939, 30939, 30941, 30966, 31123, 30939, 30939, 30941, 30972, 31123, 30939, 30939, 30941, 30981], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64790, 64792, 1126, 30959, 16937, 33232, 55001, 54901, 54880, 56313, 54927, 54957, 31010, 31045, 55033, 54838, 30939, 30967, 30972, 34054, 54973, 54589, 55151, 30966, 30941, 30940, 55773, 55416, 55397, 57953, 30939, 30939, 30941, 30966, 31123, 30939, 30939, 30941, 30972, 31123, 30939, 30939, 30941, 30981]], 'attention_mask': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [9]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [10]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['weibo', 'resp', 'input_ids', 'attention_mask', 'position_ids', 'labels'],
        num_rows: 25140
    })
    validation: Dataset({
        features: ['weibo', 'resp', 'input_ids', 'attention_mask', 'position_ids', 'labels'],
        num_rows: 8670
    })
})

In [11]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [11]:
# # from trl import SFTConfig, SFTTrainer

# # def formatting_func(example):
# #     return [f"weibo: {example['weibo']}\nresp: {example['resp']}"]

# from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# args = Seq2SeqTrainingArguments(
#     # dataset_text_field="resp",
#     # max_seq_length=32,
#     output_dir="./model",
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     evaluation_strategy = "epoch",
#     learning_rate=1.5e-5,
#     num_train_epochs=5,
#     logging_dir = "./results/weibo1",
#     report_to = "tensorboard",
# )
# trainer = Seq2SeqTrainer(
#     model,
#     args,
#     tokenizer = tokenizer,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     # data_collator = data_collator,    
#     # args=sft_config,
#     # formatting_func=formatting_func,
# )



In [14]:
from trl import SFTConfig, SFTTrainer

# def formatting_func(example):
#     return [f"weibo: {example['weibo']}\nresp: {example['resp']}"]

sft_config = SFTConfig(
    dataset_text_field="resp",
    max_seq_length=32,
    output_dir="./SFT_Model_Real",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy = "epoch",
    learning_rate=1.5e-4,
    num_train_epochs=5,
    logging_dir = "./results/weibo111",
    report_to = "tensorboard",
    gradient_accumulation_steps = 8,
)

trainer = SFTTrainer(
    model,
    tokenizer = tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator = data_collator,    
    args=sft_config,
    # formatting_func=formatting_func,
)



Map:   0%|          | 0/25140 [00:00<?, ? examples/s]

Map:   0%|          | 0/8670 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [20]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=5):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

show_random_elements(dataset["train"])

Unnamed: 0,weibo,resp
0,#南航机务人员因说机长再见被通报#？官方回应：已注意到相关信息近日，网上流传一张标题为“CZ8847航前出港情况报告”的截图，据截图内容显示，报告中提到两位机务人员主要存在两个问题，一是机务人员没有及时通知机长启动发动机，二是机务人员存在用语不规范问题，本来应该按标准用语跟机长说“机长，飞行愉快，一路平安，再见”，结果只说了“机长再见”。截图内容引起众多网友特别是民航业内人士的热议。据航旅纵横信息显示，CZ8847航班由中国南方航空公司承运，5月2日上午九点航班由北京大兴机场飞往成都双流机场，执飞飞机机型为空客320-200，编号B6977，与网传截图信息一致。5月6日晚，一位在南方航空公司工作的资深人士告诉新黄河记者，“这个通告我后来看到了，因为涉及公司，不方便评论”。5月7日，南方航空公司客服人员告诉记者，已经注意到相关信息，目前没有调查结果反馈，不了解具体细节，建议等官方渠道公布的调查结果为准。,形式主义
1,无奋斗不青春“勇敢不是不怕，而是不逃不退让。什么是热爱？是第一万次重新再来……”青春就该肆意张扬，追逐心中的光。今天是五四青年节，这首《赤脚追光》，送给所有逆风跑、不逃不退让的你～#奋斗青春100年##当代年轻人在职场有多勇#L中国消防,勇敢不是不怕，而是不逃不退让！致敬全体消防员！也感谢对R1SE——《赤脚追光》这首歌的喜欢
2,《仙剑三》古筝笛子钢琴合奏版，太好听了瞬间上头#仙剑三古筝笛子钢琴合奏版##微博喜剧现场#L文科班校花酱,这首歌让我想起我当年在蜀山练剑的日子
3,#两男子抬近300斤新娘进婚礼现场#目击者：感觉非常吃力1月1日，湖北枣阳一婚礼现场，一新娘从婚车内出来准备进入婚礼现场时，由两名男子一路抬着。刚没走几步，一名男子累得大叫，几名男子赶紧上前帮忙。目击者王先生称，目测新娘体重300斤左右，两男子一人抬一条腿非常吃力，旁边还有两名男子扶着生怕摔跤。有人一直鼓励两名男子牙咬紧点，坚持一下，感觉给婚礼平添了不少欢乐和喜庆。#蓝v视界#,拜金男冲着钱去的吧 现在的男人啊越来越拜金了 都爱吃软饭（一个猜想不一定对
4,美国疾控中心：#近六成美国人曾感染新冠病毒#根据美国约翰斯·霍普金斯大学的数据，美国目前报告的累计新冠肺炎确诊病例超过8108万例。而根据美国疾控中心当地时间4月26日发布的最新报告，美国大约已有58%的人口具有因感染过新冠病毒而产生的抗体——以美国人口约为3.3亿来计算的话，也就是说，美国已有大约1.914亿人感染过新冠病毒。,超疾大国


In [None]:
# batch = data_collator([tokenized_datasets["train"][i] for i in range(5)])
# print(batch)

In [15]:
trainer.train()

AttributeError: 'NoneType' object has no attribute 'to'

## RM Phase
10:17 12:11

*Error Came Across*:

1. `AssertionError`: Cannot handle batch sizes > 1 if no padding token is defined.

In [2]:
rm_dataset = load_dataset("gen_rm_dataset.py", trust_remote_code=True)
rm_dataset

DatasetDict({
    train: Dataset({
        features: ['weibo', 'text_j', 'text_k', 'text_j_like', 'text_k_like'],
        num_rows: 9985
    })
    validation: Dataset({
        features: ['weibo', 'text_j', 'text_k', 'text_j_like', 'text_k_like'],
        num_rows: 3451
    })
    test: Dataset({
        features: ['weibo', 'text_j', 'text_k', 'text_j_like', 'text_k_like'],
        num_rows: 7249
    })
})

In [3]:
def preprocess_function(examples):
    new_examples = {
        "input_ids_chosen": [],
        "attention_mask_chosen": [],
        "input_ids_rejected": [],
        "attention_mask_rejected": [],
    }
    for text_j, text_k, text_j_like, text_k_like in zip(examples["text_j"], examples["text_k"], examples["text_j_like"], examples["text_k_like"]):
        if text_j_like >= text_k_like:
            chosen = text_j
            rejected = text_k
        else:
            chosen = text_k
            rejected = text_j        
        
        tokenized_chosen = tokenizer(chosen, padding="max_length", max_length=32, truncation=True) # Same Problem as SFT
        tokenized_rejected = tokenizer(rejected, padding="max_length", max_length=32, truncation=True) # Same Problem as SFT

        new_examples["input_ids_chosen"].append(tokenized_chosen["input_ids"])
        new_examples["attention_mask_chosen"].append(tokenized_chosen["attention_mask"])
        new_examples["input_ids_rejected"].append(tokenized_rejected["input_ids"])
        new_examples["attention_mask_rejected"].append(tokenized_rejected["attention_mask"])

    return new_examples

In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Initial Reward Model
rm_model = AutoModelForSequenceClassification.from_pretrained("gpt2")

# Initial Tokenizer for RM
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
rm_model.config.pad_token_id = tokenizer.pad_token_id # Suggested by Claude 3.5 Sonnet

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Preprocess the dataset
rm_dataset = rm_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
)

In [6]:
rm_train_dataset = rm_dataset["train"]
rm_eval_dataset = rm_dataset["validation"]

In [9]:
from trl import RewardTrainer, RewardConfig

config = RewardConfig(
    output_dir="RM_model7",
    per_device_train_batch_size=32,
    num_train_epochs=150,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-3,
    report_to="tensorboard",
    logging_dir="./results/rm7-Latest7",
    remove_unused_columns=False,
    optim="adamw_torch",
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=500,
    max_length=256,
)

In [10]:
peft_config = LoraConfig(
    task_type="SEQ_CLS", # NOT CAUSAL_LM (bug)
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

In [10]:
### First Formal Trial (1560 steps) bs = 8, lr ~ 5e-5, 10 epochs

In [19]:
trainer = RewardTrainer(
    model=rm_model,
    tokenizer=tokenizer,
    args=config,
    train_dataset=rm_train_dataset,
    eval_dataset=rm_eval_dataset,
    peft_config=peft_config,
)
trainer.train()
trainer.save_model(config.output_dir)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,Accuracy
500,0.7246,0.705777,0.509128
1000,0.6919,0.690434,0.538974
1500,0.7041,0.688446,0.541292










In [None]:
### Second Formal Trial bs = 16, lr = 2e-3, 40 epochs

In [22]:
trainer = RewardTrainer(
    model=rm_model,
    tokenizer=tokenizer,
    args=config,
    train_dataset=rm_train_dataset,
    eval_dataset=rm_eval_dataset,
    peft_config=peft_config,
)
trainer.train()
trainer.save_model(config.output_dir)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Accuracy
500,0.6925,0.70663,0.548826
1000,0.6971,0.708805,0.549406
1500,0.6927,0.711551,0.542741
2000,0.7057,0.70641,0.542162
2500,0.6784,0.709342,0.545059
3000,0.685,0.727206,0.545639
3500,0.6841,0.718486,0.540133
4000,0.6799,0.706571,0.548247
4500,0.668,0.701843,0.551434
5000,0.683,0.70061,0.555781


























In [None]:
### Third Formal Trial (Failed, Gradient Explosion) bs = 32, lr = 2e-2, 150 epochs, g_c = 1

In [33]:
trainer = RewardTrainer(
    model=rm_model,
    tokenizer=tokenizer,
    args=config,
    train_dataset=rm_train_dataset,
    eval_dataset=rm_eval_dataset,
    peft_config=peft_config,
)
trainer.train()
trainer.save_model(config.output_dir)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,Accuracy
500,0.7587,0.691762,0.525065
1000,0.73,0.692003,0.532889
1500,0.0,,1.0
2000,0.0,,1.0






NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf fou

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf fou

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.

KeyboardInterrupt



In [11]:
### Seventh Formal Trial

In [12]:
trainer = RewardTrainer(
    model=rm_model,
    tokenizer=tokenizer,
    args=config,
    train_dataset=rm_train_dataset,
    eval_dataset=rm_eval_dataset,
    peft_config=peft_config,
)
trainer.train()
trainer.save_model(config.output_dir)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Accuracy
500,0.6979,0.716391,0.541582
1000,0.6989,0.70338,0.55636
1500,0.7011,0.712082,0.551145
2000,0.6832,0.71182,0.548826
2500,0.6915,0.708923,0.549696
3000,0.6848,0.73357,0.527673
3500,0.6769,0.704638,0.565054
4000,0.6648,0.705384,0.553463
4500,0.6934,0.705815,0.55665
5000,0.7038,0.712788,0.543611
















































In [20]:
metrics = trainer.evaluate()
trainer.log_metrics("eval", metrics)
print(metrics)





***** eval metrics *****
  epoch                   =      9.992
  eval_accuracy           =      0.541
  eval_loss               =     0.6884
  eval_runtime            = 0:00:08.94
  eval_samples_per_second =    385.626
  eval_steps_per_second   =     48.273
{'eval_loss': 0.6884294748306274, 'eval_accuracy': 0.5410026079397277, 'eval_runtime': 8.9491, 'eval_samples_per_second': 385.626, 'eval_steps_per_second': 48.273, 'epoch': 9.9919935948759}




## PPO Phase

In [1]:
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead

### Configuration

In [2]:
ppo_config = PPOConfig(
    model_name="SFT_Model_T5-Small",
    learning_rate=1.41e-5,
    # log_with="wandb",
)

In [3]:
sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 1}

### Load Dataset

In [4]:
from datasets import load_dataset

ppo_dataset = load_dataset("gen_sft_dataset.py", trust_remote_code=True) # gen_ppo_dataset
ppo_dataset

DatasetDict({
    train: Dataset({
        features: ['weibo', 'resp'],
        num_rows: 25140
    })
    validation: Dataset({
        features: ['weibo', 'resp'],
        num_rows: 8670
    })
})

In [5]:
ppo_dataset["train"]["resp"][1]

'不用呼吸机光速下班'

### Load Pre-trained SFT Model

In [6]:
# from transformers import T5ForConditionalGeneration, T5Tokenizer
from trl import AutoModelForSeq2SeqLMWithValueHead
from transformers import AutoTokenizer

In [7]:
# model = AutoModelForCausalLMWithValueHead.from_pretrained("./SFT_Model_T5-Small")
# ref_model = AutoModelForCausalLMWithValueHead.from_pretrained("./SFT_Model_T5-Small")
# tokenizer = AutoTokenizer.from_pretrained("./SFT_Model_T5-Small")

model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained("./SFT_Model_T5-Small/checkpoint-15720")
ref_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained("./SFT_Model_T5-Small/checkpoint-15720")
tokenizer = AutoTokenizer.from_pretrained("./SFT_Model_T5-Small/checkpoint-15720")

tokenizer.pad_token = tokenizer.eos_token

In [8]:
def tokenize(sample):
    # sample["input_ids"] = tokenizer.encode(sample["weibo"])
    sample = tokenizer(sample["weibo"])
    sample["query"] = tokenizer.decode(sample["input_ids"])
    return sample

ppo_train_dataset = ppo_dataset["train"]
ppo_train_dataset = ppo_train_dataset.map(tokenize) # Bug Fixed (Not In-place)
ppo_train_dataset.set_format(type="torch") # Important! Turn into tensors 

In [9]:
ppo_train_dataset

Dataset({
    features: ['weibo', 'resp', 'input_ids', 'attention_mask', 'query'],
    num_rows: 25140
})

In [10]:
print(ppo_train_dataset[0]['input_ids'])

tensor([ 1713,   518,  9697,     2,  4663,     3,     2,   536, 13572,     2,
        22773,     2,  2596,  3486,     6,  2596,  4278,     6,  2596,  6832,
            1])


### Initialize PPOTrainer 

In [11]:
# from transformers import DataCollatorForSeq2Seq

# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

In [12]:
ppo_trainer = PPOTrainer(ppo_config, model, ref_model, tokenizer, dataset=ppo_train_dataset, data_collator=collator)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


### Load Tuned Reward Model Trained in RM Phase

In [13]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

rm_model = AutoModelForSequenceClassification.from_pretrained("./RM_model/checkpoint-1560")
rm_tokenizer = AutoTokenizer.from_pretrained("./RM_model/checkpoint-1560")
rm_tokenizer.pad_token = tokenizer.eos_token
# GPT
rm_tokenizer.padding_side = "right"  # This ensures consistent padding
rm_model.config.pad_token_id = rm_tokenizer.pad_token_id

sentiment_pipe = pipeline("sentiment-analysis", model=rm_model, tokenizer=rm_tokenizer, padding=True, truncation=True, max_length=32)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


### Generation Settings 

In [14]:
gen_kwargs = {"min_length": -1, "top_k": 0.0, "top_p": 1.0, "do_sample": True, "pad_token_id": tokenizer.eos_token_id}

### Training Loop for Model Optimization

In [15]:
from tqdm import tqdm

In [16]:
ppo_trainer.dataloader

<accelerate.data_loader.DataLoaderShard at 0x7f39df041040>

In [17]:
print(ppo_train_dataset[:5])

{'weibo': ['#WTT冠军赛布达佩斯站#\xa0男单1/4决赛林高远3-0宇田幸矢11-3，11-4，11-7', '#WTT冠军赛布达佩斯站#\xa0男单1/4决赛林高远3-0宇田幸矢11-3，11-4，11-7', '#WTT冠军赛布达佩斯站#\xa0男单1/4决赛林高远3-0宇田幸矢11-3，11-4，11-7', '#WTT冠军赛布达佩斯站#\xa0男单1/4决赛林高远3-0宇田幸矢11-3，11-4，11-7', '#WTT冠军赛布达佩斯站#\xa0男单1/4决赛林高远3-0宇田幸矢11-3，11-4，11-7'], 'resp': ['别把我帅死林高远一直这么坚定下去吧！！！！别有太大压力，战胜自己就够了！！！！我永远相信小林将军', '不用呼吸机光速下班', '“人家肯定拼你的”，“对，所以说这个是被拼的时候，自己要扛住压力这个事。”今天发球落点速度节奏变化都好棒，让大家欣赏一下速度流明天的半决赛将迎战本站第一个右手，放平心态，你谁都不怕！', '“坚定都溢出来了”“怎么可以防的如此自如啊”“非常自信/冷静啊”…没有不可治愈的伤痛，没有不可结束的沉沦。所有失去的，会以另一种方式归来。林高远，好样的，这场打的真漂亮，继续加油，愈战愈勇！', '“全方位的碾压让对方看不到一丝希望！”林高远牛逼！是意气风发的小林将军林高远继续加油！！！！'], 'input_ids': tensor([[ 1713,   518,  9697,     2,  4663,     3,     2,   536, 13572,     2,
         22773,     2,  2596,  3486,     6,  2596,  4278,     6,  2596,  6832,
             1],
        [ 1713,   518,  9697,     2,  4663,     3,     2,   536, 13572,     2,
         22773,     2,  2596,  3486,     6,  2596,  4278,     6,  2596,  6832,
             1],
        [ 1713,   518,  9697,     2,  4663,   

In [18]:
import torch

In [19]:
# from trl.trainer import PPOTrainer

# class DebugPPOTrainer(PPOTrainer):
#     def compute_rewards(self, scores, logprobs, ref_logprobs, masks):
#         print(f"Mask: {masks}")  # Print the mask tensor
#         return super().compute_rewards(scores, logprobs, ref_logprobs, masks)

In [20]:
# output_min_length = 4
# output_max_length = 16
# output_length_sampler = LengthSampler(output_min_length, output_max_length)

generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 0.7,
    "temperature": 0.95, # This one is set implicitly in SFTTrainer as well.
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}


for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)): # Need the specific type of data_collator
    print(f"\nStart Epoch {epoch}")
    # print(batch)
    query_tensors = batch["input_ids"]
    attention_masks = batch["attention_mask"]
    # attention_masks = [query_tensors != tokenizer.pad_token_id]
    # print(attention_masks)

    #### Get response from gpt2
    response_tensors = []
    for i, query in enumerate(query_tensors):
        # gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = 32
        # print(attention_masks[i])
        response = ppo_trainer.generate(query, **generation_kwargs, attention_mask=attention_masks[i].unsqueeze(0))
        response_tensors.append(response.squeeze()[-32:])
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    #### Compute sentiment score
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]
    print(f"Rewards: {rewards}")

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

0it [00:00, ?it/s]


Start Epoch 0




Rewards: [tensor(0.3247), tensor(-0.4279), tensor(1.0311), tensor(1.3849), tensor(1.3149), tensor(0.7697), tensor(-0.0726), tensor(0.3974), tensor(0.8744), tensor(0.6990), tensor(0.7375), tensor(0.8521), tensor(1.3748), tensor(0.6267), tensor(0.4745), tensor(1.5654), tensor(1.2332), tensor(1.1041), tensor(0.6691), tensor(1.4607), tensor(0.8209), tensor(1.9314), tensor(1.0623), tensor(0.4732), tensor(0.2149), tensor(0.2790), tensor(1.1716), tensor(0.4938), tensor(0.8852), tensor(0.5833), tensor(1.0516), tensor(0.6871), tensor(0.2867), tensor(0.9077), tensor(1.0171), tensor(0.9477), tensor(0.6717), tensor(1.2082), tensor(1.0935), tensor(0.2660), tensor(2.1056), tensor(1.3770), tensor(1.4741), tensor(0.4571), tensor(1.0620), tensor(1.1657), tensor(0.4745), tensor(0.3918), tensor(0.6624), tensor(0.6110), tensor(0.4825), tensor(0.4429), tensor(1.5289), tensor(0.3845), tensor(1.3412), tensor(2.0855), tensor(1.0573), tensor(0.6064), tensor(1.1378), tensor(2.2293), tensor(1.4243), tensor(1.368

1it [01:14, 74.69s/it]


Start Epoch 1
Rewards: [tensor(1.6634), tensor(0.8338), tensor(0.8952), tensor(0.8450), tensor(1.1459), tensor(1.0398), tensor(1.3422), tensor(0.0995), tensor(1.0808), tensor(1.0666), tensor(1.3784), tensor(0.5546), tensor(1.1670), tensor(0.4264), tensor(0.8036), tensor(0.1371), tensor(0.4910), tensor(0.7096), tensor(0.4893), tensor(0.6423), tensor(0.5254), tensor(0.5462), tensor(0.1426), tensor(1.0184), tensor(0.2400), tensor(0.6576), tensor(0.7996), tensor(1.1127), tensor(1.1917), tensor(1.0329), tensor(0.4501), tensor(0.2414), tensor(0.4160), tensor(1.1485), tensor(0.4391), tensor(0.6793), tensor(0.5285), tensor(2.0681), tensor(1.4336), tensor(0.2664), tensor(1.4794), tensor(1.0813), tensor(1.0680), tensor(0.0994), tensor(1.1784), tensor(0.5999), tensor(0.6752), tensor(1.1913), tensor(0.3548), tensor(0.4093), tensor(0.8390), tensor(-0.0387), tensor(0.6551), tensor(0.5912), tensor(-1.9159), tensor(0.9960), tensor(-1.0077), tensor(0.9505), tensor(1.0042), tensor(1.1887), tensor(0.481

1it [02:14, 134.58s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 138.00 MiB. GPU 