In [21]:
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModel
from fastapi.middleware.cors import CORSMiddleware
import uvicorn, json, datetime
import torch
from peft import LoraConfig, get_peft_model, TaskType
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer

In [22]:
import json

In [23]:
# Device selection: CUDA_DEVICE is "<DEVICE>:<DEVICE_ID>" when an id is set,
# otherwise just DEVICE.
DEVICE = "cuda"
DEVICE_ID = "0"
if DEVICE_ID:
    CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}"
else:
    CUDA_DEVICE = DEVICE

# Web origins. NOTE(review): presumably meant for CORSMiddleware (imported
# above); nothing in the visible cells reads this list — confirm before removing.
origins = [
    # local development
    "http://localhost.tiangolo.com",
    "https://localhost.tiangolo.com",
    "http://localhost",
    "http://localhost:8080",
    "http://localhost:5500",
    # deployed hosts
    "http://120.55.72.74",
    "http://www.aivirtuallover.com",
    "https://www.aivirtuallover.com",
    "http://aivirtuallover.com",
    "https://aivirtuallover.com",
]

In [24]:
def torch_gc():
    """Free cached CUDA memory on CUDA_DEVICE; does nothing when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return

    with torch.cuda.device(CUDA_DEVICE):
        # Return cached allocator blocks, then collect CUDA IPC memory.
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()




def load_lora_config(model):
    """Wrap ``model`` with LoRA adapters and return the resulting PEFT model.

    The adapter is configured for causal-LM training (``inference_mode=False``)
    and targets the modules named ``query_key_value``.
    """
    lora_settings = dict(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["query_key_value"],
    )
    return get_peft_model(model, LoraConfig(**lora_settings))

# Template for the question part of a training/inference prompt.
PROMPT_PATTERN = "问：{}"
# Separator placed between the question and the answer.
SEP_PATTERN = "\n答： "


def create_prompt(question):
    """Return ``(prompt_text, separator)`` for a question.

    ``prompt_text`` is PROMPT_PATTERN filled with ``question``; the separator
    is always SEP_PATTERN.
    """
    prompt_text = PROMPT_PATTERN.format(question)
    return prompt_text, SEP_PATTERN


def create_prompt_ids(tokenizer, question, max_src_length):
    """Tokenize a question and append the encoded answer separator.

    Args:
        tokenizer: object exposing ``encode(text, ...)`` that returns a list of ids.
        question: raw question text.
        max_src_length: length budget shared by the question and the separator
            (special tokens excluded, see below).

    Returns:
        The question ids (no special tokens, truncated) followed by the
        separator ids (encoded with special tokens).
    """
    prompt_text, sep_text = create_prompt(question)

    # The separator is encoded first because its length sets the prompt budget.
    sep_ids = tokenizer.encode(sep_text, add_special_tokens=True)

    # Assumes the tokenizer adds exactly 2 special tokens to the separator
    # encoding, which are not counted against the budget — TODO confirm.
    special_tokens_num = 2
    prompt_budget = max_src_length - (len(sep_ids) - special_tokens_num)

    prompt_ids = tokenizer.encode(
        prompt_text,
        max_length=prompt_budget,
        truncation=True,
        add_special_tokens=False,
    )
    return prompt_ids + sep_ids


def create_inputs_and_labels(tokenizer, question, answer, device):
    """Build the ``(input_ids, labels)`` tensor pair for one question/answer example.

    The input is ``prompt ids + answer ids + eop``. The labels have the same
    length, with every prompt position replaced by -100 (the default
    ``ignore_index`` of torch's cross-entropy loss; presumably the model skips
    those positions when computing the loss).

    Reads the module-level ``max_src_length``, ``max_dst_length`` and ``eop``.
    Both tensors are 1-D ``torch.long`` tensors created on ``device``.
    """
    prompt_ids = create_prompt_ids(tokenizer, question, max_src_length)
    answer_ids = tokenizer.encode(
        answer,
        max_length=max_dst_length,
        truncation=True,
        add_special_tokens=False,
    )

    # Answer tokens plus the end-of-piece marker are what the model must produce.
    target_ids = answer_ids + [eop]
    input_seq = prompt_ids + target_ids
    label_seq = [-100] * len(prompt_ids) + target_ids

    input_tensor = torch.tensor(input_seq, dtype=torch.long, device=device)
    label_tensor = torch.tensor(label_seq, dtype=torch.long, device=device)
    return input_tensor, label_tensor

def get_attention_mask(tokenizer, input_ids, device):
    """Build a boolean mask of shape ``(1, seq_len, seq_len)`` for one sequence.

    Starting from a lower-triangular matrix of ones, every column before the
    first ``bos`` token (the context) is set to 1 for all rows. The returned
    tensor is True exactly where that matrix is 0 — presumably the positions
    the model must not attend to.

    ``tokenizer`` is accepted for signature symmetry but is not used.
    Reads the module-level ``bos`` id; raises ValueError if it is absent.
    """
    tokens = input_ids.tolist()
    context_len = tokens.index(bos)
    total = len(tokens)

    visible = torch.tril(torch.ones((total, total), device=device))
    visible[..., :context_len] = 1

    # Add a leading axis and invert: True where the visibility matrix is 0.
    return (visible.unsqueeze(0) < 0.5).bool()


def get_position_ids(tokenizer, input_ids, device, position_encoding_2d=True):
    """Compute position ids for one sequence.

    Args:
        tokenizer: unused; kept so the signature matches the other helpers.
        input_ids: 1-D tensor of token ids containing ``bos`` and either
            ``mask`` or ``gmask`` (module-level ids).
        device: device on which the result is created.
        position_encoding_2d: when True, return a ``(2, seq_len)`` tensor of
            stacked position ids and block position ids; otherwise return the
            ``(seq_len,)`` position ids only.

    Returns:
        A ``torch.long`` tensor of position ids.

    Raises:
        ValueError: if ``bos`` or the selected mask token is not in the sequence.
    """
    seq = input_ids.tolist()
    context_len = seq.index(bos)
    seq_len = len(seq)

    has_mask = mask in seq
    mask_token = mask if has_mask else gmask
    # A real boolean: the previous code stored the gMASK token id here, which
    # would have been treated as False had that id been 0.
    use_gmask = not has_mask

    # Always looked up (even when unused) so a sequence without any mask token
    # fails loudly here.
    mask_position = seq.index(mask_token)

    # Position ids are shared by both encodings: 0..seq_len-1, with every
    # position after the context pinned to the mask position unless gMASK is used.
    position_ids = torch.arange(seq_len, dtype=torch.long, device=device)
    if not use_gmask:
        position_ids[context_len:] = mask_position

    if position_encoding_2d:
        # Block positions: 0 over the context, then 1, 2, ... over the rest.
        block_position_ids = torch.cat((
            torch.zeros(context_len, dtype=torch.long, device=device),
            torch.arange(seq_len - context_len, dtype=torch.long, device=device) + 1
        ))
        position_ids = torch.stack((position_ids, block_position_ids), dim=0)

    return position_ids

class QADataset(Dataset):
    """Map-style dataset that turns question/answer records into model inputs.

    Each element of ``data`` is expanded as keyword arguments into
    ``create_inputs_and_labels`` (so it must be a mapping carrying that
    function's ``question`` and ``answer`` arguments).
    """

    def __init__(self, data, tokenizer) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return input ids, labels, attention mask and position ids for one record.

        Tensors are created on the module-level ``device``.
        """
        record = self.data[index]
        input_ids, labels = create_inputs_and_labels(
            self.tokenizer,
            device=device,
            **record
        )

        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": get_attention_mask(self.tokenizer, input_ids, device),
            "position_ids": get_position_ids(self.tokenizer, input_ids, device),
        }

def collate_fn(batch):
    """Stack the per-example tensors of a batch along a new leading dimension.

    Every example must provide ``input_ids``, ``attention_mask``, ``labels``
    and ``position_ids`` tensors; ``torch.stack`` requires matching shapes
    across the batch (no padding is applied here).
    """
    fields = ('input_ids', 'attention_mask', 'labels', 'position_ids')
    return {
        name: torch.stack([example[name] for example in batch])
        for name in fields
    }

class ModifiedTrainer(Trainer):
    """Trainer that passes the custom attention mask and position ids to the model."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the model's loss.

        Args:
            model: the model being trained.
            inputs: batch dict with ``input_ids``, ``attention_mask``,
                ``position_ids`` and ``labels``.
            return_outputs: when True, return ``(loss, outputs)`` as the base
                ``Trainer`` expects (e.g. during evaluation) instead of the
                loss alone.
            num_items_in_batch: accepted for compatibility with newer
                ``Trainer`` versions that pass it; not used.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            position_ids=inputs["position_ids"],
            labels=inputs["labels"],
        )
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss


In [26]:
# Pin the checkpoint (and the remote code loaded with it) to one revision.
revision = "096f3de6b4959ce38bef7bb05f3129c931a3084e"
tokenizer = AutoTokenizer.from_pretrained(
    "THUDM/chatglm-6b",
    revision=revision,
    trust_remote_code=True,
)
model = AutoModel.from_pretrained(
    "THUDM/chatglm-6b",
    revision=revision,
    trust_remote_code=True,
)
# Cast to fp16 and move to the GPU before attaching the LoRA adapters.
model = load_lora_config(model.half().cuda())

# Special-token ids; the preprocessing helpers read these as module globals.
bos = tokenizer.bos_token_id
eop = tokenizer.eop_token_id
pad = tokenizer.pad_token_id
mask = tokenizer.mask_token_id
gmask = tokenizer.sp_tokenizer[tokenizer.gMASK_token]

# Device and truncation limits, also read as globals by the helpers.
device = "cuda"
max_src_length = 200
max_dst_length = 500

training_args = TrainingArguments(
    "output",
    # optimisation
    learning_rate=1e-4,
    max_steps=1000,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    fp16=True,
    # checkpointing / logging
    save_steps=500,
    save_total_limit=3,
    logging_steps=50,
    # reproducibility
    seed=0,
    data_seed=0,
    # data loading: keep every column because the collator needs the custom
    # attention_mask / position_ids entries
    remove_unused_columns=False,
    group_by_length=False,
    dataloader_pin_memory=False,
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [27]:
# Load the training examples. The file is JSON; the encoding is given
# explicitly so non-ASCII text does not depend on the platform's locale
# default. The `with` block closes the file, so no explicit close() is needed.
with open('test.txt', 'r', encoding='utf-8') as f:
    json_data = f.read()

# Each record is expanded as keyword arguments (question/answer) by QADataset.
train_data = json.loads(json_data)
train_dataset = QADataset(train_data, tokenizer=tokenizer)
trainer = ModifiedTrainer(
    model=model,
    train_dataset=train_dataset,
    args=training_args,
    data_collator=collate_fn,
    tokenizer=tokenizer
)

In [29]:
!nvidia-smi

Sat Aug 12 16:04:14 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           Off | 00000000:00:07.0 Off |                    0 |
| N/A   36C    P0              54W / 300W |  12254MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [31]:
# The model was already moved to the GPU with .cuda() when it was loaded, so
# this appears to be a no-op: the nvidia-smi snapshots taken before and after
# both report 12254MiB. The trailing ';' suppresses the module repr.
model.to(device);

In [32]:
!nvidia-smi

Sat Aug 12 16:04:28 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           Off | 00000000:00:07.0 Off |                    0 |
| N/A   36C    P0              54W / 300W |  12254MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [33]:
# Run LoRA fine-tuning; training_args caps it at max_steps=1000 and logs the
# training loss every 50 steps.
trainer.train()



Step,Training Loss
50,2.5071
100,2.25
150,1.7279
200,2.0135
250,1.8158
300,1.7702
350,1.7247
400,1.649
450,1.6707
500,1.559


TrainOutput(global_step=1000, training_loss=1.5718250617980958, metrics={'train_runtime': 147.4049, 'train_samples_per_second': 6.784, 'train_steps_per_second': 6.784, 'total_flos': 2413781812838400.0, 'train_loss': 1.5718250617980958, 'epoch': 2.12})

In [None]:
微调占用显存：22592MiB

In [34]:
# Cast the fine-tuned model to fp16 on the GPU, then switch it to eval mode
# before the chat cells below. model.eval() returns the module, so the cell
# displays its repr.
model = model.half().cuda()
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): ChatGLMForConditionalGeneration(
      (transformer): ChatGLMModel(
        (word_embeddings): Embedding(150528, 4096)
        (layers): ModuleList(
          (0-27): 28 x GLMBlock(
            (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
            (attention): SelfAttention(
              (rotary_emb): RotaryEmbedding()
              (query_key_value): Linear(
                in_features=4096, out_features=12288, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=12288, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embeddi

In [None]:
eval后占用显存：22592MiB

In [39]:
async def create_item(request: Request):
    """Answer one chat request with the module-level ``model`` and ``tokenizer``.

    Args:
        request: JSON-serializable mapping with a required ``prompt`` and
            optional ``history``, ``max_length``, ``top_p`` and ``temperature``.
            NOTE(review): the annotation says ``Request`` but the cells in
            this notebook pass a plain dict, and the ``json.dumps`` call below
            requires a JSON-serializable object — confirm the intended type.

    Returns:
        dict with ``response``, the updated ``history``, ``status`` (200) and
        ``time`` (``%Y-%m-%d %H:%M:%S``).

    Raises:
        ValueError: if the payload has no ``prompt``.
    """
    # Round-trip through JSON: yields an independent copy of the payload and
    # rejects input that is not JSON-serializable.
    json_post = json.dumps(request)
    json_post_list = json.loads(json_post)

    prompt = json_post_list.get('prompt')
    if prompt is None:
        # Fail before running generation rather than while formatting the log.
        raise ValueError("request payload must contain a 'prompt' field")
    history = json_post_list.get('history')
    max_length = json_post_list.get('max_length')
    top_p = json_post_list.get('top_p')
    temperature = json_post_list.get('temperature')

    try:
        # Missing or falsy sampling settings fall back to the defaults below.
        response, history = model.chat(tokenizer,
                                       prompt,
                                       history=history,
                                       max_length=max_length if max_length else 2048,
                                       top_p=top_p if top_p else 0.7,
                                       temperature=temperature if temperature else 0.95)
    finally:
        # Release cached GPU memory even when generation fails.
        torch_gc()

    # Named `timestamp` so it does not shadow the `time` module name.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    answer = {
        "response": response,
        "history": history,
        "status": 200,
        "time": timestamp
    }
    # repr() keeps multi-line prompts/responses on a single log line.
    print(f"[{timestamp}] prompt:{prompt!r}, response:{response!r}")
    return answer

In [40]:
# 调用函数示例
request_data = {
    "prompt": "你好",
    "history": [],
    "max_length": 1024,
    "top_p": 0.7,
    "temperature": 0.95
}

response = await create_item(request_data)
print(response)

[2023-08-12 16:13:07] ", prompt:"你好", response:"'你好！请问有什么需要帮助的吗？'"
{'response': '你好！请问有什么需要帮助的吗？', 'history': [('你好', '你好！请问有什么需要帮助的吗？')], 'status': 200, 'time': '2023-08-12 16:13:07'}


释放显存后：12416MiB

In [42]:
# 调用函数示例
request_data = {
    "prompt": "中国古代文化中的民间故事有哪些经典作品?",
    "history": [],
    "max_length": 1024,
    "top_p": 0.7,
    "temperature": 0.95
}

response = await create_item(request_data)
print(response)

[2023-08-12 16:14:34] ", prompt:"中国古代文化中的民间故事有哪些经典作品?", response:"'中国古代文化中有很多经典的民间故事，比如《白蛇传》、《西游记》、《水浒传》、《红楼梦》等。这些故事流传广泛，被广泛传颂和改编，成为中国文化的重要组成部分。这些故事讲述了许多英雄人物的传奇经历，体现了中国人的智慧和精神追求。你最喜欢的中国古代民间故事是什么？'"
{'response': '中国古代文化中有很多经典的民间故事，比如《白蛇传》、《西游记》、《水浒传》、《红楼梦》等。这些故事流传广泛，被广泛传颂和改编，成为中国文化的重要组成部分。这些故事讲述了许多英雄人物的传奇经历，体现了中国人的智慧和精神追求。你最喜欢的中国古代民间故事是什么？', 'history': [('中国古代文化中的民间故事有哪些经典作品?', '中国古代文化中有很多经典的民间故事，比如《白蛇传》、《西游记》、《水浒传》、《红楼梦》等。这些故事流传广泛，被广泛传颂和改编，成为中国文化的重要组成部分。这些故事讲述了许多英雄人物的传奇经历，体现了中国人的智慧和精神追求。你最喜欢的中国古代民间故事是什么？')], 'status': 200, 'time': '2023-08-12 16:14:34'}
