In [None]:
import json
import random

random.seed(42)

# 处理 JSONL 文件的函数
def process_jsonl(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        data_list = []
        for line in infile:
            # 加载每一行的 JSON 数据
            data = json.loads(line.strip())
            conversations = data.get("conversations", "")
            
            # filter out multiple conversations
            if len(conversations) > 2:
                continue

            # 处理 conversations 字段
            question = conversations[0]
            answer = conversations[1] if len(conversations) > 1 else ""
            # 创建新的字典
            new_data = {
                "question": question,
                "answer": answer,
            }
            data_list.append(new_data)
        
        # 将处理后的数据写入新的 JSON 文件
        # subset = random.sample(data_list, 150)
        for item in data_list:
            outfile.write(json.dumps(item, ensure_ascii=False) + '\n')

input_file = "/data/miaopeng/workplace/LLaMA-Factory/data/lima/lima_train.jsonl"  # 输入文件路径
output_file = "/data/miaopeng/workplace/LLaMA-Factory/data/lima/lima_train_full.jsonl"  # 输出文件路径

process_jsonl(input_file, output_file)

In [None]:
import json
import os
import torch
import random
from tqdm import tqdm
from transformers import AutoModelForCausalLM
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest

random.seed(42)
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

# 模型路径和文件路径
input_file = "/data/miaopeng/workplace/LLaMA-Factory/data/lima/lima_train_full.jsonl"
out_dir = "/data/miaopeng/workplace/LLaMA-Factory/6000Q_Assignment4"

# 初始化Mistral tokenizer
tokenizer = MistralTokenizer.v1()

# 初始化Mistral模型
model = AutoModelForCausalLM.from_pretrained("/data/miaopeng/workplace/LLaMA-Factory/hugging_cache/Mistral-7B-Instruct-v0.2")
model.to("cuda")

# 输出路径
output_dir = os.path.join(out_dir, "infer_results")
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "lima_results.jsonl")

# 加载输入数据
all_datas = []
with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        all_datas.append(data)

# random select 50 samples
all_datas = random.sample(all_datas, 50)

# 推理和保存结果
for i in range(0, len(all_datas), 50):
    batch_data = all_datas[i:i + 50]

    pred_result_list = []

    for i in range(5):
        predictions = []
        
        for data in tqdm(batch_data, desc=f"Processing batch {i // 50 + 1}/{len(all_datas) // 50 + 1}"):
            # 构建Mistral prompt
            completion_request = ChatCompletionRequest(messages=[UserMessage(content=data['question'])])
            tokens = tokenizer.encode_chat_completion(completion_request).tokens

            # 使用Mistral模型生成结果
            tokens_tensor = torch.tensor([tokens]).to("cuda")
            generated_ids = model.generate(tokens_tensor, max_new_tokens=1000, do_sample=True)
            # 解码生成的结果
            result = tokenizer.decode(generated_ids[0].tolist())
            predictions.append(result)
        
        pred_result_list.append(predictions)


    # 保存结果到文件
    with open(output_file, 'a', encoding='utf-8') as f:
        for j, data in enumerate(batch_data):
            tmp_pred_list = [pred_result_list[k][j] for k in range(5)]
            # 处理生成的结果
            for m, tmp_prediction in enumerate(tmp_pred_list):
                tmp_prediction = tmp_prediction.split("[/INST]")[-1].strip()
                data[f"prediction_{m}"] = tmp_prediction
            f.write(json.dumps(data) + '\n')

print("Inference completed and results saved.")

2025-04-23 11:29:03.423011: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-23 11:29:03.630564: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745378943.709449  649194 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745378943.728855  649194 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745378943.893499  649194 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Processing batch 1/2:   0%|          | 0/50 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Processing batch 1/2:   2%|▏         | 1/50 [00:10<08:55, 10.92s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batch 1/2:   4%|▍         | 2/50 [00:20<07:52,  9.85s/it]The attention mask and the pad token id were not set. As a consequence, you

Inference completed and results saved.





In [1]:
import json
import os
import torch
import random
from tqdm import tqdm
from vllm import LLM, SamplingParams
from transformers import AutoModelForCausalLM
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest

random.seed(42)
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

# 模型路径和文件路径
mistral_models_path = "MISTRAL_MODELS_PATH"
input_file = "/data/miaopeng/workplace/LLaMA-Factory/data/lima/lima_train_full.jsonl"
out_dir = "/data/miaopeng/workplace/LLaMA-Factory/6000Q_Assignment4"

# 初始化Mistral tokenizer
tokenizer = MistralTokenizer.v1()

# 初始化Mistral模型
model_name = "/data/miaopeng/workplace/LLaMA-Factory/hugging_cache/Mistral-7B-Instruct-v0.2"
model = LLM(model=model_name, dtype="float16", tensor_parallel_size=1, trust_remote_code=True)
sampling_params = SamplingParams(temperature=0, max_tokens=1000)  # logprobs=1000

# 输出路径
output_dir = os.path.join(out_dir, "infer_results")
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "lima_results.jsonl")

# 加载输入数据
all_datas = []
with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        all_datas.append(data)

# random select 50 samples
select_datas = random.sample(all_datas, 50)
select_prompts = []
for data in select_datas:
    ques = data['question']
    completion_request = ChatCompletionRequest(messages=[UserMessage(content=data['question'])])
    select_prompts.append(completion_request)

# 推理和保存结果
for i in range(0, len(all_datas), 50):
    batch_data = all_datas[i:i + 50]
    pred_result_list = []

    for i in range(5):
        predictions = []
        # 生成结果
        result = model.generate(select_prompts, sampling_params=sampling_params)
        
        for num in range(len(result)):
            # 处理生成的结果
            tmp_prediction = output[num].outputs[0].text
            predictions.append(tmp_prediction)
        
        pred_result_list.append(predictions)


    # 保存结果到文件
    with open(output_file, 'a', encoding='utf-8') as f:
        for j, data in enumerate(batch_data):
            tmp_prediction = [x[j] for x in pred_result_list]
            # 处理生成的结果
            for i in range(5):
                # tmp_prediction[i] = tmp_prediction[i].split("[/INST]")[-1].strip()
                data[f"prediction_{i}"] = tmp_prediction[i]
            f.write(json.dumps(data) + '\n')

print("Inference completed and results saved.")

2025-04-21 10:16:41.554831: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-21 10:16:41.576450: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745201801.604816  917015 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745201801.613143  917015 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745201801.630319  917015 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

INFO 04-21 10:16:51 llm_engine.py:232] Initializing an LLM engine (v0.6.1) with config: model='/data/miaopeng/workplace/LLaMA-Factory/hugging_cache/Mistral-7B-Instruct-v0.2', speculative_config=None, tokenizer='/data/miaopeng/workplace/LLaMA-Factory/hugging_cache/Mistral-7B-Instruct-v0.2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/data/miaopeng/work

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 04-21 10:16:57 model_runner.py:1008] Loading model weights took 13.4966 GB
INFO 04-21 10:17:00 gpu_executor.py:122] # GPU blocks: 27444, # CPU blocks: 2048
INFO 04-21 10:17:02 model_runner.py:1309] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 04-21 10:17:02 model_runner.py:1313] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 04-21 10:17:30 model_runner.py:1428] Graph capturing finished in 28 secs.


ERROR:tornado.general:SEND Error: Host unreachable


AssertionError: Expected code to be unreachable, but got: ChatCompletionRequest(temperature=0.7, top_p=1.0, max_tokens=None, random_seed=None, model=None, mes...