# 基础思路（baseline）

## 步骤1：更新或安装所需环境

In [1]:
!pip install --upgrade modelscope requests urllib3 tqdm pandas mindspore mindnlp
!apt update > /dev/null; apt install aria2 git-lfs axel -y > /dev/null
# 华为云modelarts不支持apt-get

Looking in indexes: https://mirrors.cloud.aliyuncs.com/pypi/simple
Collecting modelscope
  Downloading https://mirrors.cloud.aliyuncs.com/pypi/packages/94/34/cc9faf34851d4b75fb94a9c5748ff6c67cc55c1bfb44014e638a2007dc02/modelscope-1.22.3-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting mindspore
  Using cached https://mirrors.cloud.aliyuncs.com/pypi/packages/4c/a1/2fc43ebdc8b6b3c82e0bd22a5771bd6cc595bfcb3d22831d8914334b82e3/mindspore-2.4.10-cp310-cp310-manylinux1_x86_64.whl (972.0 MB)
Installing collected packages: modelscope, mindspore
  Attempting uninstall: modelscope
    Found existing installation: modelscope 1.22.0
    Uninstalling modelscope-1.22.0:
      Successfully uninstalled modelscope-1.22.0
  Attempting uninstall: mindspore
    Found existing installation: mindspore 2.3.1
    Uninstalling mindspore-2.3.1:
      Successfully uninstalled mindspore-2.3.

## 步骤2：下载数据集

In [2]:
# Download the competition test set (test_input.csv) into the working directory
# with axel (-n 12 requests 12 parallel connections).
# NOTE(review): the output recorded for this cell shows "ERROR 403: Forbidden",
# so the file may have to be obtained another way — confirm the URL is still valid.
!axel -n 12 -a https://ai-contest-static.xfyun.cn/2024/%E5%A4%A7%E6%A8%A1%E5%9E%8B%E8%83%BD%E5%8A%9B%E8%AF%84%E6%B5%8B%EF%BC%9A%E4%B8%AD%E6%96%87%E6%88%90%E8%AF%AD%E9%87%8A%E4%B9%89%E4%B8%8E%E8%A7%A3%E6%9E%90%E6%8C%91%E6%88%98%E8%B5%9B/test_input.csv

Initializing download: https://ai-contest-static.xfyun.cn/2024/%E5%A4%A7%E6%A8%A1%E5%9E%8B%E8%83%BD%E5%8A%9B%E8%AF%84%E6%B5%8B%EF%BC%9A%E4%B8%AD%E6%96%87%E6%88%90%E8%AF%AD%E9%87%8A%E4%B9%89%E4%B8%8E%E8%A7%A3%E6%9E%90%E6%8C%91%E6%88%98%E8%B5%9B/test_input.csv
ERROR 403: Forbidden.


## 步骤3：构建模型（使用Meta-Llama-3-8B-Instruct）

In [None]:
import mindspore
from mindnlp.transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the model once; later cells reuse the notebook-level
# names `tokenizer` and `model`.
model_id = "LLM-Research/Meta-Llama-3-8B-Instruct"

# Both artifacts are fetched through the ModelScope mirror.
tokenizer = AutoTokenizer.from_pretrained(model_id, mirror='modelscope')
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    ms_dtype=mindspore.float16,
    mirror='modelscope',
    device_map="auto"
)

# Smoke-test conversation used only to check that generation works.
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# Render the conversation with the model's chat template and request
# MindSpore tensors ("ms").
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="ms"
)

# Generation stops at either the tokenizer's EOS token or the "<|eot_id|>" token.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# Deterministic beam search (do_sample=False) with 5 beams; 5 sequences are
# requested but only the first one is decoded below.
outputs = model.generate(
    input_ids,
    max_new_tokens=100,
    eos_token_id=terminators,
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences=5,
    do_sample=False,
    #length_penalty=1.0,
)
# Drop the prompt tokens so that only the newly generated part is decoded.
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))

100%|██████████| 49.8k/49.8k [00:00<00:00, 804kB/s]


## 步骤4：读取数据集

In [None]:
import pandas as pd
# Read the downloaded test set. header=None keeps the first row as data and
# gives the columns integer labels, so later cells access the sentences as test[0].
test = pd.read_csv('./test_input.csv', header=None)

In [None]:
# Show the number of rows in the dataset.
print(f"数据集的大小为: {test.shape[0]}\n前50条数据如下：\n")

# Print the first 50 sentences of the competition data (the task is to return
# 5 idioms that may match each sentence).
for test_prompt in test[0].values[:50]:
    print(test_prompt)

## 步骤5：输出成语

In [None]:
from tqdm import tqdm
import os


# Idiom used to pad a row when the model yields fewer than IDIOMS_PER_ROW idioms.
DEFAULT_IDIOM = '同舟共济'
# Every line of submit.csv must hold exactly this many idioms.
IDIOMS_PER_ROW = 5
# Row cap for this run; raise or remove it to process more of the dataset.
MAX_ROWS = 2973
# An intermediate result is printed every REPORT_EVERY rows.
REPORT_EVERY = 50

# The stop tokens do not depend on the prompt, so they are built once.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

i = 0
# The file is opened once in write mode so that re-running the cell replaces the
# previous submission instead of appending duplicate rows to it. Rows are
# flushed as they are produced, so partial results survive an interruption.
with open('submit.csv', 'w', encoding='utf-8') as up:
    # `test` is expected to be the DataFrame loaded above; column 0 holds the
    # sentences. Counting from 1 keeps `i` equal to the row number in submit.csv.
    for i, test_prompt in enumerate(
        tqdm(test[0].values, total=len(test[0].values), desc="处理进度"),
        start=1,
    ):
        # Ask the model for the five idioms that best match the sentence.
        prompt = f"列举与下面句子最符合的五个成语。只需要输出五个成语，不需要有其他的输出，写在一行中：{test_prompt}"

        # Chat-format messages fed to the model's chat template.
        messages = [
            {"role": "system", "content": "You are a helpful chinese teacher."},
            {"role": "user", "content": f"{prompt}"},
        ]
        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="ms"
        )

        # Deterministic beam search; five sequences are requested but only the
        # first one is used below.
        outputs = model.generate(
            input_ids,
            max_new_tokens=100,
            eos_token_id=terminators,
            num_beams=5,
            no_repeat_ngram_size=2,
            num_return_sequences=5,
            do_sample=False,
            #length_penalty=1.0,
        )
        # Strip the prompt tokens, then decode without special tokens.
        response = outputs[0][input_ids.shape[-1]:]
        response = tokenizer.decode(response, skip_special_tokens=True)

        # Normalise separators so that the answer can be split on whitespace.
        response = response.replace('\n', ' ').replace('、', ' ')

        # Keep only 4-character tokens, never more than IDIOMS_PER_ROW of them,
        # and pad with the default idiom when the model returned too few.
        words = [x for x in response.split() if len(x) == 4][:IDIOMS_PER_ROW]
        words += [DEFAULT_IDIOM] * (IDIOMS_PER_ROW - len(words))

        up.write(' '.join(words) + '\n')
        up.flush()

        # Show an intermediate result from time to time.
        if i % REPORT_EVERY == 0:
            tqdm.write(f"大模型第{i}次返回的结果是：\n   {response}\n")
            tqdm.write(f"submit.csv第{i}行输出结果：\n   {words}\n")

        # Stop early once the row cap is reached.
        if i == MAX_ROWS:
            break

print('submit.csv 已生成')


from tqdm import tqdm
import logging

# Configure the root logger: INFO level, "timestamp - level - message" format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def generate_idioms(prompt, tokenizer, model, device, max_attempts=5):
    """Ask the model for idioms matching ``prompt``, retrying until five are found.

    Args:
        prompt: User message sent to the model.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        model: Model providing ``generate``.
        device: Target passed to ``.to()`` on the encoded inputs.
        max_attempts: Maximum number of generation attempts.

    Returns:
        A list with the first five 4-character tokens of a response as soon as
        one response contains at least five of them; otherwise the (shorter)
        list extracted from the last attempt, or an empty list when no attempt
        was made.
    """
    # The prompt does not change between attempts, so it is rendered and
    # encoded only once.
    messages = [
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # NOTE(review): other cells of this notebook request MindSpore tensors
    # (return_tensors="ms"); confirm that "pt" and .to(device) are valid for the
    # tokenizer that is passed in.
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    # Defined up front so the fallback return is valid even if the loop body
    # never runs (max_attempts <= 0).
    words = []
    for attempts in range(1, max_attempts + 1):
        generated_ids = model.generate(
            model_inputs.input_ids,
            max_new_tokens=512
        )
        # Remove the prompt tokens from every generated sequence.
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # Normalise separators, then keep only the 4-character tokens.
        response = response.replace('\n', ' ').replace('、', ' ')
        words = [x for x in response.split() if len(x) == 4]

        if len(words) >= 5:
            return words[:5]

        logging.info(f"生成结果不足5个成语，重新生成（尝试次数：{attempts}）：{' '.join(words)}")

    # f-string so that the collected idioms are actually interpolated.
    logging.warning(f"达到最大尝试次数，返回当前结果：{' '.join(words)}")
    return words

def ensure_result_length(words, min_bytes=24, max_attempts=5):
    """Top up and trim ``words`` so that the joined row has the target length.

    The length of ``' '.join(words)`` is measured in characters: five
    4-character idioms separated by single spaces are exactly 24 characters.
    Measuring UTF-8 bytes instead would make five Chinese idioms (64 bytes)
    exceed the limit and trim the row down to a single idiom.

    Args:
        words: List of idioms; extended and trimmed in place.
        min_bytes: Target length of the space-joined row. The name is kept for
            backward compatibility; the value is compared against the number
            of characters.
        max_attempts: Maximum number of regeneration rounds, also forwarded to
            ``generate_idioms``.

    Returns:
        The same list object, no longer than ``min_bytes`` characters when
        joined. It can still be shorter if regeneration did not yield enough
        idioms.

    Note:
        Regeneration reads the notebook-level names ``prompt``, ``tokenizer``,
        ``model`` and ``device``; they must exist when a row is too short.
    """
    def joined_length():
        return len(' '.join(words))

    attempts = 0
    while joined_length() < min_bytes and attempts < max_attempts:
        logging.info(f"生成结果不足{min_bytes}字符，重新生成（尝试次数：{attempts}）：{' '.join(words)}")
        new_words = generate_idioms(prompt, tokenizer, model, device, max_attempts)
        words.extend(new_words)
        attempts += 1

    # Drop surplus idioms from the end until the row fits the target length.
    while joined_length() > min_bytes:
        words.pop()

    return words

# Row cap for this quick run; raise or remove it to process the whole dataset.
MAX_ROWS = 200
# An intermediate result is logged every REPORT_EVERY rows.
REPORT_EVERY = 50

i = 0
results = []
# `test` is expected to be the DataFrame loaded earlier; column 0 holds the
# sentences. Counting from 1 keeps `i` equal to the row number in submit.csv,
# so exactly MAX_ROWS rows are produced before the loop stops.
# NOTE(review): `device` is not assigned in any cell visible in this notebook;
# it must exist in the kernel before this cell runs.
for i, test_prompt in enumerate(
    tqdm(test[0].values, total=len(test[0].values), desc="处理进度"),
    start=1,
):
    # Ask the model for the five idioms that best match the sentence.
    # `prompt` is also read as a global by ensure_result_length when it
    # has to regenerate idioms.
    prompt = f"列举与下面句子最符合的五个成语。只需要输出五个成语，不需要有其他的输出，写在一行中：{test_prompt}"

    # Generate the idioms, then top up / trim the row to the target length.
    words = generate_idioms(prompt, tokenizer, model, device)
    words = ensure_result_length(words)

    results.append(' '.join(words))

    # Log an intermediate result from time to time.
    if i % REPORT_EVERY == 0:
        logging.info(f"大模型第{i}次返回的结果是：\n   {' '.join(words)}\n")
        logging.info(f"submit.csv第{i}行输出结果：\n   {words}\n")

    # Stop early once the row cap is reached.
    if i == MAX_ROWS:
        break

# Write all rows in one go, replacing any previous submit.csv.
with open('submit.csv', 'w', encoding='utf-8') as up:
    up.writelines(result + '\n' for result in results)

print('submit.csv 已生成')