In [78]:
import re
import pandas as pd
from docx import Document
import json

def read_docx(filename):
    """Return the text of a .docx file as one newline-joined string.

    The file is opened with ``Document`` and the ``text`` of each entry in
    its ``paragraphs`` collection is joined with ``'\\n'``.
    """
    paragraphs = Document(filename).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

# Read the four source documents (fill-in-the-blank, multiple-choice,
# true/false and programming questions) into plain-text strings.
fill_text = read_docx('1_tiankong.docx')
choice_text = read_docx('2_xuanze.docx')
judge_text = read_docx('3_panduan.docx')
program_text = read_docx('4_biancheng.docx')

def parse_fill_questions(text):
    """Parse fill-in-the-blank questions whose answers are marked ``**##answer##**``.

    A question starts on its own line (optionally indented) with a number
    followed by a period or a backslash-escaped period, and runs until the
    next such line or the end of the text.

    Args:
        text: Full document text with paragraphs separated by newlines.

    Returns:
        pandas.DataFrame with the columns ``id`` (int), ``question`` (the
        text with every answer marker replaced by ``______``), ``answer``
        (the marked answers joined with ``,``; empty string if there are
        none) and ``type`` (always ``'fill'``). The columns are present even
        when no question is found.
    """
    columns = ['id', 'question', 'answer', 'type']

    # The number must sit at the start of a line: without the anchor any
    # "digit + period" in preamble text (e.g. "2.5") is taken for a question.
    pattern = r'^[ \t]*(\d+)(?:\\\.|\.)\s*(.*?)(?=\n[ \t]*\d+(?:\\\.|\.)|\Z)'
    matches = re.findall(pattern, text, re.DOTALL | re.MULTILINE)

    questions = []
    for idx, content in matches:
        # Collapse runs of whitespace (including newlines) into single spaces.
        content = re.sub(r'\s+', ' ', content).strip()

        # Collect every marked answer, then blank the markers out.
        answer_matches = re.findall(r'\*\*##(.*?)##\*\*', content)
        question_text = re.sub(r'\*\*##.*?##\*\*', '______', content)

        questions.append({
            'id': int(idx),
            'question': question_text.strip(),
            # Joining an empty list already yields '' for questions without blanks.
            'answer': ','.join(answer_matches),
            'type': 'fill'
        })

    return pd.DataFrame(questions, columns=columns)

def parse_choice_questions(text):
    """Parse four-option multiple-choice questions into a DataFrame.

    Each question is matched as a number, the stem, options ``A.`` through
    ``D.`` and a trailing ``答案：`` followed by the answer letter. Runs of
    whitespace inside the stem and options are collapsed to single spaces.

    Returns a pandas.DataFrame with one row per question and the fields
    ``id``, ``question``, ``options`` (dict keyed by ``'A'``..``'D'``),
    ``answer`` and ``type`` (always ``'choice'``).
    """
    def _squash(fragment):
        # Normalise internal whitespace and trim both ends.
        return re.sub(r'\s+', ' ', fragment).strip()

    pattern = r'(\d+)\.\s+(.*?)\s*A\.(.*?)\s*B\.(.*?)\s*C\.(.*?)\s*D\.(.*?)\s*答案：\s*([A-D])'

    records = [
        {
            'id': int(number),
            'question': _squash(stem),
            'options': dict(zip('ABCD', map(_squash, choices))),
            'answer': key.strip(),
            'type': 'choice',
        }
        for number, stem, *choices, key in re.findall(pattern, text, re.DOTALL)
    ]

    return pd.DataFrame(records)

def parse_judge_questions(text):
    """Parse true/false questions, one per line, into a DataFrame.

    A line is recognised when it has the form ``<number>. <statement> ( √ )``
    or ``<number>. <statement> ( × )``. Both ASCII ``()`` and full-width
    ``（）`` parentheses are accepted, and leading/trailing whitespace on the
    line is ignored. Lines that do not match are skipped.

    Args:
        text: Full document text with one question per line.

    Returns:
        pandas.DataFrame with the columns ``id`` (int), ``question``,
        ``answer`` (``'√'`` or ``'×'``) and ``type`` (always ``'judge'``).
        The columns are present even when no question is found.
    """
    columns = ['id', 'question', 'answer', 'type']
    pattern = re.compile(r'(\d+)\.\s+(.*?)\s*[(（]\s*([√×])\s*[)）]')

    questions = []
    for line in text.split('\n'):
        # Match against the stripped line so indented questions are not lost.
        line = line.strip()
        if not line:
            continue

        match = pattern.match(line)
        if match is None:
            continue

        idx, question, answer = match.groups()
        questions.append({
            'id': int(idx),
            'question': question.strip(),
            # The character class already restricts this to '√' or '×'.
            'answer': answer,
            'type': 'judge'
        })

    return pd.DataFrame(questions, columns=columns)

def parse_program_questions(text):
    """Parse programming questions delimited by ``####<number>.`` headers.

    For every section the function extracts:

    * ``title`` - the first line after the header;
    * ``question`` - the text after ``题目：`` up to ``评分标准``, ``参考答案``
      or the end of the section; when there is no ``题目：`` marker, the first
      block of the section up to the first blank line is used instead;
    * ``answer`` - the text after ``参考答案：`` up to ``评分标准`` or the end
      of the section (empty string when the marker is missing).

    Both the full-width ``：`` and the ASCII ``:`` colon are accepted.

    Args:
        text: Full document text.

    Returns:
        pandas.DataFrame with the columns ``id`` (int), ``title``,
        ``question``, ``answer`` and ``type`` (always ``'program'``). The
        columns are present even when no section is found.
    """
    columns = ['id', 'title', 'question', 'answer', 'type']
    pattern = r'####\s*(\d+)\.\s*(.*?)(?=####\s*\d+\.|\Z)'

    questions = []
    # No blanket try/except here: the steps below do not raise for input the
    # regex has already validated, and silently skipping would hide real bugs.
    for idx, content in re.findall(pattern, text, re.DOTALL):
        # Title is the first line of the section.
        title = content.split('\n')[0].strip()

        question_match = re.search(
            r'题目[：:](.*?)(?=评分标准|参考答案|$)', content, re.DOTALL
        )
        if question_match:
            question_desc = question_match.group(1).strip()
        else:
            # Fall back to the first block before a blank line.
            question_desc = re.split(r'\n\s*\n', content)[0].strip()

        answer_match = re.search(
            r'参考答案[：:](.*?)(?=评分标准|$)', content, re.DOTALL
        )
        answer_code = answer_match.group(1).strip() if answer_match else ''

        questions.append({
            'id': int(idx),
            'title': title,
            'question': question_desc,
            'answer': answer_code,
            'type': 'program'
        })

    return pd.DataFrame(questions, columns=columns)

# Parse every question type and report how many questions were found.
print("正在解析填空题...")
fill_df = parse_fill_questions(fill_text)
print(f"找到{len(fill_df)}道填空题")

print("正在解析选择题...")
choice_df = parse_choice_questions(choice_text)
print(f"找到{len(choice_df)}道选择题")

print("正在解析判断题...")
judge_df = parse_judge_questions(judge_text)
print(f"找到{len(judge_df)}道判断题")

print("正在解析编程题...")
program_df = parse_program_questions(program_text)
print(f"找到{len(program_df)}道编程题")

# Save each question type to its own CSV file (utf-8-sig encoding, no index column).
fill_df.to_csv('fill_questions.csv', index=False, encoding='utf-8-sig')
choice_df.to_csv('choice_questions.csv', index=False, encoding='utf-8-sig')
judge_df.to_csv('judge_questions.csv', index=False, encoding='utf-8-sig')
program_df.to_csv('program_questions.csv', index=False, encoding='utf-8-sig')

print("数据处理完成！CSV文件已保存。")

# Additionally bundle all questions into a single JSON file, keyed by question type.
all_questions = {
    'fill': fill_df.to_dict('records'),
    'choice': choice_df.to_dict('records'),
    'judge': judge_df.to_dict('records'),
    'program': program_df.to_dict('records')
}

# ensure_ascii=False keeps the Chinese text readable instead of \u escapes.
with open('all_questions.json', 'w', encoding='utf-8') as f:
    json.dump(all_questions, f, ensure_ascii=False, indent=2)

print("所有题目已保存为JSON文件：all_questions.json")

正在解析填空题...
找到40道填空题
正在解析选择题...
找到90道选择题
正在解析判断题...
找到40道判断题
正在解析编程题...
找到15道编程题
数据处理完成！CSV文件已保存。
所有题目已保存为JSON文件：all_questions.json
