In [4]:
import re

def split_media_tags(content):
    """Split ``content`` around ``<mediaN>`` tags, keeping the tag numbers.

    Args:
        content: Text that may contain tags of the form ``<media`` + digits + ``>``.

    Returns:
        A list that alternates text and tag numbers: even positions hold the
        text pieces (``str``, possibly empty when a tag sits at the very start
        or end, or when two tags are adjacent) and odd positions hold the tag
        numbers converted to ``int``.
    """
    tag_regex = r'<media(\d+)>'

    # With a single capture group, re.split interleaves the captured digits
    # with the surrounding text: [text, digits, text, digits, ..., text].
    pieces = re.split(tag_regex, content)

    # Odd positions are the captured digit strings; everything else is text.
    return [
        int(piece) if position % 2 else piece
        for position, piece in enumerate(pieces)
    ]

In [6]:
split_media_tags('<media1>world<media2>!')  # -> ['', 1, 'world', 2, '!'] (leading '' because the input starts with a tag)

['', 1, 'world', 2, '!']

In [7]:
import re

def split_media_tags(content):
    """Break ``content`` into text pieces and bracketed media tags.

    Recognised tags look like ``[audioN]``, ``[videoN]`` or ``[imgN]`` where
    ``N`` is one or more digits.

    Args:
        content: Text that may contain bracketed media tags.

    Returns:
        A list, in order of appearance, of non-empty text pieces (``str``)
        and ``(media_type, number)`` tuples with ``number`` as ``int``.
        When no tag is present the list is ``[content]`` (so an empty input
        yields ``['']``).
    """
    tag_regex = re.compile(r'\[(audio|video|img)(\d+)\]')

    segments = []
    cursor = 0  # index just past the previously consumed tag

    for tag in tag_regex.finditer(content):
        tag_start, tag_end = tag.span()

        # Text between the previous tag and this one; skipped when empty.
        leading_text = content[cursor:tag_start]
        if leading_text:
            segments.append(leading_text)

        kind, digits = tag.groups()
        segments.append((kind, int(digits)))
        cursor = tag_end

    # Whatever follows the last tag (or the whole string if there was none).
    trailing_text = content[cursor:]
    if trailing_text:
        segments.append(trailing_text)

    # Only an empty, tag-free input leaves `segments` empty; fall back to
    # returning the input itself as the single element.
    return segments if segments else [content]

# Demo: run split_media_tags over sample prompts covering tags in the middle,
# at the start, at the end, and mixed media types, printing input and result.
test_cases = [
    "Select the instrument represented in images that corresponds to the audio [audio1] from [img1] [img2] [img3] [img4].",
    "[video1] is a great video",
    "Compare [img1] and [img2]",
    "Listen to [audio1] and watch [video1]",
    "[img1] at the beginning and [img2] at the end",
]

for test in test_cases:
    print(f"Input: {test}")
    print(f"Output: {split_media_tags(test)}\n")

Input: Select the instrument represented in images that corresponds to the audio [audio1] from [img1] [img2] [img3] [img4].
Output: ['Select the instrument represented in images that corresponds to the audio ', ('audio', 1), ' from ', ('img', 1), ' ', ('img', 2), ' ', ('img', 3), ' ', ('img', 4), '.']

Input: [video1] is a great video
Output: [('video', 1), ' is a great video']

Input: Compare [img1] and [img2]
Output: ['Compare ', ('img', 1), ' and ', ('img', 2)]

Input: Listen to [audio1] and watch [video1]
Output: ['Listen to ', ('audio', 1), ' and watch ', ('video', 1)]

Input: [img1] at the beginning and [img2] at the end
Output: [('img', 1), ' at the beginning and ', ('img', 2), ' at the end']

