In [1]:
# !pip install python-docx --user

In [2]:
import os
import re
import itertools
import operator
from functools import reduce
import pandas as pd
import docx
from docx import Document
from docx.enum.text import WD_COLOR_INDEX, WD_ALIGN_PARAGRAPH, WD_LINE_SPACING
from docx.oxml.ns import qn
from docx.shared import Cm, Pt  #加入可調整的 word 單位
from docx.shared import RGBColor
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.table import WD_ALIGN_VERTICAL

# Bio轉換Word函數

In [3]:
# Python內建filter()函式 - 過濾list
# filter()把傳入的函式依次作用於每個元素，然後根據返回值是True還是False決定保留還是丟棄該元素
def not_empty(s):
    """Predicate for ``filter()`` that keeps strings with visible content.

    Falsy inputs (``None``, ``''``) are handed back unchanged. Any other
    string is returned stripped, so whitespace-only text yields ``''`` and
    is therefore dropped by ``filter()``.
    """
    if not s:
        return s
    return s.strip()

In [4]:
# 條款順序建置
def clause_index():
    """Build lookup tables from Chinese clause/schedule labels to their ordinal.

    Returns:
        tuple(dict, dict): ``(clause_dict, schedule_dict)`` where
        ``clause_dict`` maps "第一條" .. "第九十九條" to 1..99 and
        ``schedule_dict`` maps "附表一" .. "附表九十九" to 1..99.
        Both dicts are filled in ascending numeric order.
    """
    # Index 0 is the empty string so round tens (10, 20, ...) get no unit digit.
    digits = ['', '一', '二', '三', '四', '五', '六', '七', '八', '九']

    clause_dict = {}
    schedule_dict = {}

    for number in range(1, 100):
        tens, units = divmod(number, 10)
        if tens == 0:
            tens_part = ''
        elif tens == 1:
            # 10-19 are written "十X", without a leading "一".
            tens_part = '十'
        else:
            tens_part = digits[tens] + '十'

        numeral = tens_part + digits[units]
        clause_dict["第" + numeral + "條"] = number
        schedule_dict["附表" + numeral] = number

    return clause_dict, schedule_dict

### highlight格式
* AUTO = 'default'
* BLACK = 'black'
* BLUE = 'blue'
* BRIGHT_GREEN = 'green'
* DARK_BLUE = 'darkBlue'
* DARK_RED = 'darkRed'
* DARK_YELLOW = 'darkYellow'
* GRAY_25 = 'lightGray'
* GRAY_50 = 'darkGray'
* GREEN = 'darkGreen'
* PINK = 'magenta'
* RED = 'red'
* TEAL = 'darkCyan'
* TURQUOISE = 'cyan'
* VIOLET = 'darkMagenta'
* WHITE = 'white'
* YELLOW = 'yellow'

In [5]:
# 建立超連結
def add_hyperlink(paragraph, url, text, color, underline, fill_color, font_style, chinese_font_style, font_size, bookmarks):
    """
    Append a styled hyperlink run to the end of a paragraph.

    :param paragraph: The paragraph we are adding the hyperlink to.
    :param url: A string containing the link target. It is registered as an
        external hyperlink relationship on the paragraph's part (callers in
        this file pass '#' + bookmark name).
    :param text: The text displayed for the link.
    :param color: Font colour as a hex string, or None to leave it unset.
        A leading '#' is tolerated and stripped.
    :param underline: When falsy, an explicit "no underline" is written.
    :param fill_color: w:highlight value (e.g. 'yellow', 'lightGray'), or None.
    :param font_style: Font name for ASCII text, or None.
    :param chinese_font_style: Font name for East Asian text, or None.
    :param font_size: Value written to w:sz, or None. Callers in this file
        pass twice the point size as a string.
    :param bookmarks: Unused; kept so existing callers keep working.
    :return: The same paragraph object, with the hyperlink appended.
    """
    make_element = docx.oxml.shared.OxmlElement
    qname = docx.oxml.shared.qn

    # This gets access to the document.xml.rels file and gets a new relation id value
    part = paragraph.part
    r_id = part.relate_to(url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)

    # Create the w:hyperlink tag and add needed values
    hyperlink = make_element('w:hyperlink')
    hyperlink.set(qname('r:id'), r_id)

    # Create a w:r element and its w:rPr (run properties) child
    new_run = make_element('w:r')
    rPr = make_element('w:rPr')

    # Children are appended in the order the WordprocessingML schema lists
    # them for run properties: rFonts, color, sz, highlight, u.

    # Fonts: a single w:rFonts element carries both the ASCII and the East
    # Asian font. Each attribute is guarded by its own argument (previously
    # the ASCII font was wrongly guarded by chinese_font_style, and two
    # separate w:rFonts elements were emitted).
    if font_style is not None or chinese_font_style is not None:
        fonts = make_element('w:rFonts')
        if font_style is not None:
            fonts.set(qname('w:ascii'), font_style)
        if chinese_font_style is not None:
            fonts.set(qname('w:eastAsia'), chinese_font_style)
        rPr.append(fonts)

    # Font colour: w:color expects bare hex digits, so drop a CSS-style '#'.
    if color is not None:
        c = make_element('w:color')
        c.set(qname('w:val'), color.lstrip('#'))
        rPr.append(c)

    # Font size
    if font_size is not None:
        fsi = make_element('w:sz')
        fsi.set(qname('w:val'), str(font_size))
        rPr.append(fsi)

    # Highlight colour
    if fill_color is not None:
        b = make_element('w:highlight')
        b.set(qname('w:val'), fill_color)
        rPr.append(b)

    # Underline: only written when the caller asks for no underline
    if not underline:
        u = make_element('w:u')
        u.set(qname('w:val'), 'none')
        rPr.append(u)

    # Join all the xml elements together and add the required text to the w:r element
    new_run.append(rPr)
    new_run.text = text
    hyperlink.append(new_run)

    paragraph._p.append(hyperlink)

    return paragraph

In [6]:
# 建立書籤
def add_bookmark(paragraph, bookmark_text, bookmark_name):
    """
    Insert an empty bookmark (start/end pair) at the end of a paragraph.

    :param paragraph: Paragraph that receives the bookmark; a new empty run
        is added to it and the bookmark markers are placed inside that run.
    :param bookmark_text: Unused; the bookmark wraps no text. Kept so
        existing callers keep working.
    :param bookmark_name: Name of the bookmark (the hyperlink target).
    :return: The same paragraph object.
    """
    qname = docx.oxml.ns.qn

    run = paragraph.add_run()
    tag = run._r  # for reference the following also works: tag =  document.element.xpath('//w:r')[-1]

    # Bookmark ids must differ within a document. Previously every bookmark
    # was written with id '0'; now the next free id is taken by scanning the
    # bookmarks already present in the tree this paragraph belongs to.
    used_ids = []
    for existing in paragraph._p.xpath('//w:bookmarkStart'):
        raw_id = existing.get(qname('w:id'))
        if raw_id is not None and raw_id.isdigit():
            used_ids.append(int(raw_id))
    bookmark_id = str(max(used_ids) + 1) if used_ids else '0'

    start = docx.oxml.shared.OxmlElement('w:bookmarkStart')
    start.set(qname('w:id'), bookmark_id)
    start.set(qname('w:name'), bookmark_name)
    tag.append(start)

    end = docx.oxml.shared.OxmlElement('w:bookmarkEnd')
    end.set(qname('w:id'), bookmark_id)
    end.set(qname('w:name'), bookmark_name)
    tag.append(end)

    return paragraph

In [7]:
# 資料欄位統計
def data_statistics(raw_data, label_list):
    """Count labelled tokens per label kind.

    Args:
        raw_data: Rows of ``[label, text]``; a sentence break is the row ``['\\n']``.
        label_list: Every label that occurs in ``raw_data``; a label missing
            from it raises ``ValueError``.

    Returns:
        dict: Counts keyed by 'Heading', 'item', 'currency', 'benefit',
        'parameter', 'rate' (tokens whose label starts with ``B`` or ``I``)
        and 'other' (tokens labelled ``'o'``). Other label kinds are ignored.
    """
    tracked = ('Heading', 'item', 'currency', 'benefit', 'parameter', 'rate')
    statistics = dict.fromkeys(tracked, 0)
    statistics['other'] = 0

    for row in raw_data:
        tag = row[0]
        if tag == '\n':
            continue
        if tag == 'o':
            statistics['other'] += 1
            continue

        # Resolve through label_list, as the row's label is expected to be registered there.
        resolved = label_list[label_list.index(tag)]
        parts = resolved.split('-')
        if parts[0] in ('B', 'I') and parts[-1] in tracked:
            statistics[parts[-1]] += 1

    return statistics

In [8]:
# 資料轉換(bio->list)
def bio_to_list(raw_data, label_list):
    """Convert BIO-labelled tokens into a list of tag-annotated text lines.

    Labelled spans are wrapped as ``<label>text</label>`` (no separator is
    inserted between tokens). A ``['\\n']`` row ends the current line. The
    first completed ``Heading`` span of each line is emitted as its own
    entry, so a heading and the text after it become separate entries.

    The end of ``raw_data`` is treated like a ``'\\n'`` row: a labelled last
    token is closed instead of raising ``IndexError``, and text still pending
    at the end is appended instead of being dropped.

    Args:
        raw_data: Rows of ``[label, text]``; a sentence break is ``['\\n']``.
            Labels are ``'o'`` or ``'<prefix>-<kind>'`` with prefix ``B``
            for the first token of a span.
        label_list: Kept for interface compatibility; no longer consulted,
            since each row already carries its own label.

    Returns:
        list[str]: One annotated string per output line.
    """
    # NOTE(review): a 'B-x' token directly followed by another 'B-x' leaves
    # the first span unclosed ('<x>A<x>B</x>'). Kept as-is because later
    # stages split on both the opening and the closing tag.
    data = []
    text = ''
    heading_emitted = False  # whether this line already flushed its Heading
    total = len(raw_data)

    for index, row in enumerate(raw_data):
        tag = row[0]

        if tag == '\n':
            data.append(text)
            text = ''
            heading_emitted = False
            continue

        token = row[1]
        if tag == 'o':
            text += token
            continue

        # Past the last row, behave as if a sentence break followed.
        next_tag = raw_data[index + 1][0] if index + 1 < total else '\n'
        ends_here = next_tag == 'o' or next_tag == '\n'
        label = tag.split('-')[-1]

        if tag.split('-')[0] == 'B':
            # A span opens here; it also closes here when the next token
            # belongs to a different label kind.
            closes = ends_here or label != next_tag.split('-')[-1]
            text += '<' + label + '>' + token
        else:
            # Continuation token; the span closes when a new span begins next.
            closes = ends_here or next_tag.split('-')[0] == 'B'
            text += token

        if closes:
            text += '</' + label + '>'
            if label == 'Heading' and not heading_emitted:
                data.append(text)
                text = ''
                heading_emitted = True

    # Flush text that was not terminated by a '\n' row.
    if text:
        data.append(text)

    return data

In [9]:
# 抓label資料
def _tagged_pieces(tag, line):
    """Return, per regex match of ``<tag>...</tag>`` in ``line``, its non-blank pieces.

    The match is greedy, so it runs from the first opening tag to the last
    closing tag; it is then split on both tags, which means text lying
    between two tagged spans is returned as a piece as well.
    """
    opening = '<' + tag + '>'
    closing = '</' + tag + '>'
    groups = []
    for match in re.findall(opening + r'[\W\w\s]+' + closing, line):
        pieces = re.split(opening + '|' + closing, match)
        groups.append([piece for piece in pieces if piece and piece.strip()])
    return groups


def _collect_plain(tag, line, bucket, min_length):
    """Append to ``bucket`` each new piece of ``tag`` that is longer than
    ``min_length`` and contains no punctuation / non-word character."""
    for pieces in _tagged_pieces(tag, line):
        for row in pieces:
            if re.search(r'[，：<>、「」（）\W]+', row) is not None:
                continue
            if len(row) > min_length and row not in bucket:
                bucket.append(row)


def catch_label(data):
    """Collect the labelled texts found in tag-annotated lines.

    Regex literals are raw strings so that ``\\W``/``\\w``/``\\s`` are not
    parsed as (invalid) string escapes; the patterns themselves are the same.

    Args:
        data: Strings containing ``<label>...</label>`` markup.

    Returns:
        tuple: ``(bookmarks, currency, benefit, parameter, rate, hyperlinks)``,
        each a list of strings in order of first appearance. ``bookmarks``
        may contain repeats; the other lists hold unique values.
    """
    label_kinds = ('Heading', 'item', 'currency', 'benefit', 'parameter', 'rate')

    bookmarks = []   # Heading texts
    currency = []
    benefit = []
    parameter = []
    rate = []
    hyperlinks = []  # item texts that look like clause / schedule references

    for line in data:
        if not any('<' + kind + '>' in line for kind in label_kinds):
            continue

        # Heading: when a match holds pieces of 3-5 characters, only the last
        # such piece is kept; otherwise every piece of the match is kept.
        # Whitespace inside the kept pieces is removed.
        for pieces in _tagged_pieces('Heading', line):
            short_pieces = [piece for piece in pieces if 2 < len(piece) < 6]
            kept = [short_pieces[-1]] if short_pieces else pieces
            bookmarks.extend(''.join(piece.split()) for piece in kept)

        # item: keep pieces shaped like "第...條" or "附表...".
        for pieces in _tagged_pieces('item', line):
            for row in pieces:
                if row in hyperlinks:
                    continue
                if re.search(r'第[\W\w\s]+條', row) or re.search(r'附表[\W\w\s]+', row):
                    hyperlinks.append(row)

        # NOTE(review): the punctuation filter also rejects rate values that
        # contain '.' or '%' (e.g. '2.5%'); confirm this is intended.
        _collect_plain('currency', line, currency, 0)
        _collect_plain('benefit', line, benefit, 2)
        _collect_plain('parameter', line, parameter, 2)
        _collect_plain('rate', line, rate, 0)

    return bookmarks, currency, benefit, parameter, rate, hyperlinks

In [10]:
# 資料轉換(list->word)
def list_to_word(file_name, data, statistics, bookmarks, currency, benefit, parameter, rate, hyperlinks):
    """Render the annotated lines into a Word document and save it.

    The document starts with a centred title and a label-statistics table,
    followed by one paragraph per entry of ``data``. Entries that contain
    label markup are split on the tags and each piece is written as a
    hyperlink, a bookmark and/or a highlighted run depending on which of the
    label lists it appears in.

    Side effects: ``data`` is modified in place (tagged entries are replaced
    by their list of pieces), the file
    ``'new_data/output/' + file_name + '.docx'`` is written, and a completion
    message is printed.

    :param file_name: Base name of the output file (without extension).
    :param data: List of annotated strings; mutated as described above.
    :param statistics: Mapping of label name to count, shown in the table.
        NOTE(review): assumes seven entries, matching the hard-coded
        style column below - confirm.
    :param bookmarks: Heading texts used as bookmark names / link targets.
    :param currency: Texts highlighted as currency.
    :param benefit: Texts rendered as links to their "application" heading.
    :param parameter: Texts highlighted as parameter.
    :param rate: Texts highlighted as rate.
    :param hyperlinks: Item texts that may become links to clause bookmarks.
    :return: ``(data, hyperlinks, currency, benefit, parameter, rate)``.
    """
    font_sizes = 14
    bookmarks_data = bookmarks.copy() # working copy of the Heading bookmarks => entries are removed once placed
    check_heading = False # whether a clause Heading was already bookmarked in the current paragraph
    label_kind = ["Heading", "item", "currency", "benefit", "parameter", "rate"] # label kinds
    
    # Label statistics table
    statistics_df = pd.DataFrame({
        "標籤名稱":list(statistics.keys()),
        "標籤樣式":["Heading", "item", "currency", "benefit", "parameter", "rate", "other"],
        "總字數":list(statistics.values())
    })

    doc = Document()
    
    # Title of the statistics section
    p = doc.add_paragraph('')
    p.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run('標籤統計資料')
    run.font.name = 'Times New Roman'
    run._element.rPr.rFonts.set(qn('w:eastAsia'), u'標楷體')
    run.font.size = Pt(14)
    
    # One header row plus one row per statistics entry
    t = doc.add_table(statistics_df.shape[0]+1, statistics_df.shape[1], style="Medium List 1")
    
    # Header row: column names
    for j in range(statistics_df.shape[-1]):
        #t.cell(0,j).text = statistics_df.columns[j]
        t.cell(0,j).paragraphs[0].paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
        t.cell(0,j).vertical_alignment = WD_ALIGN_VERTICAL.CENTER
        run = t.cell(0,j).paragraphs[0].add_run(statistics_df.columns[j])
        run.font.name = 'Times New Roman'
        run._element.rPr.rFonts.set(qn('w:eastAsia'), u'標楷體')
        run.font.size = Pt(12)
        
    # Body rows; column 1 (the style sample) is highlighted in the colour
    # used for that label in the document body
    for i in range(statistics_df.shape[0]):
        for j in range(statistics_df.shape[-1]):
            #t.cell(i+1,j).text = str(statistics_df.values[i,j])
            t.cell(i+1,j).paragraphs[0].paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
            t.cell(i+1,j).vertical_alignment = WD_ALIGN_VERTICAL.CENTER
            run = t.cell(i+1,j).paragraphs[0].add_run(str(statistics_df.values[i,j]))
            run.font.name = 'Times New Roman'
            run._element.rPr.rFonts.set(qn('w:eastAsia'), u'標楷體')
            run.font.size = Pt(12)
            if str(statistics_df.values[i,j]) == "Heading" and j == 1:
                run.font.highlight_color = WD_COLOR_INDEX.YELLOW
            if str(statistics_df.values[i,j]) == "item" and j == 1:
                run.font.highlight_color = WD_COLOR_INDEX.BLACK
            if str(statistics_df.values[i,j]) == "currency" and j == 1:
                run.font.highlight_color = WD_COLOR_INDEX.BRIGHT_GREEN
            if str(statistics_df.values[i,j]) == "benefit" and j == 1:
                run.font.highlight_color = WD_COLOR_INDEX.GRAY_25
            if str(statistics_df.values[i,j]) == "parameter" and j == 1:
                run.font.highlight_color = WD_COLOR_INDEX.DARK_BLUE
            if str(statistics_df.values[i,j]) == "rate" and j == 1:
                run.font.highlight_color = WD_COLOR_INDEX.TURQUOISE
            # Every real label kind ("other" excluded) gets the red font colour
            if statistics_df.values[i,j] in label_kind and j == 1:
                run.font.color.rgb = RGBColor(231, 76, 60)
                
    
    doc.add_paragraph('')
    
    # Clause content
    for index in range(len(data)):
        if (data[index].find('<Heading>') > -1 or data[index].find('<item>') > -1 or data[index].find('<currency>') > -1 or
            data[index].find('<benefit>') > -1 or data[index].find('<parameter>') > -1 or data[index].find('<rate>') > -1):

            # Split the raw line on every tag and drop blank pieces;
            # data[index] becomes a list from here on
            data[index] = re.split('<item>|</item>|<Heading>|</Heading>|'+
                                   '<currency>|</currency>|<benefit>|</benefit>|'+
                                   '<parameter>|</parameter>|<rate>|</rate>',data[index])
            data[index] = list(filter(not_empty, data[index]))

            # Assemble the paragraph piece by piece
            p = doc.add_paragraph('')
            check_heading = False
            for row_index in range(len(data[index])):
                if (data[index][row_index] in hyperlinks):
                    # NOTE(review): data[index-1] is the last entry when
                    # index is 0, and [0] is a single character when that
                    # entry was never split (still a str) - confirm intent.
                    tmp_data = re.findall('【[\W\w\s]+】',data[index-1][0])
                    if ((check_heading == True) or (len(tmp_data) == 0)) and data[index][row_index] in bookmarks:
                        p = add_hyperlink(p, '#'+data[index][row_index], data[index][row_index], '#E74C3C', True, 'black',
                                      'Times New Roman', '標楷體', str(font_sizes * 2), bookmarks)
                        #run.font.highlight_color = WD_COLOR_INDEX.BLUE
                        #run.font.color.rgb = RGBColor(231, 76, 60)
                        # Empty run so the shared formatting below has a target
                        run = p.add_run('')
                    elif ((check_heading == True) or (len(tmp_data) == 0)) and '【'+data[index][row_index]+'】' in bookmarks:
                        # NOTE(review): `run` is not reassigned in this branch,
                        # so the formatting below is applied to whichever run
                        # was created last - confirm this is intended.
                        p = add_hyperlink(p, '#【'+data[index][row_index]+'】', data[index][row_index], '#E74C3C', True, 'black',
                                      'Times New Roman', '標楷體', str(font_sizes * 2), bookmarks)
                    else:
                        run = p.add_run(data[index][row_index])
                elif (data[index][row_index] in benefit):
                    # Candidate heading names for this benefit, tried in order
                    tmp_data_1 = '【'+data[index][row_index]+'(並加計利息)的申請】'
                    tmp_data_2 = '【'+data[index][row_index]+'（並加計利息）的申請】'
                    tmp_data_3 = '【'+data[index][row_index]+'(並加計利息)的申領】'
                    tmp_data_4 = '【'+data[index][row_index]+'（並加計利息）的申領】'
                    tmp_data_5 = '【'+data[index][row_index]+'的申請】'
                    tmp_data_6 = '【'+data[index][row_index]+'的申領】'
                    
                    if tmp_data_1 in bookmarks:
                        p = add_hyperlink(p, '#'+tmp_data_1, data[index][row_index], '#E74C3C', True,
                                          'lightGray', 'Times New Roman', '標楷體', str(font_sizes * 2), bookmarks)
                    elif tmp_data_2 in bookmarks:
                        p = add_hyperlink(p, '#'+tmp_data_2, data[index][row_index], '#E74C3C', True,
                                          'lightGray', 'Times New Roman', '標楷體', str(font_sizes * 2), bookmarks)
                    elif tmp_data_3 in bookmarks:
                        p = add_hyperlink(p, '#'+tmp_data_3, data[index][row_index], '#E74C3C', True,
                                          'lightGray', 'Times New Roman', '標楷體', str(font_sizes * 2), bookmarks)
                    elif tmp_data_4 in bookmarks:
                        p = add_hyperlink(p, '#'+tmp_data_4, data[index][row_index], '#E74C3C', True,
                                          'lightGray', 'Times New Roman', '標楷體', str(font_sizes * 2), bookmarks)
                    elif tmp_data_5 in bookmarks:
                        p = add_hyperlink(p, '#'+tmp_data_5, data[index][row_index], '#E74C3C', True,
                                          'lightGray', 'Times New Roman', '標楷體', str(font_sizes * 2), bookmarks)
                    else:
                        # Fallback target, used even when no such bookmark exists
                        p = add_hyperlink(p, '#'+tmp_data_6, data[index][row_index], '#E74C3C', True,
                                          'lightGray', 'Times New Roman', '標楷體', str(font_sizes * 2), bookmarks)
#                     p = add_hyperlink(p, '#【'+data[index][row_index]+'的申領】', data[index][row_index], '#E74C3C', True,
#                                       'lightGray', 'Times New Roman', '標楷體', str(font_sizes * 2), bookmarks)
                    run = p.add_run('')
                else:
                    # Headings about applying for / deducting a benefit link
                    # to the '【保險給付】' bookmark
                    tmp_data_all = list()
                    tmp_data_1 = re.findall('【[\W\w\s]+的申領】',data[index][row_index])
                    tmp_data_all.extend(tmp_data_1)
                    tmp_data_2 = re.findall('【[\W\w\s]+的申請】',data[index][row_index])
                    tmp_data_all.extend(tmp_data_2)
                    tmp_data_3 = re.findall('【[\W\w\s]+的扣除】',data[index][row_index])
                    tmp_data_all.extend(tmp_data_3)
                    if (len(tmp_data_all) > 0) and (data[index][row_index] in bookmarks_data):
                        p = add_hyperlink(p, '#【保險給付】', data[index][row_index], '#E74C3C', True,
                                          'yellow', 'Times New Roman', '標楷體', str(font_sizes * 2), bookmarks)
                        run = p.add_run('')
                    else:
                        run = p.add_run(data[index][row_index])

                # Place a bookmark for Heading pieces that have not been used yet
                if (data[index][row_index] in bookmarks_data):
                    tmp_data_1 = re.findall('第[\W\w\s]+條',data[index][row_index])
                    tmp_data_2 = re.findall('【[\W\w\s]+】',data[index][row_index])
                    if len(tmp_data_1) > 0:
                        # Clause-number heading: bookmarked only when the first
                        # piece of the previous entry is a 【...】 title and no
                        # clause heading was bookmarked yet in this paragraph
                        tmp_data_1 = re.findall('【[\W\w\s]+】',data[index-1][0])
                        if (len(tmp_data_1) > 0) and (check_heading == False):
                            p = add_bookmark(paragraph= p, bookmark_text="", bookmark_name=data[index][row_index])
                            #run = p.add_run(data[index][row_index])
                            run.font.highlight_color = WD_COLOR_INDEX.YELLOW
                            run.font.color.rgb = RGBColor(231, 76, 60)
                            bookmarks_data.remove(data[index][row_index])
                            check_heading = True
                    
                    if len(tmp_data_2) > 0:
                        # 【...】 title heading: always bookmarked
                        p = add_bookmark(paragraph= p, bookmark_text="", bookmark_name=data[index][row_index])
                        #run = p.add_run(data[index][row_index])
                        run.font.highlight_color = WD_COLOR_INDEX.YELLOW
                        run.font.color.rgb = RGBColor(231, 76, 60)
                        bookmarks_data.remove(data[index][row_index])  

                if data[index][row_index] in currency:
                    run.font.highlight_color = WD_COLOR_INDEX.BRIGHT_GREEN
                    run.font.color.rgb = RGBColor(231, 76, 60)

#                 if data[index][row_index] in benefit:
#                     run.font.highlight_color = WD_COLOR_INDEX.GRAY_25
#                     run.font.color.rgb = RGBColor(231, 76, 60)

                if data[index][row_index] in parameter:
                    run.font.highlight_color = WD_COLOR_INDEX.DARK_BLUE
                    run.font.color.rgb = RGBColor(231, 76, 60)
                    
                if data[index][row_index] in rate:
                    run.font.highlight_color = WD_COLOR_INDEX.TURQUOISE
                    run.font.color.rgb = RGBColor(231, 76, 60)

                # Common font settings for the run handled in this iteration
                run.font.name = 'Times New Roman'
                run._element.rPr.rFonts.set(qn('w:eastAsia'), u'標楷體')
                run.font.size = Pt(14)
        else:
            # Untagged line: written as a single plain run
            p = doc.add_paragraph('')
            run = p.add_run(data[index])
            run.font.name = 'Times New Roman'
            run._element.rPr.rFonts.set(qn('w:eastAsia'), u'標楷體')
            run.font.size = Pt(14)
    
    # Save the document
    doc.save('new_data/output/' + file_name + '.docx')
    print(file_name + '.docx 轉換完成')
    return data, hyperlinks, currency, benefit, parameter, rate

In [11]:
# label清單建置
def label_checklist(doc, label_name, label_data):
    """Write a titled checklist of label texts into ``doc``.

    Adds one paragraph holding ``label_name + '清單：'`` and a second paragraph
    in which every entry of ``label_data`` is its own run, separated by '、'.

    :param doc: Document object that receives the two paragraphs.
    :param label_name: Label kind shown in the title line.
    :param label_data: Sequence of strings to list.
    """
    def write_run(paragraph, content):
        # Every run of the checklist shares the same font settings.
        piece = paragraph.add_run(content)
        piece.font.name = 'Times New Roman'
        piece._element.rPr.rFonts.set(qn('w:eastAsia'), u'標楷體')
        piece.font.size = Pt(14)

    write_run(doc.add_paragraph(''), label_name + '清單：')

    listing = doc.add_paragraph('')
    final_position = len(label_data) - 1
    for position, entry in enumerate(label_data):
        # The separator follows every entry except the last one.
        write_run(listing, entry if position == final_position else entry + "、")

In [12]:
# item排序
def hyperlinks_sort(hyperlinks, bookmarks):
    """Order item references: clauses by number first, then schedules by number.

    Entries containing "附表" followed by at least one character are treated
    as schedules and always kept. Every other entry is kept only when it also
    appears in ``bookmarks``. Labels missing from the 1-99 lookup tables of
    ``clause_index()`` (for example "附表一之一") are placed after the known
    ones in their original order instead of raising ``KeyError``.

    :param hyperlinks: Item texts to sort.
    :param bookmarks: Heading texts; clause entries not found here are dropped.
    :return: New list with the sorted clauses followed by the sorted schedules.
    """
    clause_dict, schedule_dict = clause_index()
    clauses = []
    schedules = []

    for hyperlink in hyperlinks:
        if re.search(r'附表[\W\w\s]+', hyperlink):
            schedules.append(hyperlink)
            continue
        # NOTE(review): the original call replaced a character with a visually
        # identical one. Presumably the source was the CJK compatibility
        # ideograph U+F9D1, mapped to the regular U+516D - confirm.
        hyperlink = hyperlink.replace('\uf9d1', '六')
        if hyperlink in bookmarks:
            clauses.append(hyperlink)

    # Sort key for unknown labels: larger than every table value. list.sort
    # is stable, so unknown labels keep their relative order at the end.
    unknown_rank = len(clause_dict) + 1

    clauses.sort(key=lambda label: clause_dict.get(label, unknown_rank))
    schedules.sort(key=lambda label: schedule_dict.get(label, unknown_rank))

    return clauses + schedules

In [13]:
# label清單整合
def data_label(file_name, bookmarks, hyperlinks, currency, benefit, parameter, rate):
    """Write the label checklists to ``new_data/output/<file_name>(label).docx``.

    The document lists, in order, the Heading, item (sorted), currency and
    benefit texts, with an empty paragraph between consecutive lists.
    ``parameter`` and ``rate`` are accepted but not written.
    """
    doc = Document()
    ordered_items = hyperlinks_sort(hyperlinks, bookmarks)

    sections = (
        ('Heading', bookmarks),
        ('item', ordered_items),
        ('currency', currency),
        ('benefit', benefit),
    )
    for position, (title, entries) in enumerate(sections):
        if position > 0:
            # Blank paragraph between lists, none after the last one.
            doc.add_paragraph('')
        label_checklist(doc, title, entries)

    target = 'new_data/output/' + file_name + '(label).docx'
    doc.save(target)
    print(file_name + '(label).docx 轉換完成\n')

# 總程式執行

In [14]:
def bio_to_word(file_path, file_name):
    """Run the whole BIO -> Word conversion for one file.

    Reads the tab-separated BIO file, computes label statistics, converts the
    tokens to annotated lines, extracts the label lists, then writes the main
    document and the label checklist document.

    :param file_path: Path of the UTF-8 BIO file to read.
    :param file_name: Base name used for the two output documents.
    :return: ``(list_data, word_data, bookmarks, hyperlinks, currency,
        benefit, parameter, rate, statistics_data)``.
    """
    # Read the BIO rows: [label, text] per token, ['\n'] per blank line.
    raw_data = []
    with open(file_path, mode="r", encoding="utf-8") as handle:
        for line in handle:
            if line == '\n':
                raw_data.append([line])
                continue
            fields = line.split('\t')
            # First field is the label, last field (up to the newline) the text.
            raw_data.append([fields[0], fields[-1].split('\n')[0]])

    # Distinct labels, in order of first appearance.
    label_list = []
    for row in raw_data:
        tag = row[0]
        if tag != '\n' and tag not in label_list:
            label_list.append(tag)

    statistics_data = data_statistics(raw_data, label_list)
    list_data = bio_to_list(raw_data, label_list)

    # Each consumer gets its own shallow copy, so list_data itself is not
    # affected by list_to_word's in-place edits of its argument.
    bookmarks, currency, benefit, parameter, rate, hyperlinks = catch_label(list_data.copy())
    word_data = list_to_word(
        file_name, list_data.copy(), statistics_data,
        bookmarks, currency, benefit,
        parameter, rate, hyperlinks,
    )

    # Checklist document for Heading / item / currency / benefit.
    data_label(file_name, bookmarks, hyperlinks, currency, benefit, parameter, rate)

    return (list_data, word_data, bookmarks, hyperlinks, currency,
            benefit, parameter, rate, statistics_data)

# BIO轉Word

In [15]:
def catch_bio():
    """List the BIO files directly inside ``new_data/output``.

    A file counts as BIO when the text after its last '.' is ``bio``. The
    returned name is everything before that last dot, so names that contain
    further dots (``v1.2.bio`` -> ``v1.2``) are no longer cut at the first one.

    :return: ``(file_name_list, file_path_list)``: names without extension
        and the matching ``new_data/output/<file>`` paths, in the same order.
        Both are empty when the directory does not exist.
    """
    # Only the top level is inspected; a missing directory yields no files.
    _, _, file_list = next(os.walk("new_data/output"), (None, [], []))

    file_name_list = []
    file_path_list = []
    for file in file_list:
        stem, dot, extension = file.rpartition(".")
        if extension != "bio":
            continue
        # Without any dot, rpartition leaves the whole name in `extension`.
        file_name_list.append(stem if dot else file)
        file_path_list.append("new_data/output/" + file)

    return file_name_list, file_path_list

In [16]:
# file_name_list, file_path_list = catch_bio()

In [17]:
# for file_path, file_name in zip(file_path_list, file_name_list):
#     list_data, word_data, bookmarks, hyperlinks, currency, benefit, parameter, rate, statistics = bio_to_word(file_path, file_name)
# print("----------資料全部轉換完成-----------")

中國人壽達美樂外幣利率變動型終身壽險（美元）_銷售條款(Final).docx 轉換完成
中國人壽達美樂外幣利率變動型終身壽險（美元）_銷售條款(Final)(label).docx 轉換完成

中國人壽好有利利率變動型終身壽險－定期給付型_保單條款(final)(更新)_1090610.docx 轉換完成
中國人壽好有利利率變動型終身壽險－定期給付型_保單條款(final)(更新)_1090610(label).docx 轉換完成

中國人壽鑫美利外幣利率變動型終身壽險(美元)_銷售條款.docx 轉換完成
中國人壽鑫美利外幣利率變動型終身壽險(美元)_銷售條款(label).docx 轉換完成

中國人壽新享富增利利率變動型終身壽險(定期給付型) - 銷售條款.docx 轉換完成
中國人壽新享富增利利率變動型終身壽險(定期給付型) - 銷售條款(label).docx 轉換完成

----------資料全部轉換完成-----------
