In [1]:
!pip install -q python-docx

In [2]:
import os
import re
import docx
from docx import Document
from docx.document import Document as _Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph

In [3]:
def iter_block_items(parent):
    """
    Yield the paragraph and table children of *parent* in document order.

    *parent* has to be a document object (``_Document``) or a table cell
    (``_Cell``); a cell can itself hold paragraphs and tables. Each yielded
    value is a ``Paragraph`` or a ``Table`` built from the child element and
    *parent*. Child elements of any other kind are skipped.

    Because this is a generator, the ``ValueError`` for an unsupported
    *parent* is raised when iteration starts, not when the function is called.
    """
    if isinstance(parent, _Document):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    else:
        raise ValueError("something's not right")

    # (XML element class, wrapper class) pairs, checked in this order.
    wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
    for node in container.iterchildren():
        for element_type, wrapper in wrappers:
            if isinstance(node, element_type):
                yield wrapper(node, parent)
                break

In [4]:
# Predicate for the built-in filter(): filter() applies it to every element
# and keeps the element only when the returned value is truthy.
def not_empty(s):
    """Return a truthy value only when *s* contains non-whitespace text.

    Falsy input (``None``, ``""``) is returned unchanged; any other string is
    returned stripped, which is ``""`` for whitespace-only input.
    """
    if not s:
        return s
    return s.strip()

In [5]:
# Save data
def save_file(file_name, line, check_new_save, output_dir='new_data/data_process/'):
    """Write one record to ``<output_dir>/<file_name>.bio``.

    Every element of *line* is written on its own row, followed by one blank
    row that terminates the record. When *line* is a string this produces one
    character per row.

    Args:
        file_name: Output file name without the ``.bio`` extension.
        line: Iterable of strings to write; a plain string is iterated
            character by character.
        check_new_save: Falsy to create/truncate the file (first record of a
            document), truthy to append to the existing file.
        output_dir: Directory that receives the file. It is created,
            including missing parent directories, when it does not exist.
    """
    # makedirs (unlike mkdir) also creates the missing parent 'new_data/',
    # and exist_ok removes the separate isdir() check.
    os.makedirs(output_dir, exist_ok=True)
    save_file_name = os.path.join(output_dir, file_name + '.bio')

    mode = 'a' if check_new_save else 'w'
    # The context manager closes the file even when a write raises.
    with open(save_file_name, mode, encoding="utf-8") as text_file:
        for sen in line:
            text_file.write(sen + '\n')
        text_file.write('\n')

In [6]:
# Split data
def split_part(line, next_line):
    """Split *line* after its last "。" and carry the remainder forward.

    The text up to and including the last "。" is emitted; whatever follows
    it is prepended to *next_line* so the unfinished sentence continues in
    the next paragraph.

    Args:
        line: Accumulated paragraph text to split.
        next_line: Text that follows *line* in the document.

    Returns:
        A ``(output_line, new_line)`` tuple. ``output_line`` holds the
        complete sentences of *line*, each terminated by "。" (``""`` when
        *line* has no "。"). ``new_line`` is the unfinished tail of *line*
        followed by *next_line*.
    """
    head, sep, tail = line.rpartition("。")
    if not sep:
        # No sentence terminator (this includes an empty line): nothing can
        # be emitted yet, so everything is carried forward.
        return "", line + next_line

    # Blank segments (e.g. from a doubled "。。") are dropped.
    sentences = [part for part in head.split("。") if part.strip()]
    output_line = "".join(part + "。" for part in sentences)

    # When *line* ends with "。" the tail is empty: every sentence is complete
    # and stays in output_line with its terminator instead of being merged,
    # terminator-less, into the next paragraph.
    carried = tail if tail.strip() else ""
    return output_line, carried + next_line

In [7]:
def _collect_docx_text(doc):
    """Return the text of each top-level paragraph and table cell of *doc*, in document order."""
    texts = []
    for block in iter_block_items(doc):
        if isinstance(block, Paragraph):
            texts.append(block.text)
        else:
            # iter_block_items yields only Paragraph or Table objects.
            for row in block.rows:
                for cell in row.cells:
                    texts.append(cell.text)
    return texts


def _chunk_lines(lines, max_len):
    """Group *lines* into chunks: a new chunk starts at each 【…】 title, and overflow is split via split_part."""
    # Raw strings: '\W', '\s' etc. are invalid escapes in a normal string
    # literal and trigger a SyntaxWarning on recent Python versions.
    title_re = re.compile(r'【[\W\w\S\s]+】')
    footer_re = re.compile(r'【[A-Za-z0-9\s]+】')

    chunks = []
    current = ""
    for line in lines:
        line = line.strip()
        if not line:
            continue

        if title_re.match(line):
            # A bracketed heading made only of ASCII letters, digits and
            # whitespace is dropped entirely (presumably a page footer).
            if footer_re.match(line):
                continue
            # Do not emit an empty chunk before the first title.
            if current:
                chunks.append(current)
            current = line
        elif current and len(current) + len(line) > max_len:
            # Emit the complete sentences gathered so far; the unfinished
            # tail continues with the new line.
            output_line, current = split_part(current, line)
            if output_line != '':
                chunks.append(output_line)
        else:
            current += line

    if current:
        chunks.append(current)
    return chunks


def word_to_bio(file_path, file_name, max_len=500):
    """Convert one Word document into a ``.bio`` file.

    The text of every top-level paragraph and table cell is read, broken on
    embedded newlines, and regrouped into chunks: a line starting with a
    ``【…】`` title opens a new chunk, and when appending a line would push
    the current chunk past *max_len* characters the chunk is split with
    ``split_part``. Each chunk is then written with ``save_file`` (the first
    one truncates the output file, later ones append). If the document holds
    no text, nothing is written.

    Args:
        file_path: Path of the ``.docx`` file to read.
        file_name: Output name (without extension) passed to ``save_file``.
        max_len: Chunk length, in characters, above which a split is attempted.
    """
    doc = docx.Document(file_path)

    # Break the Word text on embedded newlines and drop blank pieces.
    text_list = []
    for text in _collect_docx_text(doc):
        text_list.extend(filter(not_empty, text.split("\n")))

    # Save to the .bio file: the first chunk creates it, the rest append.
    check_new_save = False
    for chunk in _chunk_lines(text_list, max_len):
        save_file(file_name, chunk, check_new_save)
        check_new_save = True

    print("轉換完成："+file_name + ".bio\n")

In [8]:
def catch_word(input_dir="new_data/input"):
    """Find the Word (.docx) files located directly inside *input_dir*.

    Sub-directories are not searched. A missing directory gives two empty
    lists. Word lock files (``~$name.docx``) are skipped. Results are sorted
    by file name so the order is reproducible.

    Args:
        input_dir: Directory to scan.

    Returns:
        A ``(file_path_list, file_name_list)`` tuple of parallel lists: the
        path of each document and its file name without the extension.
    """
    # Only the first os.walk() entry is needed: it lists the files directly
    # in input_dir. os.walk() yields nothing for a missing directory.
    _, _, files = next(os.walk(input_dir), (input_dir, [], []))

    file_path_list = []
    file_name_list = []
    for file in sorted(files):
        # splitext removes only the final extension, so "report.v2.docx"
        # keeps the name "report.v2" and cannot collide with "report.docx".
        stem, ext = os.path.splitext(file)
        if ext.lower() != ".docx":
            continue
        if file.startswith("~$"):
            # Temporary lock file Word creates next to an open document;
            # it is not a readable .docx package.
            continue
        file_path_list.append(os.path.join(input_dir, file))
        file_name_list.append(stem)

    return file_path_list, file_name_list

In [9]:
# file_path_list, file_name_list = catch_word()

In [12]:
# for file_path, file_name in zip(file_path_list, file_name_list):
#     word_to_bio(file_path, file_name)
# print("----------資料全部轉換完成-----------")