In [2]:
import json
import os
import re
from typing import Iterator

from country_named_entity_recognition import find_countries
from pptx import Presentation

In [3]:
# Absolute path of one sample deck, spelled as adjacent raw literals so each piece stays readable.
ppt_file = (
    r"C:\Users\fangning.zheng\Documents\weekly summary\week05"
    r"\nlp-experiments-gists\fangzhen\langchain_streamlit_chatbot"
    r"\src\langchain_streamlit_chatbot\data_ppt"
    r"\4COffshore_MarketOverviewReport_2021_Q1.pptx"
)

In [25]:
def iter_to_nonempty_table_cells(tbl: object, text_len_threshold: int) -> Iterator[str]:
    """
    Yield the text of the table cells that are worth keeping, one cell at a time.

    Only tables with fewer than 6 rows or fewer than 6 columns are scanned; a table
    that is at least 6x6 yields nothing. Cells are visited row by row, and a
    non-blank cell is kept when any of the following holds:

    * ``find_countries`` returns a non-empty result for its text,
    * it has more than ``text_len_threshold`` space-separated words and its font
      size is above 8 pt,
    * its text ends with a period.

    :param tbl: 'pptx.table.Table'
    :param text_len_threshold: minimum number of space-separated words (exclusive)
        for the font-size rule to apply

    :return: generator over the stripped text of each kept cell, each followed by a newline
    """
    row_count = len(tbl.rows)
    col_count = len(tbl.columns)
    if row_count >= 6 and col_count >= 6:
        return

    for ridx in range(row_count):
        for cidx in range(col_count):
            cell = tbl.cell(ridx, cidx)
            txt = str(cell.text).strip()
            if not txt:
                continue

            # Font size is read from the runs of the first paragraph only (not robust);
            # the last run wins. Reset it for every cell: without this, a paragraph
            # with no runs left the name unbound or carried over the previous cell's size.
            font_size = 0
            for run in cell.text_frame.paragraphs[0].runs:
                try:
                    font_size = run.font.size.pt
                except AttributeError:
                    # run.font.size has no ``pt`` (e.g. it is None): treat as unknown
                    font_size = 0

            has_country = bool(find_countries(txt))
            is_long_and_legible = len(txt.split(" ")) > text_len_threshold and font_size > 8
            if has_country or is_long_and_legible or txt.endswith('.'):
                yield txt + '\n'

In [4]:

def save_to_json(path: str, item: dict, item_name: str) -> None:
    """
    Dump ``item`` as 4-space-indented JSON into the file ``item_name`` under ``path``.

    The target is built by plain string concatenation: a literal prefix made of two
    backslashes, a question mark and a backslash (presumably the Windows
    extended-length path marker), then ``path``, a backslash and ``item_name``.
    No file extension is appended.

    :param path: directory part of the target, without a trailing backslash
    :param item: JSON-serialisable dictionary to store
    :param item_name: file name to create (or overwrite) inside ``path``
    """
    target = "".join(("\\\\?\\", path, "\\", item_name))
    with open(target, "w") as handle:
        json.dump(item, handle, indent=4)

In [67]:
from pptx.enum.shapes import MSO_SHAPE_TYPE 

ppt_path = r"""C:\Users\fangning.zheng\Documents\weekly summary\week05\nlp-experiments-gists\fangzhen\langchain_streamlit_chatbot\src\langchain_streamlit_chatbot\data_ppt"""

# Decks written by this cell are named "ungroup_<original>"; leave them out so that
# re-running the cell does not pile up "ungroup_ungroup_..." copies.
files = [x for x in os.listdir(ppt_path) if x.endswith(".pptx") and not x.startswith("ungroup_")]

# ungroup all the grouped shapes so that the parser can recognize the text frames within a group.
for file in files:
    presentation = Presentation(ppt_path + "\\" + file)
    for slide in presentation.slides:
        # Work on a snapshot of the groups instead of mutating the shape tree while
        # iterating over it, and repeat until none is left: members lifted out of a
        # group can be groups themselves and only show up on the next pass.
        while True:
            groups = [shape for shape in slide.shapes if shape.shape_type == MSO_SHAPE_TYPE.GROUP]
            if not groups:
                break
            for shape in groups:
                group = shape.element
                parent = group.getparent()
                index = parent.index(group)
                # Move every child of the group element into the parent, keeping the
                # group's position and the children's order.
                # NOTE(review): children are moved unchanged, so group-level property
                # elements travel along and member offsets are not translated out of
                # the group's coordinate space - confirm if positions matter downstream.
                for member in list(group):
                    parent.insert(index, member)
                    index += 1
                parent.remove(group)
    # Saved next to the source deck; note that os.chdir changes the kernel's working
    # directory for every cell that runs afterwards.
    os.chdir(ppt_path)
    presentation.save("ungroup_" + file)

In [35]:
# parse and store text separately
#
# For every .pptx in ppt_path: walk the slides, keep the text boxes and tables that pass
# the length / font-size / trailing-period rules below, write one JSON file per kept
# shape into a folder named after the deck, and write a <deck>.txt transcript.

line_len_threshold = 4
text_len_threshold = 4
ppt_path = r"""C:\Users\fangning.zheng\Documents\weekly summary\week05\nlp-experiments-gists\fangzhen\langchain_streamlit_chatbot\src\langchain_streamlit_chatbot\data_ppt"""

files = [x for x in os.listdir(ppt_path) if x.endswith(".pptx")] 

for file in files:
    file_name = str(file).split(".")[0]
    folder_name = ppt_path + "\\" + file_name
    
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)

    # The year comes from the file name, so resolve it once per deck; fall back to an
    # empty string instead of crashing when the name carries no 4-digit year.
    year_match = re.match(r'.*([1-3][0-9]{3})', file_name)
    doc_year = year_match.group(1) if year_match else ""

    prs = Presentation(ppt_path + "\\" + str(file))
    output_file = ""

    for i, slide in enumerate(prs.slides):
        output_page = ""
        if slide.shapes:
            output_file += "\n\n" + "---------- Page " + str(i+1) + "------------" + "\n"
            
            # sort the shapes by coord for each page (left edge first, then top edge)
            shapes_coord = []
            for shape in slide.shapes:
                # find the coordinates of shape
                x = round(shape.left.inches,2)
                y = round(shape.top.inches,2)
                width = round(shape.width.inches,2)
                height = round(shape.height.inches,2)
                # append shape
                shapes_coord.append((x, y, width, height, shape))
            # sort the list of shapes
            shapes_coord_sort = sorted(shapes_coord, key=lambda element: (element[0], element[1]))

            item_counter = 1
            for item in shapes_coord_sort:
                shape = item[4]
                # page number is zero-padded to two digits; the counter only advances for kept shapes
                item_name = "pg{:02d}_text{}_{}".format(i + 1, item_counter, file_name)

                # text_type stays None when the shape is rejected
                text_type = None
                kept_text = ""

                # if the shape is a text frame with text (at least two space-separated words)
                if hasattr(shape, "text") and len(str(shape.text).strip().split(" ")) > 1:
                    # get the text from the text frame
                    shape_text = shape.text.strip() + "\n"
                    kept_text = shape_text

                    # get the font size (not robust): last run of the first paragraph.
                    # Reset per shape so a paragraph without runs neither raises NameError
                    # nor inherits the previous shape's size; 0 means "size not available".
                    font_size = 0
                    for run in shape.text_frame.paragraphs[0].runs:
                        try:
                            font_size = run.font.size.pt
                        except AttributeError:
                            # run.font.size has no ``pt`` (e.g. it is None)
                            font_size = 0

                    # get the length of string for each line in the text frame
                    shape_len = [(len(line.strip().split(" "))) for line in (shape_text.strip().split("\n"))]
                    # output_file only holds earlier pages, so this filters text repeated from them
                    is_new = shape_text not in output_file

                    # if length is greater than threshold
                    if (max(shape_len) > text_len_threshold) and (font_size > 8 or font_size == 0) and is_new:
                        text_type = "text"
                    # if length not greater than threshold, check if the text is a title
                    elif (font_size > 20 or font_size == 0) and is_new:
                        text_type = "title"
                    # last character before the appended newline is a period
                    elif shape_text[-2] == '.':
                        text_type = "text"

                # for parsing table
                elif shape.has_table:
                    table_text = "".join(iter_to_nonempty_table_cells(shape.table, text_len_threshold))
                    kept_text = table_text
                    shape_len = [(len(line.strip().split(" "))) for line in (table_text.strip().split("\n"))]

                    if (len(shape_len) > line_len_threshold or max(shape_len) > text_len_threshold) and table_text not in output_file:
                        text_type = "table"

                if text_type is not None:
                    output_page += "---------" + item_name + "----------\n"
                    output_page += kept_text

                    dict_shape = {"text": kept_text, "text_type": text_type, "coordinates": item[:4], "doc_name": file_name, "doc_year": doc_year, "doc_quarter": file_name[-1], "page": str(i+1)}
                    save_to_json(path=folder_name, item=dict_shape, item_name=item_name)
                    item_counter += 1

        output_file += output_page

    # Write the transcript once per deck rather than rewriting it after every slide;
    # decks without slides still produce no file.
    if len(prs.slides) > 0:
        os.chdir(ppt_path)
        with open(file_name+".txt", "w", encoding="utf-8") as text_file:
            text_file.write(str(output_file))

In [29]:
# parse and store text together within one page
#
# For every .pptx in ppt_path: walk the slides, concatenate the text boxes and tables
# that pass the rules below into one string per slide, write one JSON file per slide
# that has text, and write a <deck>.txt transcript.

line_len_threshold = 4
text_len_threshold = 4
ppt_path = r"""C:\Users\fangning.zheng\Documents\weekly summary\week05\nlp-experiments-gists\fangzhen\langchain_streamlit_chatbot\src\langchain_streamlit_chatbot\data_ppt"""

files = [x for x in os.listdir(ppt_path) if x.endswith(".pptx")] 

for file in files:
    file_name = str(file).split(".")[0]
    folder_name = ppt_path + "\\" + file_name
    
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)

    # The year comes from the file name, so resolve it once per deck; fall back to an
    # empty string instead of crashing when the name carries no 4-digit year.
    year_match = re.match(r'.*([1-3][0-9]{3})', file_name)
    doc_year = year_match.group(1) if year_match else ""

    prs = Presentation(ppt_path + "\\" + str(file))
    output_file = ""

    for i, slide in enumerate(prs.slides):
        output_page = ""
        output_text_page = ""

        if slide.shapes:            
            # sort the shapes by coord for each page (left edge first, then top edge)
            shapes_coord = []
            for shape in slide.shapes:
                # find the coordinates of shape
                x = round(shape.left.inches,2)
                y = round(shape.top.inches,2)
                width = round(shape.width.inches,2)
                height = round(shape.height.inches,2)
                # append shape
                shapes_coord.append((x, y, width, height, shape))
            # sort the list of shapes
            shapes_coord_sort = sorted(shapes_coord, key=lambda element: (element[0], element[1]))

            for item in shapes_coord_sort:
                shape = item[4]
                # text contributed by this shape; stays empty when the shape is rejected
                kept_text = ""

                # if the shape is a text frame with text (at least two space-separated words)
                if hasattr(shape, "text") and len(str(shape.text).strip().split(" ")) > 1:
                    # get the text from the text frame, and replace multiple newlines into one newline
                    shape_text_ = re.sub("\n+", "\n", shape.text.strip())
                    shape_text = shape_text_ + "\n" + "\n"

                    # get the font size (not robust): last run of the first paragraph.
                    # Reset per shape so a paragraph without runs neither raises NameError
                    # nor inherits the previous shape's size; 0 means "size not available".
                    font_size = 0
                    for run in shape.text_frame.paragraphs[0].runs:
                        try:
                            font_size = run.font.size.pt
                        except AttributeError:
                            # run.font.size has no ``pt`` (e.g. it is None)
                            font_size = 0

                    # get the length of string for each line in the text frame
                    shape_len = [(len(line.strip().split(" "))) for line in (shape_text.strip().split("\n"))]
                    # output_file only holds earlier pages, so this filters text repeated from them
                    is_new = shape_text not in output_file

                    # long enough with a legible (or unknown) font size
                    is_body = (max(shape_len) > text_len_threshold) and (font_size > 8 or font_size == 0) and is_new
                    # short, but large (or unknown) font size: treated as a title
                    is_title = (font_size > 20 or font_size == 0) and is_new
                    # Sentence rule: test the text itself. shape_text ends with two
                    # newlines, so indexing it with [-2] always hit a newline and this
                    # rule could never match.
                    is_sentence = shape_text_.endswith(".")

                    if is_body or is_title or is_sentence:
                        kept_text = shape_text

                # for parsing table
                elif shape.has_table:
                    table_text = "".join(iter_to_nonempty_table_cells(shape.table, text_len_threshold))
                    shape_len = [(len(line.strip().split(" "))) for line in (table_text.strip().split("\n"))]

                    if (len(shape_len) > line_len_threshold or max(shape_len) > text_len_threshold) and table_text not in output_file:
                        kept_text = table_text

                output_page += kept_text
                output_text_page += kept_text

        if output_text_page.strip():
            # store into json file; the page number is zero-padded to two digits
            item_name = "pg{:02d}_text_{}".format(i + 1, file_name)
            dict_shape = {"text": output_text_page, "text_type": "text", "doc_name": file_name, "doc_year": doc_year, "doc_quarter": file_name[-1], "page": str(i+1)}
            save_to_json(path=folder_name, item=dict_shape, item_name=item_name)
            
            output_file += "\n\n" + "---------- Page " + str(i+1) + "------------" + "\n"
            output_file += output_page

    # Write the transcript once per deck rather than rewriting it for every page;
    # output_file is non-empty exactly when at least one page contributed text.
    if output_file:
        os.chdir(ppt_path)
        with open(file_name+".txt", "w", encoding="utf-8") as text_file:
            text_file.write(str(output_file))


