# Read MS Word Documents

This notebook reads Microsoft Word (`.docx`) documents with Python and the `python-docx` library, and extracts their text, images, and tables.

### Install the Required Libraries

In [None]:
# Install python-docx (Word file parsing) and pandas (tables as DataFrames) into the kernel's environment.
%pip install python-docx pandas


### Python Imports

In [13]:
import os
import sys

# Make the sibling `code` directory importable (used by the local imports below).
# The path is built with os.path.join so it resolves on POSIX as well as
# Windows; on Windows this yields exactly the previous value, '..\\code'.
sys.path.append(os.path.join('..', 'code'))

import os
from dotenv import load_dotenv
load_dotenv()

from docx import Document
import pandas as pd
from docx.shared import Inches
from docx.image.image import Image

from IPython.display import display, Markdown, HTML
from PIL import Image
from doc_utils import *
from utils.bcolors import bcolors as bc  


def show_img(img_path, width = None):
    """Display an image in the notebook output.

    Args:
        img_path: Path of the image file to show.
        width: Optional width for the rendered image. When given, the image
            is shown through an HTML ``<img>`` tag that references
            ``img_path``; when omitted, the file is opened with
            ``Image.open`` and the resulting object is displayed.
    """
    if width is None:
        display(Image.open(img_path))
        return

    img_tag = f'<img src="{img_path}" width={width}>'
    display(HTML(img_tag))

### Code Definitions

Define the function that reads a `.docx` file and returns three things, in this order: the text as a single newline-joined string, the tables as a list of pandas DataFrames, and the images as a list of file paths to the extracted image files.

In [15]:

def docx_extract(doc_path, images_folder='extracted_images'):
    """Extract paragraph text, tables, and embedded images from a .docx file.

    Args:
        doc_path: Path handed to ``Document`` to open the Word file.
        images_folder: Directory that receives the extracted image files.
            It is created if it does not exist yet.

    Returns:
        A ``(concatenated_text, tables, images)`` tuple where

        * ``concatenated_text`` is the text of every paragraph in
          ``doc.paragraphs`` joined with newlines,
        * ``tables`` holds one ``pandas.DataFrame`` per entry in
          ``doc.tables``; the first row supplies the column names and the
          remaining rows the data. A table without any rows yields an
          empty DataFrame so positions still line up with ``doc.tables``,
        * ``images`` lists the paths of the image files written to
          ``images_folder``, in relationship order.
    """
    doc = Document(doc_path)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(images_folder, exist_ok=True)

    # Extract all text
    all_text = [para.text for para in doc.paragraphs]

    # Extract all tables
    tables = []
    for table in doc.tables:
        rows = list(table.rows)
        if not rows:
            # No header row to read; indexing rows[0] would raise IndexError.
            tables.append(pd.DataFrame())
            continue
        headers = [cell.text for cell in rows[0].cells]
        data = [[cell.text for cell in row.cells] for row in rows[1:]]
        tables.append(pd.DataFrame(data, columns=headers))

    # Extract all images
    images = []
    for rel in doc.part.rels.values():
        # External relationships (e.g. a hyperlink whose URL happens to
        # contain "image") have no target part; accessing target_part on
        # them raises, so they are skipped.
        if rel.is_external or "image" not in rel.target_ref:
            continue
        # NOTE: files are always named *.jpg even if the stored image uses
        # another format; the naming is kept so existing callers that rely
        # on these paths continue to work.
        image_path = os.path.join(images_folder, f'image{len(images)}.jpg')
        with open(image_path, 'wb') as img_file:
            img_file.write(rel.target_part.blob)
        images.append(image_path)

    return '\n'.join(all_text), tables, images


In [16]:
# Run the extraction on the sample brochure.
doc_path = 'sample_data/1_London_Brochure.docx'
concatenated_text, tables, images = docx_extract(doc_path)

# Show a coloured preview of the first 100 characters of text, then report
# how many tables and images were extracted.
print(
    f"{bc.OKBLUE}Text Snippet:\n{bc.OKGREEN}{concatenated_text[:100]}{bc.ENDC}\n\n"
)
print(
    f"Extracted {len(tables)} tables.",
    f"Extracted {len(images)} images.",
    sep="\n",
)

[94mText Snippet:
[92mMargie’s Travel Presents…
London
London is the capital and most populous city of England and the Uni[0m


Extracted 1 tables.
Extracted 2 images.


In [20]:
# Display the first extracted image through the HTML <img> branch of show_img, at width 500.
show_img(images[0], width=500)

In [21]:
# Preview the first rows of the first extracted table; as the cell's last expression it is rendered as output.
tables[0].head()

Unnamed: 0,Category,Information
0,Country,United Kingdom
1,Capital Of,England
2,Currency,Pound Sterling (GBP)
3,Population (2021 census),Approximately 8.8 million
4,Famous For,"Historical landmarks, museums, cultural diversity"
