In [None]:
TODO:
complete final exercise at bottom of notebook

# python-docx

#### python-docx is a Python library for creating and updating Microsoft Word (.docx) files.

In [313]:
# `display`/`HTML` live in the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Embed the python-docx documentation in the notebook for quick reference.
python_docx_url = 'https://python-docx.readthedocs.io/en/latest/'
iframe = '<iframe src={} width=1100 height=300></iframe>'.format(python_docx_url)
HTML(iframe)

In [326]:
import os
import configparser

# Machine-specific locations are kept in config.ini so that no absolute
# paths are hard-coded in the notebook.
CONFIG_FILE = 'config.ini'

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the
# list of files it actually parsed; check it so a missing config fails with
# a clear message instead of a KeyError on the first section lookup.
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(
        'Could not read {!r} (working directory: {})'.format(CONFIG_FILE, os.getcwd())
    )

RAW_DATA = config['USER']['RAW_DATA']
DOC_PATH = config['DOCX']['DOC_PATH']

# Full paths to the two input files, both located under RAW_DATA.
DOCX_PATH = os.path.join(RAW_DATA, DOC_PATH)
XML_PATH = os.path.join(RAW_DATA, config['DOCX']['XML_PATH'])

In [328]:
# docx.Document() opens the Word .docx file at DOCX_PATH and returns a
# document object; later cells read its paragraphs, tables and core properties

import docx
doc = docx.Document(DOCX_PATH)

### Paragraphs

Word paragraphs contain the text of the document. However, text in tables, headers, and footers is not included in the paragraphs.

In [290]:
# get all paragraphs 
paragraphs = doc.paragraphs

In [291]:
# count all paragraphs in the document
len(paragraphs)

2675

In [292]:
# keep only the paragraphs that still contain text after stripping whitespace
paragraphs = [para for para in paragraphs if para.text.strip()]

### Style

In [127]:
# view the text in the first paragraph
paragraphs[0].text

'ORACLE CORP'

In [276]:
# get the paragraph style
paragraphs[0].style.name

'Normal'

In [330]:
# Identify whether the paragraph uses a heading style.
# Style names are not guaranteed to be upper case (the sample XML later in
# this notebook shows the style as "Heading1"), so compare case-insensitively.

'HEADING' in paragraphs[0].style.name.upper()

False

### Runs

Each paragraph may contain one or more runs. A run is a stretch of text within a paragraph that shares the same style. Every time the style changes (e.g. from bold to normal text), a new run is added.

In [210]:
runs = paragraphs[0].runs
runs

[<docx.text.run.Run at 0x1580291d668>,
 <docx.text.run.Run at 0x1580291d908>,
 <docx.text.run.Run at 0x1580291d860>,
 <docx.text.run.Run at 0x1580291d828>]

In [255]:
# each run contains a portion of text from the paragraph
run = runs[0]
run.text

'ORACLE'

### Run style

- Each run contains style information such as bold, italic, or underline. 
- The style information will be True, False, or None
- A value of None indicates the run has no directly-applied style value and so will inherit the value of its containing paragraph.

In [256]:
print(run.bold)

True


In [257]:
print(run.italic)

None


In [258]:
print(run.underline)

None


In [259]:
# font size of the run, in points
# NOTE(review): presumably font.size is None when no size is applied directly
# to the run, in which case .pt would raise AttributeError — confirm
run.font.size.pt

28.0

In [236]:
# View all the runs in the paragraph

In [237]:
[run.text for run in runs]

['ORACLE', ' ', 'CORP', '']

In [238]:
# View the bold setting of every run (True, False, or None when not set directly)
[run.bold for run in runs]

[True, True, True, None]

## Exercise

#### Find all the bold runs

- Iterate through all the runs in all the paragraphs to identify any run with a bold style applied.
- Store all the bold text in a list named bold_text
- Do not include empty strings (e.g. '') 
- Replace all tabs with a space
- Strip the text
- print the first 10 items in bold_text

In [277]:
# Collect the cleaned text of every run whose bold setting is truthy.
bold_text = []
for paragraph in paragraphs:
    for run in paragraph.runs:
        # skip runs that are not bold or that hold only whitespace
        if not run.bold or run.text.strip() == '':
            continue
        # tabs become spaces, then surrounding whitespace is removed
        text = run.text.replace('\t', ' ').strip()
        bold_text.append(text)

bold_text[0:10]

['ORACLE',
 'CORP',
 'FORM',
 '10-K',
 'UNITED',
 'STATES',
 'SECURITIES',
 'AND',
 'EXCHANGE COMMISSION',
 'FORM']

### Create a function to determine if all runs in a paragraph are bold

- Name the function is_bold
- Return True if all runs (with text) in a paragraph are bold
- Test the function by adding all the bold paragraphs to a list named bold_paragraphs
- Print the first 10 paragraphs in bold_paragraphs

In [279]:
# create the function is_bold
def is_bold(paragraph):
    """Return True when every run that carries text is bold.

    Args:
        paragraph: an object exposing ``runs``; each run must provide
            ``text`` (str) and ``bold`` (True, False or None).

    Returns:
        bool: True only if the paragraph has at least one run with
        non-whitespace text and every such run has a truthy ``bold``.
        A ``bold`` of None (no directly-applied value) counts as not bold.
    """
    # Ignore runs that are empty or whitespace-only: a separator run such as
    # ' ' without direct bold formatting should not disqualify a paragraph
    # whose visible text is entirely bold.
    bold_flags = [run.bold for run in paragraph.runs if run.text.strip()]

    # all([]) is True, so explicitly require at least one text-bearing run.
    return bool(bold_flags) and all(bold_flags)

In [280]:
# exercise is_bold: keep the text of each paragraph it reports as bold
bold_paragraphs = [paragraph.text for paragraph in paragraphs if is_bold(paragraph)]

bold_paragraphs[0:10]

['ORACLE CORP',
 'FORM 10-K',
 'UNITED STATES',
 'SECURITIES AND EXCHANGE COMMISSION',
 'FORM 10-K',
 'TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934',
 'For the transition period from\tto  \t Commission file number: 001-35992',
 'Oracle Corporation',
 '(Exact name of registrant as specified in its charter)',
 '(State or other jurisdiction of incorporation or organization)']

### Tables

In [113]:
# identify all document tables
tables = doc.tables

In [101]:
# count the document tables
len(tables)

74

In [281]:
# Gather the non-empty text of every cell in one table.
# NOTE(review): the original cell read a `table` variable that no cell in
# this notebook defines; the first table is assumed here — confirm.
table = tables[0]

# Use the public rows/cells API rather than the private `_cells` attribute,
# and strip before filtering so whitespace-only cells are dropped as well.
cell_texts = (cell.text.strip() for row in table.rows for cell in row.cells)
table_cells = [text for text in cell_texts if text]

table_cells[0:10]

['TABLE OF CONTENTS',
 'Page',
 'PART I.',
 'Item 1.',
 'Business',
 '3',
 'Item 1A.',
 'Risk Factors',
 '19',
 'Item 1B.']

### Core Properties

In [302]:
doc.core_properties.title

'ORACLE CORP'

In [301]:
doc.core_properties.subject

'FORM 10-K (Annual Report) Filed 06/25/15 for the Period Ending 05/31/15'

In [298]:
doc.core_properties.author

'EDGAR Online, Inc.'

In [308]:
doc.core_properties.created

datetime.datetime(2017, 10, 9, 1, 40, 51)

In [305]:
doc.core_properties.revision

0

## Explore docx xml
Every Word document is a zip archive of XML files. To test this, change the extension of any Word file from .docx to .zip and open it.

Inside each zip, a directory named word contains document.xml. This file contains all of the xml for the word document.

To open the zip we use the package zipfile

In [329]:
import zipfile

# Read the raw bytes of word/document.xml from the archive at XML_PATH.
# The context manager closes the archive as soon as the bytes are read, and
# the handle is no longer named `zip`, which shadowed the builtin zip().
with zipfile.ZipFile(XML_PATH, 'r') as docx_archive:
    xml_content = docx_archive.read('word/document.xml')

### zipfile

- ZipFile - the class for reading and writing ZIP files
- read - returns the contents of a file in the archive as bytes

In [314]:
# `display`/`HTML` live in the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Embed the zipfile documentation (ZipFile objects) for reference; link to
# the Python 3 docs, since this notebook runs on Python 3.
zipfile_docs_url = 'https://docs.python.org/3/library/zipfile.html#zipfile-objects'
iframe = '<iframe src={} width=1100 height=300></iframe>'.format(zipfile_docs_url)
HTML(iframe)

In [204]:
from bs4 import BeautifulSoup

# Parse the bytes of word/document.xml so its tags can be searched.
# NOTE(review): 'lxml' selects BeautifulSoup's HTML parser; the tag and
# attribute names in the output below are all lower case (e.g. w:ppr),
# presumably because of that — confirm whether the 'xml' parser is wanted.
b = BeautifulSoup(xml_content, 'lxml')

In [209]:
# view the xml from a short document with one heading and one sentence:
# print each child of <w:body>, followed by a blank line
for word in b.find('w:body'):
    print(word, end='\n\n')

<w:p w:rsidp="00A96863" w:rsidr="007F6AD8" w:rsidrdefault="00A96863"><w:ppr><w:pstyle w:val="Heading1"></w:pstyle></w:ppr><w:r><w:t>Section Header</w:t></w:r></w:p>

<w:p w:rsidr="00A96863" w:rsidrdefault="00A96863"><w:r><w:t>Text in the section</w:t></w:r><w:bookmarkstart w:id="0" w:name="_GoBack"></w:bookmarkstart><w:bookmarkend w:id="0"></w:bookmarkend></w:p>

<w:sectpr w:rsidr="00A96863"><w:pgsz w:h="15840" w:w="12240"></w:pgsz><w:pgmar w:bottom="1440" w:footer="720" w:gutter="0" w:header="720" w:left="1440" w:right="1440" w:top="1440"></w:pgmar><w:cols w:space="720"></w:cols><w:docgrid w:linepitch="360"></w:docgrid></w:sectpr>



### docx XML tag definitions
- < w:body > - contains the document paragraphs
- < w:p > - Document paragraph
- < w:pstyle > - Paragraph style (e.g. Heading 1)
- < w:t > text in a paragraph or run
- < w:bookmarkstart > defines a bookmark, such as a link in a table of contents
- < w:r > - Document runs. Every time the style in a paragraph changes, for instance a bold or underline term, a new run is added. Each paragraph may contain multiple runs.


### Exercise

Identify which documents in the provided zip (INSERT_ZIP_NAME) talk about SOME_REGULATION.