### Extracting data from MS Word files

- Create a sample file

In [None]:
# Install python-docx if you haven't done it already.
# I am not including tyhe output of this code due to space constraints. 
!pip install python-docx

- The script below creates a sample Word document containing a title, multiple paragraphs, bullet points, and a table.

In [5]:
from docx import Document
from docx.shared import Pt

# Build a small demo .docx: a heading, two paragraphs, three bullets, and a 3x2 table.

# Start from an empty document and use a level-1 heading as its title.
doc = Document()
doc.add_heading('Sample Document For NLP Analysis', level=1)

# Plain body paragraphs, added in order.
for body_text in (
    'This sample Word demo document contains multiple paragraphs.',
    'Here is another paragraph with more text.',
):
    doc.add_paragraph(body_text)

# Bullet items, each added as a paragraph with the 'List Bullet' style.
for bullet_text in (
    'Sample bullet 1',
    'Sample bullet 2',
    'Sample bullet 3',
):
    doc.add_paragraph(bullet_text, style='List Bullet')

# Table contents, one tuple per row; the first tuple is the header row.
# NOTE(review): both data rows are labelled 'Sample Row 1' in the first column.
# Presumably the second was meant to be 'Sample Row 2'; kept unchanged here.
table_contents = (
    ('Header Sample 1', 'Header Sample 2'),
    ('Sample Row 1', 'Data Point 1'),
    ('Sample Row 1', 'Data Point 2'),
)

# Create the grid-styled table, then fill it cell by cell from the tuples above.
table = doc.add_table(rows=len(table_contents), cols=2)
table.style = 'Table Grid'
for table_row, row_values in zip(table.rows, table_contents):
    for cell, value in zip(table_row.cells, row_values):
        cell.text = value

# Write the finished document to the current working directory.
doc.save('NLP_demo_sample_file.docx')

- The script below loads this Word document.
- It then extracts and prints various elements of the document, such as the title, paragraphs, bullet points, and table data.

In [7]:
from docx import Document

# Re-open the document saved by the previous cell and read its paragraphs once.
doc = Document('NLP_demo_sample_file.docx')
paragraphs = doc.paragraphs
paragraph_texts = [p.text for p in paragraphs]

# The title is taken to be the very first paragraph (the heading counts as a paragraph).
print(f"Title: {paragraph_texts[0]}")

# Every paragraph in document order: heading, body text, and bullet items.
print("\nParagraphs:")
for text in paragraph_texts:
    print(text)

# Bullet items are the paragraphs whose style is named 'List Bullet'.
print("\nBullet Points:")
bullet_texts = [p.text for p in paragraphs if p.style.name == 'List Bullet']
for text in bullet_texts:
    print(text)

# Tables: one printed line per row, each cell's text followed by ' | '.
print("\nTable Data:")
for table in doc.tables:
    for row in table.rows:
        print(''.join(f"{cell.text} | " for cell in row.cells))

Title: Sample Document For NLP Analysis

Paragraphs:
Sample Document For NLP Analysis
This sample Word demo document contains multiple paragraphs.
Here is another paragraph with more text.
Sample bullet 1
Sample bullet 2
Sample bullet 3

Bullet Points:
Sample bullet 1
Sample bullet 2
Sample bullet 3

Table Data:
Header Sample 1 | Header Sample 2 | 
Sample Row 1 | Data Point 1 | 
Sample Row 1 | Data Point 2 | 
