# PDF to DOCX Converter

## This script converts a PDF into a DOCX. Each page of the PDF is rendered to a JPG and then inserted into a new Word document.

Libraries used:
- [python-docx](https://pypi.org/project/python-docx/)
- [PyPDF2](https://pypi.org/project/PyPDF2/)
- [pdf2image](https://pypi.org/project/pdf2image/)

Binaries required:
- Install Poppler (Windows builds: https://github.com/oschwartz10612/poppler-windows/releases/) and make sure its binaries are on the PATH,
- or install it with conda: `conda install -c conda-forge poppler`

In [None]:
!!pip install python-docx
!!pip install PyPDF2
!!pip install pdf2image
# Download and install Poppler (PDF tools) https://github.com/oschwartz10612/poppler-windows/releases/
# or install via conda
!!conda install -c conda-forge poppler

In [None]:
def pdf2jpeg_poppler(pdf_input_path, dpi=200):
    """Render every page of a PDF to a JPEG file in the system temp directory.

    Args:
        pdf_input_path: Path of the PDF file to render.
        dpi: Resolution handed to pdf2image when rasterising the pages.

    Returns:
        List of the JPEG file paths that were written, in page order.
    """
    from pdf2image import convert_from_path
    import os
    import tempfile

    num_pages = get_PDF_pagecount(pdf_input_path)
    print("PDF has %d pages" % num_pages)

    fname_jpg_pattern = 'tmp_pdf_to_jpg_page_%d.jpg'
    temp_dir = tempfile.gettempdir()
    temp_fn_lst = []

    # Forward the requested resolution; the parameter used to be accepted
    # but silently ignored.
    pages = convert_from_path(pdf_input_path, dpi=dpi)

    # Page numbers in the file names are 1-based.
    for i, page in enumerate(pages, start=1):
        path = os.path.join(temp_dir, fname_jpg_pattern % i)
        temp_fn_lst.append(path)
        page.save(path, 'JPEG')
        print("Saved screenshot of page %d to tempfile %s" % (i, path))

    return temp_fn_lst

def is_pdf_landscape(input_pdf_fname):
    """Return True when the first page of the PDF is wider than it is tall.

    Only the media box of the first page is inspected.

    Args:
        input_pdf_fname: Path of the PDF file to inspect.

    Returns:
        True if the first page's width exceeds its height, otherwise False.
    """
    # The camelCase API (PdfFileReader, getPage, mediaBox.getUpperRight_x, ...)
    # was removed in PyPDF2 3.0; PdfReader and its snake_case API replace it.
    from PyPDF2 import PdfReader

    with open(input_pdf_fname, 'rb') as f:
        box = PdfReader(f).pages[0].mediabox
        width = float(box.right) - float(box.left)
        height = float(box.top) - float(box.bottom)
        return width > height
    
def get_PDF_pagecount(input_pdf_fname):
    """Return the number of pages in the PDF file.

    Args:
        input_pdf_fname: Path of the PDF file to inspect.

    Returns:
        The page count as an int.
    """
    # PdfFileReader/getNumPages were removed in PyPDF2 3.0; use PdfReader.
    from PyPDF2 import PdfReader

    with open(input_pdf_fname, 'rb') as f:
        return len(PdfReader(f).pages)

def create_docx_from_jpgs(fn_lst, output_docx_fname, fLandscapeMode=False):
    """Build a DOCX file containing one picture per entry of *fn_lst*.

    Args:
        fn_lst: Paths of the JPEG files to insert, in document order.
        output_docx_fname: Path the DOCX file is written to.
        fLandscapeMode: When True, every section is switched to landscape
            orientation and the pictures are inserted 10 inches wide
            instead of 7.5 inches.

    Raises:
        Any exception raised while inserting a picture or saving the file.
        If inserting fails, the pictures added so far are still saved to
        *output_docx_fname* before the exception propagates.
    """
    from docx import Document
    from docx.shared import Inches
    from docx.enum.section import WD_ORIENT

    margin = 0.5
    # Picture widths in inches; presumably chosen to fill a Letter page
    # inside the 0.5" margins -- confirm if a different template is used.
    picture_width = Inches(10) if fLandscapeMode else Inches(7.5)

    document = Document()

    for section in document.sections:
        if fLandscapeMode:
            print('Formatting landscape orientation')
            section.orientation = WD_ORIENT.LANDSCAPE
            # Setting the orientation alone does not resize the page, so
            # swap width and height explicitly.
            section.page_width, section.page_height = (
                section.page_height,
                section.page_width,
            )

        section.top_margin = Inches(margin)
        section.bottom_margin = Inches(margin)
        section.left_margin = Inches(margin)
        section.right_margin = Inches(margin)

    try:
        for jpg_fname in fn_lst:
            document.add_picture(jpg_fname, width=picture_width)
    finally:
        # Save once, after all pictures were added. On failure this keeps
        # the partial document (what the old per-picture save and the
        # broken `except Error` handler were trying to achieve).
        document.save(output_docx_fname)

    print("DOCX saved to: %s" % output_docx_fname)

def convert_pdf_to_DOCX(input_pdf_fname, output_docx_fname):
    """Convert a PDF file into a DOCX file made of page images.

    Each page is rendered to a temporary JPEG, the JPEGs are inserted into
    a new DOCX document, and the temporary files are removed afterwards.

    Args:
        input_pdf_fname: Path of the PDF file to convert.
        output_docx_fname: Path the DOCX file is written to.
    """
    import os

    temp_fn_lst = pdf2jpeg_poppler(input_pdf_fname)

    try:
        fLandscapeMode = is_pdf_landscape(input_pdf_fname)
        create_docx_from_jpgs(temp_fn_lst, output_docx_fname, fLandscapeMode)
    finally:
        # cleanup screenshots, even when building the DOCX failed
        for fname in temp_fn_lst:
            try:
                os.remove(fname)
            except FileNotFoundError:
                # Already gone; nothing left to clean up for this page.
                pass
        

##  Create download link for converted DOCX

In [3]:
from IPython.display import display, Markdown, clear_output

import ipywidgets as widgets
from ipywidgets import Layout
# Input widgets for the converter UI.

# Layout for the URL box: automatic width, fixed 40px height.
layout = Layout(width='auto', height='40px')

# Text field holding the URL of the PDF to convert, pre-filled with a sample.
# NOTE(review): `display` is normally a Layout property; confirm it has any
# effect when passed to Text directly.
text_box = widgets.Text(
    description='URL to PDF',
    value='http://www.grancorporation.com/Ellipse_Perimeter_Rev-.pdf',
    layout=layout,
    display='flex',
)

def download_link(file_name):
    """Show a link to *file_name* in the notebook output."""
    from IPython.display import FileLink

    link = FileLink(file_name)
    display(link)
    
def clicked(arg):
    """Button callback: download the PDF named in ``text_box`` and convert it.

    The PDF is downloaded to ``local-download-pdf.pdf`` in the working
    directory, converted with ``convert_pdf_to_DOCX`` and a download link
    to the resulting DOCX file is displayed.

    Args:
        arg: Argument supplied by the button's click handler; unused.
    """
    from urllib.parse import unquote, urlparse
    import os
    import urllib.request

    url_to_pdf = text_box.value.strip()

    # Name the DOCX after the last component of the URL path. Percent-escapes
    # are decoded *before* taking the basename so an encoded "/" cannot
    # smuggle directory components into the output file name.
    pdf_name = os.path.basename(unquote(urlparse(url_to_pdf).path))
    stem, ext = os.path.splitext(pdf_name)
    if ext.lower() not in ('.pdf', '.docx'):
        # Unknown or missing extension: keep the whole name.
        stem = pdf_name
    # Fall back to a fixed name when the URL has no usable path component.
    output_docx_fname = (stem or 'converted') + '.docx'  # you can override this

    # download PDF and convert it
    input_pdf_fname = 'local-download-pdf.pdf'

    urllib.request.urlretrieve(url_to_pdf, input_pdf_fname)

    convert_pdf_to_DOCX(input_pdf_fname, output_docx_fname)
    print('Download link below')
    download_link(output_docx_fname)

# Wire the "Convert" button to the conversion callback and show the UI:
# the URL box first, the button below it.
button_convert = widgets.Button(description='Convert')
button_convert.on_click(clicked)
display(text_box, button_convert)

Text(value='http://www.grancorporation.com/Ellipse_Perimeter_Rev-.pdf', description='URL to PDF', layout=Layou…

Button(description='Convert', style=ButtonStyle())