# Working on PDF Files

### Some Python tips and tricks

In [47]:
import os

# Clear the console that hosts the interpreter. "cls" only exists on Windows;
# POSIX shells use "clear", so choose the command from the running platform.
os.system(command="cls" if os.name == "nt" else "clear")

text: str = "Hello, How are you?\nI am Dariush\tTasdighi."

# Show the same string four ways to compare rendered and escaped output.
print("-" * 50)
print(text)  # str(): the newline and the tab are rendered
print("-" * 50)
print(repr(text))  # repr(): quoted, "\n" and "\t" stay visible as escapes
print("-" * 50)
print([text])  # a list prints its items with repr()
print("-" * 50)
display(text)  # display() is supplied by IPython/Jupyter, not by plain Python
print("-" * 50)

--------------------------------------------------
Hello, How are you?
I am Dariush	Tasdighi.
--------------------------------------------------
'Hello, How are you?\nI am Dariush\tTasdighi.'
--------------------------------------------------
['Hello, How are you?\nI am Dariush\tTasdighi.']
--------------------------------------------------


'Hello, How are you?\nI am Dariush\tTasdighi.'

--------------------------------------------------


In [1]:
import pypdf

# Report which pypdf release is installed.
print(f"pypdf version: {pypdf.__version__}")

pypdf version: 5.3.1


### آدرس‌دهی مطلق / آدرس‌دهی فیزیکی

In [2]:
import os

# Absolute (physical) Windows path; every backslash is escaped as "\\".
pdf_file_path: str = (
    "D:\\Source_Codes\\Learning Python - PDF\\ALL_IN_ONE\\pdf_files\\sample_01.pdf"
)

# Report whether the path points at an existing regular file.
file_found: bool = os.path.isfile(pdf_file_path)
print("File exists." if file_found else "File does not exist!")

File exists.


In [3]:
import os

# The same absolute path written with forward slashes (no escaping needed).
pdf_file_path: str = (
    "D:/Source_Codes/Learning Python - PDF/ALL_IN_ONE/pdf_files/sample_01.pdf"
)

# Handle the "missing" case first, then the normal case.
if not os.path.isfile(pdf_file_path):
    print("File does not exist!")
else:
    print("File exists.")

File exists.


### آدرس‌دهی نسبی، نسبت به جایی که هستیم

- Best Practice

In [4]:
import os

# Relative path: resolved against the current working directory.
pdf_file_path: str = "./pdf_files/sample_01.pdf"

# Pick the message first, then print it once.
status_message: str = "File does not exist!"
if os.path.isfile(pdf_file_path):
    status_message = "File exists."

print(status_message)

File exists.


### Get PDF File Page Count

In [5]:
from pypdf import PdfReader

pdf_file_path: str = "./pdf_files/sample_01.pdf"

# Open the document; the page count is the length of its page list.
reader = PdfReader(stream=pdf_file_path)
page_count = len(reader.pages)

print(f"Number of Pages: {page_count}")

Number of Pages: 179


### Extract All Images from PDF File

- Note: image extraction requires Pillow. Without it, pypdf raises `ImportError: pillow is required to do image extraction. It can be installed via 'pip install pypdf[image]'`.

In [15]:
from pypdf import PdfReader

pdf_file_path: str = "./pdf_files/sample_01.pdf"
reader = PdfReader(stream=pdf_file_path)

separator: str = "-" * 50

# Running number of the images seen so far, across all pages.
total_image_count: int = 0

for page in reader.pages:
    for image in page.images:
        total_image_count += 1

        # One report per image: number, file name, size in bytes (with
        # thousands separators) and the image object's own text form.
        print(
            "",
            separator,
            f"Image ({total_image_count}):",
            "",
            f"Image File Name: {image.name}",
            f"Image File Size (Bytes): {len(image.data):,}",
            image,
            separator,
            sep="\n",
        )

print("Finished")


--------------------------------------------------
Image (1):

Image File Name: R45.jpg
Image File Size (Bytes): 355,882
ImageFile(name=R45.jpg, data: 355.9 kB)
--------------------------------------------------

--------------------------------------------------
Image (2):

Image File Name: R51.jpg
Image File Size (Bytes): 239,187
ImageFile(name=R51.jpg, data: 239.2 kB)
--------------------------------------------------

--------------------------------------------------
Image (3):

Image File Name: R60.jpg
Image File Size (Bytes): 339,962
ImageFile(name=R60.jpg, data: 340.0 kB)
--------------------------------------------------

--------------------------------------------------
Image (4):

Image File Name: R82.jpg
Image File Size (Bytes): 1,801
ImageFile(name=R82.jpg, data: 1.8 kB)
--------------------------------------------------

--------------------------------------------------
Image (5):

Image File Name: R93.jpg
Image File Size (Bytes): 3,181
ImageFile(name=R93.jpg, data: 3.

### Save All Images

In [None]:
import os

from pypdf import PdfReader

target_path: str = "./image_files/sample_01"
# makedirs also creates a missing parent folder ("./image_files") and, with
# exist_ok=True, does nothing when the folder is already there.
os.makedirs(target_path, exist_ok=True)

pdf_file_path: str = "./pdf_files/sample_01.pdf"
reader = PdfReader(stream=pdf_file_path)

for page in reader.pages:
    for image in page.images:
        # NOTE(review): files are keyed by image.name only. If two pages reuse
        # a name, the later image overwrites the earlier one; confirm for
        # this PDF.
        image_file_path: str = f"{target_path}/{image.name}"

        # image.data is the raw image content, so write in binary mode.
        with open(file=image_file_path, mode="wb") as file:
            file.write(image.data)

print("Finished")

Finished


### Save All Images - Fun - With Original Image Names

In [None]:
import os

from pypdf import PdfReader

target_path: str = "./image_files/sample_05_original_names"
# Create the folder together with any missing parent; no error when it exists.
os.makedirs(target_path, exist_ok=True)

pdf_file_path: str = "./pdf_files/sample_05.pdf"
reader = PdfReader(stream=pdf_file_path)

for page in reader.pages:
    for image in page.images:
        # Keep the name that pypdf reports for the image.
        image_file_path: str = f"{target_path}/{image.name}"

        # image.data is the raw image content, so write in binary mode.
        with open(file=image_file_path, mode="wb") as file:
            file.write(image.data)

print("Finished")

Finished


### Save All Images - Fun - With Our Names!

In [None]:
import os

from pypdf import PdfReader

target_path: str = "./image_files/sample_05"
# Create the folder together with any missing parent; no error when it exists.
os.makedirs(target_path, exist_ok=True)

pdf_file_path: str = "./pdf_files/sample_05.pdf"
reader = PdfReader(stream=pdf_file_path)

for page_index, page in enumerate(reader.pages):
    for image_index, image in enumerate(page.images):
        # One-based, zero-padded numbers so the files sort correctly by name.
        # zfill() returns text, hence the str annotations.
        new_page_index: str = str(page_index + 1).zfill(3)
        new_image_index: str = str(image_index + 1).zfill(2)

        # Reuse the extension of the extracted image instead of always
        # claiming ".jpg"; fall back to ".jpg" when the name carries none.
        extension: str = os.path.splitext(image.name)[1] or ".jpg"

        image_file_path: str = (
            f"{target_path}/image_{new_page_index}_{new_image_index}{extension}"
        )

        with open(file=image_file_path, mode="wb") as file:
            file.write(image.data)

print("Finished")

Finished


### Save All Images - Serious PDF File, but with a BUG!

In [None]:
import os

from pypdf import PdfReader

target_path: str = "./image_files/sample_04"
# Create the folder together with any missing parent; no error when it exists.
os.makedirs(target_path, exist_ok=True)

pdf_file_path: str = "./pdf_files/sample_04.pdf"
reader = PdfReader(stream=pdf_file_path)

# For this PDF the recorded run stops with
# "NotImplementedError: Unsupported filter /JBIG2Decode".
# The failure is left visible on purpose; it is the bug this section shows.
for page in reader.pages:
    for image in page.images:
        image_file_path: str = f"{target_path}/{image.name}"
        with open(file=image_file_path, mode="wb") as file:
            file.write(image.data)

print("Finished")

Im0.jpg


NotImplementedError: Unsupported filter /JBIG2Decode

### Save All Images - Serious PDF File - Fix the BUG!

##### Step (1)

In [None]:
from pdf2image import convert_from_path

DPI: int = 200  # Default: 200
POPPLER_PATH: str = "./install/poppler/Library/bin"

pdf_file_path: str = "./pdf_files/sample_04.pdf"

# Convert the PDF with pdf2image; the result is measured with len() below
# to get the page count.
pages = convert_from_path(
    pdf_path=pdf_file_path,
    dpi=DPI,
    poppler_path=POPPLER_PATH,
)

page_count = len(pages)
print(f"Number of Pages: {page_count}")

Number of Pages: 70


##### Step (2)

In [39]:
import os

from pdf2image import convert_from_path

DPI: int = 200  # Default: 200
POPPLER_PATH: str = "./install/poppler/Library/bin"

target_path: str = "./image_files/sample_04"
# Create the folder together with any missing parent; no error when it exists.
os.makedirs(target_path, exist_ok=True)

pdf_file_path: str = "./pdf_files/sample_04.pdf"
pages = convert_from_path(pdf_path=pdf_file_path, dpi=DPI, poppler_path=POPPLER_PATH)

# Save every converted page as "image_0001.jpg", "image_0002.jpg", ...
for page_index, page in enumerate(pages):
    # One-based, zero-padded to four digits so the files sort by name.
    new_page_index: str = str(page_index + 1).zfill(4)
    image_path_name: str = f"{target_path}/image_{new_page_index}.jpg"
    page.save(fp=image_path_name, format="JPEG")

print("Finished")

Finished


### Convert Image to Text (OCR)

In [40]:
import PIL

# Report which Pillow (PIL) release is installed.
print(f"PIL version: {PIL.__version__}")

PIL version: 11.1.0


In [41]:
import pytesseract

# Label the value the same way the pypdf and PIL version cells do.
print("pytesseract version:", pytesseract.__version__)

0.3.13


In [42]:
import pytesseract

# Tell pytesseract where the Tesseract executable is (Windows path).
TESSERACT_EXECUTABLE: str = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
pytesseract.pytesseract.tesseract_cmd = TESSERACT_EXECUTABLE

# Ask for the available language codes.
# 'fas' is for Persian (Farsi)!
languages = pytesseract.get_languages()
print(languages)

['eng', 'fas', 'osd']


In [44]:
import pytesseract
from PIL import Image

# Tell pytesseract where the Tesseract executable is (Windows path).
pytesseract.pytesseract.tesseract_cmd = (
    "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
)

filename: str = "image_0004.jpg"
path: str = "./image_files/sample_04"
image_file_path: str = f"{path}/{filename}"

# Open the image in a with-block so its file handle is released as soon as
# the preview and the OCR are done.
with Image.open(fp=image_file_path) as image:
    image.show()

    # lang="fas" selects the Persian (Farsi) language data.
    text = pytesseract.image_to_string(image=image, lang="fas")

# Print the recognized text as rendered, then as its escaped repr().
print("-" * 50)
print(text)
print("-" * 50)
print(repr(text))
print("-" * 50)

--------------------------------------------------
7

کیش سرمایه‌داری

پول در بنا نهادن امپراتوری‌ها و همچنین در پیشرفت علم نقش اساسی
داشته است. اما آیا پول هدف نهایی این کارهاست یا شاید فقط ضرورتی
است آسیبزا؟

درک نقش حقیقی اقتصاد در تاریخ معاصر آسان نیست. در مورد اينکه
پول چگونه سرمنشاً شکلگیری دولت‌ها و ویرانی‌شان شد. افق‌های نوینی
گشود. میلیون‌ها نفر را به بردگی کشید. چرخ‌های صنعت را به‌گردش
درآورد و صدها گونه از موجودات را به انقراض کشاند کتاب‌ها نوشته
شده است. اما برای درک تاریخ اقتصاد مدرن ناگزیر از درک تتها یک واژه
هستیم: رشد. اقتصاد مدرن. خوب یا بد» در بیماری و سلامت همچون
نوجوانی آ کنده از هورمون رشد کرده است. هر چه را بیابد می‌بلعد و سریع‌تر

در طول بخش اعظم تاریخ, اقتصاد تقریباً رشد نداشت. تولید جهانی
افزایش می‌یافت. اما این افزایش بیشتر به دلیل افزايش جمعیت و استقرار
در سرزمینهای جدید بود. ولی تولید سرانه ثابت ماند. در عصر مدرن همه
چیز تغییر کرد. در سال ۰۱۵۰۰ تولید جهانی کالاها و خدمات معادل
۰ میلبارد دلار بود؛ امروز در حدود ۶۰تریلیون دلار است. مهم‌تر این که

-------------

### Best Online OCR

- Google Gemini 2.0 Flash
    - https://aistudio.google.com/prompts/new_chat

![My Image](./image_files/image.jpg "My Image")

##### System Prompt

<div dir="rtl">
به عنوان یک OCR حرفه‌ای، متن این عکس را استخراج کن و برام بنویس. در صورتی که نتوانستی کلمه‌ای را به درستی تشخیص بدهی، با توجه به متن صفحه، بهترین کلمه‌ای که مناسب می‌باشد را بنویس. متن باید به صورت پیوسته نوشته شود و صرفا زمانی که به پاراگراف جدیدی برخورد کردی از Carriage  Return استفاده کن. باید آئین نگارش را به درستی رعایت کنی، حتی اگر در متن رعایت نشده باشد. باید تمام نیم‌فاصله‌ها را در کلمات رعایت کنی، حتی اگر در متن رعایت نشده باشد.

</div>

##### AI Result

<div dir="rtl">
ناشناخته‌های اجتماع انسانی می‌گشت. او ابعاد مختلف روح، فکر و آگاهی‌های بشری را هم‌پای سیر در سرزمین‌های مختلف، درنوردید. وی در پایان سفرهایش به آمریکا برمی‌گردد و مجدداً در هاروارد به تحصیل می‌پردازد و در رشتهٔ پزشکی موفق به کسب دکترا می‌شود. کرایکتون از سال ۱۳۵۰ به‌طور جدی به نویسندگی رو می‌آورد و اینک یکی از موفق‌ترین نویسندگان آمریکاست.

دربارهٔ کتاب
یکی از بزرگ‌ترین حوادث تاریخ بشریت، جهش اعراب مسلمان از بیابان‌های برهوت مکه و حجاز و فتح دنیای قدیم است. امت عرب به‌رغم کمبود نفرات و ضعیف بودن تجهیزات جنگی در مدتی کوتاه به مبارزه با دو قدرت بزرگ تاریخ، یعنی روم و ایران برخاست و در کمتر از نیم قرن با تسلط بر سرزمین‌های این دو دولت بزرگ، یکی از قوی‌ترین امپراتوری‌های تاریخ بشر را بنا کرد. اسلام که با نعمت مسامحه و آزادی در امر دین وارد کشورها می‌شد و آزادی عقیده و مرام را اعلام می‌کرد، توانست گروهی بی‌شمار مردم را به این دین الهی جذب کند. به دلیل اهمیتی که اسلام و رهبران برجستهٔ آن به کسب علم و دانش می‌دادند و دست عالمان و جویندگان علم باز می‌گذاشتند، بزرگان و متفکرانی در زمینه‌های علم و ادب و هنر در میان مسلمانان برخاستند و توانستند با قدرت دانش و فرهنگ که مؤثرترین سلاح است، در میان جوامع بشری نفوذ کنند و فرهنگ و تمدن بی‌نظیر اسلامی را در جهان بنا کنند.

گسترش اسلام در قرن‌های بعدی که از دورترین سرزمین‌های شرقی تا مرزهای غربی اروپا و شمال و مرکز آفریقا را در بر می‌گرفت، مسلمانان را نیازمند دانش شناخت فرهنگ‌ها، آداب و رسوم و جغرافیای محیط کرد تا بتوانند قوانین و مقررات مناسب برای ادارهٔ امور این سرزمین‌ها وضع کنند. محیط امن و نزدیک شدن ملل مختلف زیر لوای یک دین، عالمان و دانشمندان را به سیر و سفر و جهانگردی برانگیخت و از طرف دیگر حاکمان وقت با اعزام سفیران و فرستاده‌ها به اطراف و اکناف جهان دست به جمع‌آوری اطلاعات دربارهٔ سرزمین‌های همجوار خود زدند تا در جهت تثبیت حاکمیت خود استفاده کنند.
</div>

---

### Read the first page content

In [None]:
from pypdf import PdfReader

pdf_file_path: str = "./pdf_files/sample_01.pdf"

reader = PdfReader(stream=pdf_file_path)

page_count = len(reader.pages)

# Humans count pages from 1; Python sequences are indexed from 0.
my_favorite_page_index: int = 1

# Check both ends of the one-based range. Without the lower bound, 0 or a
# negative number becomes a negative index and silently reads a page counted
# from the end of the document.
if 1 <= my_favorite_page_index <= page_count:
    page = reader.pages[my_favorite_page_index - 1]
    text = page.extract_text()
    print(text)
else:
    print(f"Page {my_favorite_page_index} is out of range (1..{page_count}).")




### Read the fifth page content

In [8]:
from pypdf import PdfReader

pdf_file_path: str = "./pdf_files/sample_01.pdf"

reader = PdfReader(stream=pdf_file_path)

page_count = len(reader.pages)

# One-based page number, as a human reader would say it.
my_favorite_page_index: int = 5

# Check both ends of the one-based range. Without the lower bound, 0 or a
# negative number becomes a negative index and silently reads a page counted
# from the end of the document.
if 1 <= my_favorite_page_index <= page_count:
    page = reader.pages[my_favorite_page_index - 1]
    text = page.extract_text()
    print(text)
else:
    print(f"Page {my_favorite_page_index} is out of range (1..{page_count}).")

 
Begin Reading
Table of Contents
About the Author
Copyright Page
 
Thank you for buying this
Flatiron Books ebook.
 
To receive special offers, bonus content,
and info on new releases and other great reads,
sign up for our newsletters.
 
Or visit us online at
us.macmillan.com/newslettersignup
 
For email updates on the author, click here.


### Read the first story (main) page content

In [9]:
from pypdf import PdfReader

pdf_file_path: str = "./pdf_files/sample_01.pdf"

reader = PdfReader(stream=pdf_file_path)

page_count = len(reader.pages)

# One-based page number of the first story page.
my_favorite_page_index: int = 10

# Check both ends of the one-based range. Without the lower bound, 0 or a
# negative number becomes a negative index and silently reads a page counted
# from the end of the document.
if 1 <= my_favorite_page_index <= page_count:
    page = reader.pages[my_favorite_page_index - 1]
    text = page.extract_text()
    print(text)
else:
    print(f"Page {my_favorite_page_index} is out of range (1..{page_count}).")

 
THE DOOR THAT WASN’T THERE
There was once a rich merchant who lived at the edge of the woods, in a
tiny town in the Hinterland. Though he spent most of his days traveling, he
was at home long enough to give his wife two daughters, the eldest dark
and the youngest golden, born one year apart.
Their father was distant and their mother was strange, often shutting
herself up in her room for hours. Her daughters could hear her speaking to
someone when they pressed their ears to the door, but only the eldest, Anya,
ever made out an answer. The voice she heard was so thin and rustling, she
could almost believe it was leaves against the window.
On a winter’s day when Anya was sixteen, their mother locked her door
and did not open it again. After three days the servants broke it down, and
found—an empty room. The windows were shut, winter howled outside,
and the woman was gone. But she’d left something behind: on the floor, in
a puddle of blood, a bone dagger.
Anya heard the servants whisperi

### Read (Again) the first story (main) page content

In [11]:
from pypdf import PdfReader

pdf_file_path: str = "./pdf_files/sample_01.pdf"

reader = PdfReader(stream=pdf_file_path)

page_count = len(reader.pages)

# One-based page number of the first story page.
my_favorite_page_index: int = 10

# Check both ends of the one-based range. Without the lower bound, 0 or a
# negative number becomes a negative index and silently reads a page counted
# from the end of the document.
if 1 <= my_favorite_page_index <= page_count:
    page = reader.pages[my_favorite_page_index - 1]
    text = page.extract_text()

    # repr() exposes the line breaks and other invisible characters that
    # print(text) would simply render.
    print(repr(text))
else:
    print(f"Page {my_favorite_page_index} is out of range (1..{page_count}).")

'\xa0\nTHE DOOR THAT WASN’T THERE\nThere was once a rich merchant who lived at the edge of the woods, in a\ntiny town in the Hinterland. Though he spent most of his days traveling, he\nwas at home long enough to give his wife two daughters, the eldest dark\nand the youngest golden, born one year apart.\nTheir father was distant and their mother was strange, often shutting\nherself up in her room for hours. Her daughters could hear her speaking to\nsomeone when they pressed their ears to the door, but only the eldest, Anya,\never made out an answer. The voice she heard was so thin and rustling, she\ncould almost believe it was leaves against the window.\nOn a winter’s day when Anya was sixteen, their mother locked her door\nand did not open it again. After three days the servants broke it down, and\nfound—an empty room. The windows were shut, winter howled outside,\nand the woman was gone. But she’d left something behind: on the floor, in\na puddle of blood, a bone dagger.\nAnya heard t

### Read All Pages

In [12]:
from pypdf import PdfReader

pdf_file_path: str = "./pdf_files/sample_01.pdf"

reader = PdfReader(stream=pdf_file_path)

# enumerate() already hands over each page object, so the page is not looked
# up a second time through reader.pages[page_index].
for page_index, page in enumerate(reader.pages):
    text = page.extract_text()

    print()
    print("=" * 50)
    print(f"Page ({page_index + 1}):")  # shown one-based for humans
    print("-" * 50)
    print(text)
    print("=" * 50)


Page (1):
--------------------------------------------------


Page (2):
--------------------------------------------------


Page (3):
--------------------------------------------------


Page (4):
--------------------------------------------------


Page (5):
--------------------------------------------------
 
Begin Reading
Table of Contents
About the Author
Copyright Page
 
Thank you for buying this
Flatiron Books ebook.
 
To receive special offers, bonus content,
and info on new releases and other great reads,
sign up for our newsletters.
 
Or visit us online at
us.macmillan.com/newslettersignup
 
For email updates on the author, click here.

Page (6):
--------------------------------------------------
 
The author and publisher have provided this e-book to you for your
personal use only. You may not make this e-book publicly available in any
way. Copyright infringement is against the law. If you believe the copy
of this e-book you are reading infringes on the author’s copyright,

### Read Some Pages (Range)

In [13]:
from pypdf import PdfReader

pdf_file_path: str = "./pdf_files/sample_01.pdf"

reader = PdfReader(stream=pdf_file_path)

# One-based, inclusive page range.
my_favorite_pages_begin_index: int = 5
my_favorite_pages_end_index: int = 10

# range() is zero-based with an exclusive stop, so only the start moves by one.
zero_based_indexes = range(
    my_favorite_pages_begin_index - 1,
    my_favorite_pages_end_index,
)

for page_index in zero_based_indexes:
    page = reader.pages[page_index]
    text = page.extract_text()

    # Framed report: blank line, header with the one-based number, page text.
    print(
        "",
        "=" * 50,
        f"Page ({page_index + 1}):",
        "-" * 50,
        text,
        "=" * 50,
        sep="\n",
    )


Page (5):
--------------------------------------------------
 
Begin Reading
Table of Contents
About the Author
Copyright Page
 
Thank you for buying this
Flatiron Books ebook.
 
To receive special offers, bonus content,
and info on new releases and other great reads,
sign up for our newsletters.
 
Or visit us online at
us.macmillan.com/newslettersignup
 
For email updates on the author, click here.

Page (6):
--------------------------------------------------
 
The author and publisher have provided this e-book to you for your
personal use only. You may not make this e-book publicly available in any
way. Copyright infringement is against the law. If you believe the copy
of this e-book you are reading infringes on the author’s copyright,
please notify the publisher at: 
us.macmillanusa.com/piracy.

Page (7):
--------------------------------------------------
To all the readers whose first language was fairy tales


Page (8):
--------------------------------------------------


Page (9

### Read again just page 10

##### Step (1)

In [49]:
from pypdf import PdfReader

pdf_file_path: str = "./pdf_files/sample_01.pdf"

reader = PdfReader(stream=pdf_file_path)

# Begin and end are equal, so exactly one page (page 10) is read.
my_favorite_pages_begin_index: int = 10
my_favorite_pages_end_index: int = 10

heavy_rule: str = "=" * 50
light_rule: str = "-" * 50

for page_index in range(my_favorite_pages_begin_index - 1, my_favorite_pages_end_index):
    page = reader.pages[page_index]
    text = page.extract_text()

    # The text is shown rendered first, then as repr() so the line breaks
    # inside it become visible.
    print(
        "",
        heavy_rule,
        f"Page ({page_index + 1}):",
        light_rule,
        text,
        light_rule,
        repr(text),
        heavy_rule,
        sep="\n",
    )


Page (10):
--------------------------------------------------
 
THE DOOR THAT WASN’T THERE
There was once a rich merchant who lived at the edge of the woods, in a
tiny town in the Hinterland. Though he spent most of his days traveling, he
was at home long enough to give his wife two daughters, the eldest dark
and the youngest golden, born one year apart.
Their father was distant and their mother was strange, often shutting
herself up in her room for hours. Her daughters could hear her speaking to
someone when they pressed their ears to the door, but only the eldest, Anya,
ever made out an answer. The voice she heard was so thin and rustling, she
could almost believe it was leaves against the window.
On a winter’s day when Anya was sixteen, their mother locked her door
and did not open it again. After three days the servants broke it down, and
found—an empty room. The windows were shut, winter howled outside,
and the woman was gone. But she’d left something behind: on the floor, in
a p

##### Step (2)

In [None]:
from pypdf import PdfReader

pdf_file_path: str = "./pdf_files/sample_01.pdf"

reader = PdfReader(stream=pdf_file_path)
my_favorite_pages_begin_index: int = 10
my_favorite_pages_end_index: int = 10

for page_index in range(my_favorite_pages_begin_index - 1, my_favorite_pages_end_index):
    page = reader.pages[page_index]

    original_text: str = page.extract_text().strip()

    # A line break right after ".", "!" or "?" is treated as a paragraph end.
    # Tag those breaks so they survive the flattening below.
    text: str = original_text
    for sentence_end in (".", "!", "?"):
        text = text.replace(f"{sentence_end}\n", f"{sentence_end}[NEW_LINE]")

    # Every remaining line break is only a wrapped line: join with a space.
    text = text.replace("\n", " ")

    # Turn the tags into blank-line paragraph separators.
    text = text.replace("[NEW_LINE]", "\n\n")

    rule: str = "-" * 50
    print(rule, text, rule, repr(original_text), rule, sep="\n")

--------------------------------------------------
THE DOOR THAT WASN’T THERE There was once a rich merchant who lived at the edge of the woods, in a tiny town in the Hinterland. Though he spent most of his days traveling, he was at home long enough to give his wife two daughters, the eldest dark and the youngest golden, born one year apart.

Their father was distant and their mother was strange, often shutting herself up in her room for hours. Her daughters could hear her speaking to someone when they pressed their ears to the door, but only the eldest, Anya, ever made out an answer. The voice she heard was so thin and rustling, she could almost believe it was leaves against the window.

On a winter’s day when Anya was sixteen, their mother locked her door and did not open it again. After three days the servants broke it down, and found—an empty room. The windows were shut, winter howled outside, and the woman was gone. But she’d left something behind: on the floor, in a puddle of blo

##### Step (3)

In [55]:
from pypdf import PdfReader


# NEW
def fix_paragraph(text: str) -> str:
    """
    Normalize the whitespace of one paragraph.

    Leading and trailing whitespace is removed, and every run of whitespace
    inside the text (spaces, tabs, line breaks, non-breaking spaces U+00A0
    as they appear in the extracted PDF text) becomes one ordinary space.

    Args:
        text: Raw paragraph text; ``None`` or an empty string is accepted.

    Returns:
        The cleaned paragraph, or "" when there is nothing but whitespace.
    """

    if not text:
        return ""

    # str.split() without arguments splits on any Unicode whitespace and
    # drops empty pieces, so joining with " " trims and collapses in one go.
    return " ".join(text.split())


pdf_file_path: str = "./pdf_files/sample_01.pdf"

reader = PdfReader(stream=pdf_file_path)
my_favorite_pages_begin_index: int = 10
my_favorite_pages_end_index: int = 10

# NEW
all_paragraphs: list[str] = []

for page_index in range(my_favorite_pages_begin_index - 1, my_favorite_pages_end_index):
    page = reader.pages[page_index]

    original_text: str = page.extract_text().strip()

    # Tag line breaks that follow ".", "!" or "?" as paragraph ends, then
    # flatten every other line break into a space.
    text: str = original_text
    for sentence_end in (".", "!", "?"):
        text = text.replace(f"{sentence_end}\n", f"{sentence_end}[NEW_LINE]")

    text = text.replace("\n", " ")

    # NEW
    page_paragraphs: list[str] = text.split("[NEW_LINE]")

    # NEW
    # Clean each piece and keep only the ones that still have content.
    cleaned_paragraphs = (fix_paragraph(text=piece) for piece in page_paragraphs)
    all_paragraphs.extend(piece for piece in cleaned_paragraphs if piece)

print("-" * 50)
for paragraph in all_paragraphs:
    # Indented paragraph followed by one empty line.
    print(f"     {paragraph}\n")
print("-" * 50)

--------------------------------------------------
     THE DOOR THAT WASN’T THERE There was once a rich merchant who lived at the edge of the woods, in a tiny town in the Hinterland. Though he spent most of his days traveling, he was at home long enough to give his wife two daughters, the eldest dark and the youngest golden, born one year apart.

     Their father was distant and their mother was strange, often shutting herself up in her room for hours. Her daughters could hear her speaking to someone when they pressed their ears to the door, but only the eldest, Anya, ever made out an answer. The voice she heard was so thin and rustling, she could almost believe it was leaves against the window.

     On a winter’s day when Anya was sixteen, their mother locked her door and did not open it again. After three days the servants broke it down, and found—an empty room. The windows were shut, winter howled outside, and the woman was gone. But she’d left something behind: on the floor, in 

##### Step (4)

In [58]:
from pypdf import PdfReader


# NEW
def fix_paragraph(text: str) -> str:
    """
    Normalize the whitespace of one paragraph.

    Leading and trailing whitespace is removed, and every run of whitespace
    inside the text (spaces, tabs, line breaks, non-breaking spaces U+00A0
    as they appear in the extracted PDF text) becomes one ordinary space.

    Args:
        text: Raw paragraph text; ``None`` or an empty string is accepted.

    Returns:
        The cleaned paragraph, or "" when there is nothing but whitespace.
    """

    if not text:
        return ""

    # str.split() without arguments splits on any Unicode whitespace and
    # drops empty pieces, so joining with " " trims and collapses in one go.
    return " ".join(text.split())


pdf_file_path: str = "./pdf_files/sample_01.pdf"

reader = PdfReader(stream=pdf_file_path)

# NEW
# One-based, inclusive range: several pages this time.
my_favorite_pages_begin_index: int = 10
my_favorite_pages_end_index: int = 16

all_paragraphs: list[str] = []

for page_index in range(my_favorite_pages_begin_index - 1, my_favorite_pages_end_index):
    page = reader.pages[page_index]

    original_text: str = page.extract_text().strip()

    # Chain the replacements: tag paragraph-ending line breaks (those right
    # after ".", "!" or "?"), then turn every other line break into a space.
    text: str = (
        original_text.replace(".\n", ".[NEW_LINE]")
        .replace("!\n", "![NEW_LINE]")
        .replace("?\n", "?[NEW_LINE]")
        .replace("\n", " ")
    )

    # Collect every non-empty, cleaned paragraph of this page.
    for raw_paragraph in text.split("[NEW_LINE]"):
        if cleaned_paragraph := fix_paragraph(text=raw_paragraph):
            all_paragraphs.append(cleaned_paragraph)

rule: str = "-" * 50

print(rule)
for paragraph in all_paragraphs:
    # Indented paragraph followed by one empty line.
    print(f"     {paragraph}\n")
print(rule)

--------------------------------------------------
     THE DOOR THAT WASN’T THERE There was once a rich merchant who lived at the edge of the woods, in a tiny town in the Hinterland. Though he spent most of his days traveling, he was at home long enough to give his wife two daughters, the eldest dark and the youngest golden, born one year apart.

     Their father was distant and their mother was strange, often shutting herself up in her room for hours. Her daughters could hear her speaking to someone when they pressed their ears to the door, but only the eldest, Anya, ever made out an answer. The voice she heard was so thin and rustling, she could almost believe it was leaves against the window.

     On a winter’s day when Anya was sixteen, their mother locked her door and did not open it again. After three days the servants broke it down, and found—an empty room. The windows were shut, winter howled outside, and the woman was gone. But she’d left something behind: on the floor, in 

##### Step (5)

In [59]:
from pypdf import PdfReader


def fix_paragraph(text: str) -> str:
    """
    Normalize the whitespace of one paragraph.

    Leading and trailing whitespace is removed, and every run of whitespace
    inside the text (spaces, tabs, line breaks, non-breaking spaces U+00A0
    as they appear in the extracted PDF text) becomes one ordinary space.

    Args:
        text: Raw paragraph text; ``None`` or an empty string is accepted.

    Returns:
        The cleaned paragraph, or "" when there is nothing but whitespace.
    """

    if not text:
        return ""

    # str.split() without arguments splits on any Unicode whitespace and
    # drops empty pieces, so joining with " " trims and collapses in one go.
    return " ".join(text.split())


pdf_file_path: str = "./pdf_files/sample_01.pdf"

reader = PdfReader(stream=pdf_file_path)

my_favorite_pages_begin_index: int = 10
my_favorite_pages_end_index: int = 16

# Characters that count as the end of a sentence.
SENTENCE_ENDINGS: tuple[str, ...] = (".", "!", "?")

# NEW
last_paragraph: str = ""
all_paragraphs: list[str] = []

for page_index in range(my_favorite_pages_begin_index - 1, my_favorite_pages_end_index):
    page = reader.pages[page_index]

    original_text: str = page.extract_text().strip()

    # Tag line breaks that follow a sentence ending as paragraph ends, then
    # flatten every other line break into a space.
    text: str = original_text
    for sentence_end in SENTENCE_ENDINGS:
        text = text.replace(f"{sentence_end}\n", f"{sentence_end}[NEW_LINE]")

    text = text.replace("\n", " ")

    page_paragraphs: list[str] = text.split("[NEW_LINE]")

    for page_paragraph in page_paragraphs:
        page_paragraph = fix_paragraph(text=page_paragraph)
        if not page_paragraph:
            continue

        # If the previous paragraph does not end a sentence, it was presumably
        # cut off (for example by a page break): append this piece to it
        # instead of starting a new paragraph.
        if last_paragraph and not last_paragraph.endswith(SENTENCE_ENDINGS):
            all_paragraphs[-1] = f"{last_paragraph} {page_paragraph}"
        else:
            all_paragraphs.append(page_paragraph)

        last_paragraph = all_paragraphs[-1]

rule: str = "-" * 50

print(rule)
for paragraph in all_paragraphs:
    # Indented paragraph followed by one empty line.
    print(f"     {paragraph}\n")
print(rule)

--------------------------------------------------
     THE DOOR THAT WASN’T THERE There was once a rich merchant who lived at the edge of the woods, in a tiny town in the Hinterland. Though he spent most of his days traveling, he was at home long enough to give his wife two daughters, the eldest dark and the youngest golden, born one year apart.

     Their father was distant and their mother was strange, often shutting herself up in her room for hours. Her daughters could hear her speaking to someone when they pressed their ears to the door, but only the eldest, Anya, ever made out an answer. The voice she heard was so thin and rustling, she could almost believe it was leaves against the window.

     On a winter’s day when Anya was sixteen, their mother locked her door and did not open it again. After three days the servants broke it down, and found—an empty room. The windows were shut, winter howled outside, and the woman was gone. But she’d left something behind: on the floor, in 

##### Step (6) - Save to File

In [60]:
from pypdf import PdfReader


def fix_paragraph(text: str) -> str:
    """
    Normalize the whitespace of one paragraph.

    Leading and trailing whitespace is removed, and every run of whitespace
    inside the text (spaces, tabs, line breaks, non-breaking spaces U+00A0
    as they appear in the extracted PDF text) becomes one ordinary space.

    Args:
        text: Raw paragraph text; ``None`` or an empty string is accepted.

    Returns:
        The cleaned paragraph, or "" when there is nothing but whitespace.
    """

    if not text:
        return ""

    # str.split() without arguments splits on any Unicode whitespace and
    # drops empty pieces, so joining with " " trims and collapses in one go.
    return " ".join(text.split())


import os

pdf_file_path: str = "./pdf_files/sample_01.pdf"

reader = PdfReader(stream=pdf_file_path)

# One-based, inclusive page range.
my_favorite_pages_begin_index: int = 10
my_favorite_pages_end_index: int = 16

last_paragraph: str = ""
all_paragraphs: list[str] = []

for page_index in range(my_favorite_pages_begin_index - 1, my_favorite_pages_end_index):
    page = reader.pages[page_index]

    original_text: str = page.extract_text().strip()

    # Tag line breaks that follow ".", "!" or "?" as paragraph ends, then
    # flatten every other line break into a space.
    text: str = original_text
    text = text.replace(".\n", ".[NEW_LINE]")
    text = text.replace("!\n", "![NEW_LINE]")
    text = text.replace("?\n", "?[NEW_LINE]")

    text = text.replace("\n", " ")

    page_paragraphs: list[str] = text.split("[NEW_LINE]")

    for page_paragraph in page_paragraphs:
        page_paragraph = fix_paragraph(text=page_paragraph)
        if page_paragraph:
            # If the previous paragraph does not end a sentence, it was
            # presumably cut off (for example by a page break): append this
            # piece to it instead of starting a new paragraph.
            if last_paragraph and last_paragraph[-1] not in [".", "!", "?"]:
                all_paragraphs[-1] = f"{last_paragraph} {page_paragraph}"
            else:
                all_paragraphs.append(page_paragraph)

            last_paragraph = all_paragraphs[-1]

# NEW
target_file_path: str = "./txt_files/sample_01.txt"

# open() does not create folders: make sure "./txt_files" exists first.
os.makedirs(os.path.dirname(target_file_path), exist_ok=True)

# Write the paragraphs as UTF-8 text, each followed by one empty line.
with open(file=target_file_path, mode="wt", encoding="utf-8") as file:
    for paragraph in all_paragraphs:
        file.write(f"{paragraph}")
        file.write("\n\n")

print("Finished.")

Finished.
