In [5]:
import os
import json
import glob
import fitz  # PyMuPDF

def extract_pages(input_path, output_path, start_page, end_page):
    """
    Extracts a range of pages from a PDF and saves them to a new file.

    Args:
        input_path (str): Path to the source PDF.
        output_path (str): Path to save the new PDF.
        start_page (int): The first page to include (1-based index).
        end_page (int): The last page to include (1-based index, inclusive).
    """
    try:
        src_doc = fitz.open(input_path)
        out_doc = fitz.open()
        total_pages = len(src_doc)
        
        if start_page < 1 or start_page > total_pages:
            print(f"Error: Start page {start_page} is out of range (1-{total_pages}).")
            return
            
        if end_page < start_page or end_page > total_pages:
            print(f"Error: End page {end_page} is invalid.")
            return
        # insert_pdf copies pages from src_doc to out_doc.
        # Note: from_page and to_page are inclusive and now 0-based.
        out_doc.insert_pdf(src_doc, from_page=start_page - 1, to_page=end_page - 1)
        out_doc.save(output_path)
        print(f"Successfully created '{output_path}' with pages {start_page} to {end_page}.")

    except Exception as e:
        print(f"An error occurred: {e}")

    finally:
        if 'src_doc' in locals(): src_doc.close()
        if 'out_doc' in locals(): out_doc.close()

In [2]:
with open("./data/Elixir_in_Action_Third_Edition.json", "r") as f:
    ElixirInAction = json.loads(f.read())

In [3]:
for part, chapters in ElixirInAction["text"].items():
    chapter = ElixirInAction["text"][part]["chapters"]
    for chapter_no, chapter_details in chapter.items():
        start_page, end_page = chapter_details["pages"]
        
        input_pdf_path = "./data/Elixir_in_Action_Third_Edition.pdf"
        output_pdf_path = f'./data/PDF/{part.upper()}/Chapter {chapter_no}.pdf'
        os.makedirs(os.path.dirname(output_pdf_path), exist_ok=True)
        extract_pages(input_pdf_path, output_pdf_path, start_page, end_page)
    
    

Successfully created './data/PDF/INTRODUCTION/Chapter 0.pdf' with pages 1 to 25.
Successfully created './data/PDF/PART_1/Chapter 1.pdf' with pages 26 to 43.
Successfully created './data/PDF/PART_1/Chapter 2.pdf' with pages 44 to 93.
Successfully created './data/PDF/PART_1/Chapter 3.pdf' with pages 94 to 136.
Successfully created './data/PDF/PART_1/Chapter 4.pdf' with pages 137 to 163.
Successfully created './data/PDF/PART_2/Chapter 5.pdf' with pages 164 to 197.
Successfully created './data/PDF/PART_2/Chapter 6.pdf' with pages 198 to 216.
Successfully created './data/PDF/PART_2/Chapter 7.pdf' with pages 217 to 239.
Successfully created './data/PDF/PART_2/Chapter 8.pdf' with pages 240 to 263.
Successfully created './data/PDF/PART_2/Chapter 9.pdf' with pages 264 to 291.
Successfully created './data/PDF/PART_2/Chapter 10.pdf' with pages 292 to 319.
Successfully created './data/PDF/PART_3/Chapter 11.pdf' with pages 320 to 349.
Successfully created './data/PDF/PART_3/Chapter 12.pdf' with pag

In [None]:
pdf_dir = "./data/PDF/"

pattern = os.path.join(pdf_dir, '**', '*.pdf')
pdf_files = glob.glob(pattern, recursive=True)

for pdf in pdf_files:
    src_doc = fitz.open(pdf)
    full_text = ""
    for page in src_doc:
        full_text += page.get_text()
    src_doc.close()
    txt_path = pdf.replace("PDF", "TXT", 1).replace(".pdf", ".txt", 1)
    os.makedirs(os.path.dirname(txt_path), exist_ok=True)
    with open(txt_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(full_text)
    print(f"Extracted text from '{pdf}' to '{txt_path}'.")


Extracted text from './data/PDF/INTRODUCTION/Chapter 0.pdf' to './data/TXT/INTRODUCTION/Chapter 0.txt'.
Extracted text from './data/PDF/PART_2/Chapter 7.pdf' to './data/TXT/PART_2/Chapter 7.txt'.
Extracted text from './data/PDF/PART_2/Chapter 6.pdf' to './data/TXT/PART_2/Chapter 6.txt'.
Extracted text from './data/PDF/PART_2/Chapter 5.pdf' to './data/TXT/PART_2/Chapter 5.txt'.
Extracted text from './data/PDF/PART_2/Chapter 10.pdf' to './data/TXT/PART_2/Chapter 10.txt'.
Extracted text from './data/PDF/PART_2/Chapter 8.pdf' to './data/TXT/PART_2/Chapter 8.txt'.
Extracted text from './data/PDF/PART_2/Chapter 9.pdf' to './data/TXT/PART_2/Chapter 9.txt'.
Extracted text from './data/PDF/PART_3/Chapter 11.pdf' to './data/TXT/PART_3/Chapter 11.txt'.
Extracted text from './data/PDF/PART_3/Chapter 12.pdf' to './data/TXT/PART_3/Chapter 12.txt'.
Extracted text from './data/PDF/PART_3/Chapter 13.pdf' to './data/TXT/PART_3/Chapter 13.txt'.
Extracted text from './data/PDF/INDEX/Chapter 14.pdf' to './

In [None]:
# Test DB Connection Works

import psycopg2
from psycopg2 import OperationalError

def check_db_connection():
    try:
        # Connect using the credentials from your docker-compose
        connection = psycopg2.connect(
            database="elixir_devcontainer_dev",
            user="postgres",
            password="postgres",
            host="db",
            port="5432"
        )
        
        print("✅ Success! Connected to the database.")
        
        # Optional: Print database version to verify interaction
        cursor = connection.cursor()
        cursor.execute("SELECT version();")
        record = cursor.fetchone()
        print(f"   Server version: {record[0]}")
        
        cursor.close()
        connection.close()
        return True

    except OperationalError as e:
        print("❌ Connection failed.")
        print(f"   Error: {e}")
        return False

check_db_connection()

✅ Success! Connected to the database.
   Server version: PostgreSQL 16.11 on aarch64-unknown-linux-musl, compiled by gcc (Alpine 15.2.0) 15.2.0, 64-bit


True