In [18]:
import fitz
import re

In [77]:
# Sample PDFs used for experimentation, keyed by example id.
examples = {
    "pdf_path1": "../data/mcelreath_2020_statistical-rethinking.pdf",
    "pdf_path2": "../data/Theory of Statistic.pdf",
    "pdf_path3": "../data/Deep Learning with Python.pdf",
    "pdf_path4": "../data/Natural_Image_Statistics.pdf",
    "pdf_path5": "../data/mml-book.pdf",
}

# Page indices to read for each example. They are used directly as
# `doc[index]`, so they are document indices rather than printed page numbers.
content_page_ranges = {
    "pdf_path1": range(5, 8),
    "pdf_path2": range(10, 17),
    "pdf_path3": range(7, 13),
    "pdf_path4": range(4, 13),
    "pdf_path5": range(2, 5),
}

# Which example to work on; later cells read `key` as well.
n_example = 5
key = f"pdf_path{n_example}"

# Open the selected PDF (left open: `doc` stays available to later cells).
doc = fitz.open(examples[key])

# Plain-text dump of every selected page, one list entry per page.
chapters_content_list = [
    doc[page_index].get_text("text")
    for page_index in content_page_ranges[key]
]

# Single string with pages separated by a newline.
chapters_content = "\n".join(chapters_content_list)

print(chapters_content)  # or pass it to your model

Contents
Foreword
1
Part I
Mathematical Foundations
9
1
Introduction and Motivation
11
1.1
Finding Words for Intuitions
12
1.2
Two Ways to Read This Book
13
1.3
Exercises and Feedback
16
2
Linear Algebra
17
2.1
Systems of Linear Equations
19
2.2
Matrices
22
2.3
Solving Systems of Linear Equations
27
2.4
Vector Spaces
35
2.5
Linear Independence
40
2.6
Basis and Rank
44
2.7
Linear Mappings
48
2.8
Afﬁne Spaces
61
2.9
Further Reading
63
Exercises
64
3
Analytic Geometry
70
3.1
Norms
71
3.2
Inner Products
72
3.3
Lengths and Distances
75
3.4
Angles and Orthogonality
76
3.5
Orthonormal Basis
78
3.6
Orthogonal Complement
79
3.7
Inner Product of Functions
80
3.8
Orthogonal Projections
81
3.9
Rotations
91
3.10
Further Reading
94
Exercises
96
4
Matrix Decompositions
98
4.1
Determinant and Trace
99
i
This material will be published by Cambridge University Press as Mathematics for Machine Learn-
ing by Marc Peter Deisenroth, A. Aldo Faisal, and Cheng Soon Ong. This pre-publication version is
free to

In [83]:

def extract_font_info(pdf_path, header_margin=70, footer_margin=100, page_range=None):
    """
    Collect the text spans of selected PDF pages together with their position.

    Spans whose origin lies in the header band (y < header_margin) or in the
    footer band (y > page height - footer_margin) are skipped.

    Args:
        pdf_path: Path of the PDF file to open.
        header_margin: Height of the band at the top of each page to ignore,
            in page coordinate units.
        footer_margin: Height of the band at the bottom of each page to ignore,
            in page coordinate units.
        page_range: Iterable of page indices (as accepted by
            ``doc.load_page``) to scan. When omitted, the notebook-level
            selection ``content_page_ranges[key]`` is used, which keeps
            existing calls working unchanged.

    Returns:
        A list of dicts, one per kept span, in reading order of the extraction:
        ``{"text": str, "page": int (index + 1), "coordinates": (x, y)}``.
    """
    if page_range is None:
        # Backward-compatible fallback to the notebook globals. Pass
        # `page_range` explicitly to avoid depending on hidden kernel state.
        page_range = content_page_ranges[key]

    font_data = []

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page_num in page_range:
            page = doc.load_page(page_num)
            footer_start = page.rect.height - footer_margin

            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry carry no text spans.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        y = span["origin"][1]
                        # Skip running headers / footers.
                        if y < header_margin or y > footer_start:
                            continue
                        # Other span fields (span["font"], span["size"],
                        # span["color"]) are intentionally not collected yet.
                        font_data.append({
                            "text": span["text"],
                            "page": page_num + 1,
                            "coordinates": (span["origin"][0], span["origin"][1]),
                        })
    return font_data


def extract_lines_from_font_info(font_info, y_tolerance=0.0):
    """
    Group consecutive text spans into lines of text.

    Two consecutive spans belong to the same line when they are on the same
    page and their y-coordinates differ by at most ``y_tolerance``. Spans of a
    line are joined with a single space and the result is stripped; lines
    that are empty after stripping are dropped.

    Args:
        font_info: Sequence of dicts with a ``"text"`` string and a
            ``"coordinates"`` pair ``(x, y)``. An optional ``"page"`` entry is
            used to keep lines from different pages apart; spans without it
            are treated as being on the same page.
        y_tolerance: Maximum absolute difference in y between a span and the
            previous one for both to count as the same line. The default of
            0.0 requires exactly equal y-coordinates.

    Returns:
        List of line strings in input order; ``[]`` for empty input.
    """
    if not font_info:
        return []

    lines = []
    parts = []        # span texts of the line currently being assembled
    previous = None   # (page, y) of the previously seen span

    for element in font_info:
        position = (element.get("page"), element["coordinates"][1])

        starts_new_line = previous is not None and (
            position[0] != previous[0]                       # page break
            or abs(position[1] - previous[1]) > y_tolerance  # different baseline
        )
        if starts_new_line:
            text = " ".join(parts).strip()
            if text:
                lines.append(text)
            parts = []

        parts.append(element["text"])
        previous = position

    # Flush the line that was still open when the input ended.
    text = " ".join(parts).strip()
    if text:
        lines.append(text)

    return lines


class TextCleaner:
    """
    Two-stage cleaner for extracted text lines.

    Stage 1 (`filter_lines`) drops whole lines that match one of the
    line-level patterns; stage 2 (`filter_noise`) deletes noise sequences
    inside the remaining lines and strips them.
    """

    def __init__(self):
        # Compiled once per instance; keys are looked up by the methods below.
        self.patterns = {
            # Line starts with a dotted number such as "1.2"
            'numbered_lines': re.compile(r'^\d+\.\d+\b'),
            # Line consists only of non-word characters / underscores
            'symbol_only': re.compile(r'^[\W_]+$'),
            # Copyright marker anywhere in the line
            'copyright_pattern': re.compile(r'(©|ⓒ|\(c\)|\(C\)|c\s*⃝)', re.IGNORECASE),
            # "Exercise(s)" optionally followed by digits / punctuation only
            'exercises_pattern': re.compile(r'^\s*Exercises?\b[\s\d.:!?-]*$', re.IGNORECASE),
            # Run of 3+ dots / whitespace not adjacent to a word character
            'dotted_noise': re.compile(r'(?<!\w)([.\s]){3,}(?!\w)'),  
            # Run of 3+ non-word characters not adjacent to a word character
            'symbol_noise': re.compile(r'(?<!\w)([\W]\s?){3,}(?!\w)')
            }

    def _is_unwanted(self, text):
        """Return True when `text` matches any of the line-level drop patterns."""
        rules = self.patterns
        return bool(
            rules['numbered_lines'].match(text)
            or rules['symbol_only'].match(text)
            or rules['copyright_pattern'].search(text)
            or rules['exercises_pattern'].match(text)
        )

    def filter_lines(self, lines):
        """Drop unwanted lines; surviving lines are returned unmodified, in order."""
        kept = []
        for raw in lines:
            # Patterns are tested against the stripped text, but the
            # original (unstripped) line is what gets kept.
            if self._is_unwanted(raw.strip()):
                continue
            kept.append(raw)
        return kept

    def filter_noise(self, lines):
        """Delete standalone noise sequences from each line and strip the result."""
        dotted = self.patterns['dotted_noise']
        symbols = self.patterns['symbol_noise']
        # Dotted runs are removed first, then generic symbol runs.
        return [symbols.sub('', dotted.sub('', raw)).strip() for raw in lines]
    
    def process(self, lines):
        """Run the full pipeline: line filtering followed by noise removal."""
        return self.filter_noise(self.filter_lines(lines))

In [84]:
# Full pipeline on the selected example: positioned spans -> text lines -> cleaned lines.
font_info = extract_font_info(examples[key])
lines = extract_lines_from_font_info(font_info)

cleaner = TextCleaner()
processed_lines = cleaner.process(lines)

# One cleaned line per output row; nothing is printed when the result is empty.
if processed_lines:
    print("\n".join(processed_lines))

Contents
Foreword 1
Part I Mathematical Foundations 9
1 Introduction and Motivation 11
2 Linear Algebra 17
3 Analytic Geometry 70
4 Matrix Decompositions 98
i
This material will be published by Cambridge University Press as  Mathematics for Machine Learn-
ii Contents
5 Vector Calculus 139
6 Probability and Distributions 172
7 Continuous Optimization 225
Part II Central Machine Learning Problems 249
8 When Models Meet Data 251
Draft (2019-12-11) of “Mathematics for Machine Learning”. Feedback:  https://mml-book.com .
Contents iii
9 Linear Regression 289
10 Dimensionality Reduction with Principal Component Analysis 317
11 Density Estimation with Gaussian Mixture Models 348
12 Classiﬁcation with Support Vector Machines 370
References 395
Index 407
c
⃝ 2019 M. P. Deisenroth, A. A. Faisal, C. S. Ong. To be published by Cambridge University Press.
