In [18]:
import fitz
import re

In [None]:
# Dictionary mapping example keys to PDF paths (relative to the notebook directory)
examples = {
    "pdf_path1": "../data/mcelreath_2020_statistical-rethinking.pdf",
    "pdf_path2": "../data/Theory of Statistic.pdf",
    "pdf_path3": "../data/Deep Learning with Python.pdf",
    "pdf_path4": "../data/Natural_Image_Statistics.pdf",
    "pdf_path5": "../data/mml-book.pdf"
}

# Dictionary mapping example keys to page ranges to extract content from.
# The values are used directly as indices into `doc` below; `range` excludes
# its end value, so range(5, 8) reads indices 5, 6 and 7.
content_page_ranges = {
    "pdf_path1": range(5, 8),
    "pdf_path2": range(10, 17),
    "pdf_path3": range(7, 13),
    "pdf_path4": range(4, 13),
    "pdf_path5": range(2, 5),
}

# Select example number; `key` is a notebook-level global that
# extract_font_info() also reads to pick its page range.
n_example = 4
key = f"pdf_path{n_example}"

# Open the PDF
# NOTE(review): `doc` is never closed in this cell.
doc = fitz.open(examples[key])

# Extract plain text from each page of the selected range, one string per page
chapters_content_list = []
for page_num in content_page_ranges[key]:
    page = doc[page_num]
    text = page.get_text("text")
    chapters_content_list.append(text)

# Join all text pages into a single string if needed
chapters_content = "\n".join(chapters_content_list)

print(chapters_content)  # or pass it to your model

Contents
1
Introduction . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
1
1.1
What this book is all about . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
1
1.2
What is vision? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
3
1.3
The magic of your visual system . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
3
1.4
Importance of prior information . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
6
1.4.1
Ecological adaptation provides prior information . . . . . . . . . .
6
1.4.2
Generative models and latent quantities . . . . . . . . . . . . . . . . . .
8
1.4.3
Projection onto the retina loses information . . . . . . . . . . . . . .
9
1.4.4
Bayesian inference and priors . . . . . . . . . . . . . . . . . . . . . . . . . . 10
1.5
Natural images . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 11
1.5.1
The image s

In [None]:

def extract_font_info(pdf_path, header_margin=70, footer_margin=100, page_numbers=None):
    """
    Collect the text spans of selected PDF pages, skipping header/footer bands.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        header_margin: Spans whose baseline y is smaller than this value
            (distance from the top of the page) are skipped.
        footer_margin: Spans whose baseline y is larger than
            ``page_height - footer_margin`` are skipped.
        page_numbers: Iterable of page indices to read. When omitted, the
            notebook globals ``content_page_ranges[key]`` are used, which is
            what this function always did before the parameter existed.
            Pass it explicitly whenever ``pdf_path`` is not the PDF selected
            by the global ``key``.

    Returns:
        A list of dicts, one per kept span, with the keys ``"text"``,
        ``"page"`` (page index + 1) and ``"coordinates"`` (the span origin
        as an ``(x, y)`` tuple), in reading order as reported by PyMuPDF.
    """
    if page_numbers is None:
        # Backward-compatible fallback to the notebook-level selection.
        page_numbers = content_page_ranges[key]

    font_data = []
    doc = fitz.open(pdf_path)
    try:
        for page_num in page_numbers:
            page = doc.load_page(page_num)
            page_height = page.rect.height
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                # Only blocks that carry a "lines" entry contain text spans.
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        x, y = span["origin"][0], span["origin"][1]
                        # Skip headers/footers using the configured margins
                        if y < header_margin or y > (page_height - footer_margin):
                            continue
                        font_data.append({
                            "text": span["text"],
                            # Optional span attributes, currently not collected:
                            # "font_name": span["font"],
                            # "font_size": span["size"],
                            # "color": span["color"],
                            # "is_bold": "bold" in span["font"].lower(),
                            # "is_italic": "italic" in span["font"].lower(),
                            "page": page_num + 1,
                            "coordinates": (x, y)
                        })
    finally:
        # Release the file handle even if a page fails to load or parse.
        doc.close()
    return font_data


def extract_lines_from_font_info(font_info, y_tolerance=0.0):
    """
    Group consecutive text spans into lines based on their y-coordinates.

    Consecutive elements belong to the same line when they are on the same
    page and their y-coordinates differ by at most ``y_tolerance``. The texts
    of one line are joined with a single space and the result is stripped;
    lines that end up empty are dropped.

    Args:
        font_info: List of dicts as produced by ``extract_font_info``; each
            needs ``"text"`` and ``"coordinates"`` (``(x, y)``). A ``"page"``
            entry is optional; when present, a page change always starts a
            new line, even if the y-coordinate happens to be identical.
        y_tolerance: Maximum absolute y difference between two consecutive
            spans of the same line. The default ``0.0`` requires exactly
            equal coordinates.

    Returns:
        List of line strings in input order.
    """
    if not font_info:
        return []

    lines = []
    parts = []          # texts collected for the line being built
    prev_y = None
    prev_page = None

    def flush():
        # Emit the current line unless it is blank after stripping.
        joined = " ".join(parts).strip()
        if joined:
            lines.append(joined)

    for element in font_info:
        cur_y = element['coordinates'][1]
        cur_page = element.get('page')
        same_line = (
            prev_y is not None
            and cur_page == prev_page
            and abs(cur_y - prev_y) <= y_tolerance
        )
        if prev_y is None or same_line:
            parts.append(element['text'])
        else:
            flush()
            parts = [element['text']]
        prev_y = cur_y
        prev_page = cur_page

    # Don't forget the last line
    flush()

    return lines


class TextCleaner:
    """
    Two-stage cleaner for lines extracted from a table of contents.

    Stage one (``filter_lines``) drops whole lines; stage two
    (``filter_noise``) removes noise sequences inside the remaining lines.
    All compiled regular expressions live in ``self.patterns``.
    """

    def __init__(self):
        # Patterns that cause a whole line to be discarded.
        line_filters = {
            'numbered_lines': re.compile(r'^\d+\.\d+\b'),
            'symbol_only': re.compile(r'^[\W_]+$'),
            'copyright_pattern': re.compile(r'(©|ⓒ|\(c\)|\(C\)|c\s*⃝)', re.IGNORECASE),
            'exercises_pattern': re.compile(r'^\s*Exercises?\b[\s\d.:!?-]*$', re.IGNORECASE),
        }
        # Patterns whose matches are cut out of a line.
        noise_filters = {
            'dotted_noise': re.compile(r'(?<!\w)([.\s]){3,}(?!\w)'),
            'symbol_noise': re.compile(r'(?<!\w)([\W]\s?){3,}(?!\w)'),
        }
        self.patterns = {**line_filters, **noise_filters}

    def filter_lines(self, lines):
        """Return the lines that match none of the line-level filters.

        Matching is done on the stripped line; kept lines are returned
        unmodified.
        """
        rx = self.patterns
        kept = []
        for raw in lines:
            candidate = raw.strip()
            if rx['numbered_lines'].match(candidate):
                continue
            if rx['symbol_only'].match(candidate):
                continue
            if rx['copyright_pattern'].search(candidate):
                continue
            if rx['exercises_pattern'].match(candidate):
                continue
            kept.append(raw)
        return kept

    def filter_noise(self, lines):
        """Strip dotted and symbol noise from every line, then trim it."""
        dotted = self.patterns['dotted_noise']
        symbols = self.patterns['symbol_noise']
        # Dotted noise is removed first, symbol noise second.
        return [symbols.sub('', dotted.sub('', entry)).strip() for entry in lines]

    def process(self, lines):
        """Run line filtering followed by noise removal."""
        return self.filter_noise(self.filter_lines(lines))

In [None]:
# font_info = extract_font_info(examples[key])
# lines = extract_lines_from_font_info(font_info)
# cleaner = TextCleaner()
# processed_lines = cleaner.process(lines)

# for line in processed_lines:
#     print(line)

Contents
Foreword 1
Part I Mathematical Foundations 9
1 Introduction and Motivation 11
2 Linear Algebra 17
3 Analytic Geometry 70
4 Matrix Decompositions 98
i
This material will be published by Cambridge University Press as  Mathematics for Machine Learn-
ii Contents
5 Vector Calculus 139
6 Probability and Distributions 172
7 Continuous Optimization 225
Part II Central Machine Learning Problems 249
8 When Models Meet Data 251
Draft (2019-12-11) of “Mathematics for Machine Learning”. Feedback:  https://mml-book.com .
Contents iii
9 Linear Regression 289
10 Dimensionality Reduction with Principal Component Analysis 317
11 Density Estimation with Gaussian Mixture Models 348
12 Classiﬁcation with Support Vector Machines 370
References 395
Index 407
c
⃝ 2019 M. P. Deisenroth, A. A. Faisal, C. S. Ong. To be published by Cambridge University Press.


### Test runpod output

In [89]:
import os
import time
import requests
from dotenv import load_dotenv
import json
import codecs
from pathlib import Path

# Load environment variables from a .env file (load_dotenv() is called without
# an explicit path, so python-dotenv decides where to look).
load_dotenv()

# Credentials and endpoint base URL for RunPod. os.getenv returns None when a
# variable is unset; in that case HEADERS below carries "Bearer None" and the
# request URLs built from ENDPOINT start with "None/".
API_KEY = os.getenv("RUNPOD_API_KEY")
ENDPOINT = os.getenv("RUNPOD_ENDPOINT")

# Headers sent with every request made by run_prompt().
HEADERS = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json"
}

In [90]:
def run_prompt(
    prompt: str,
    max_tokens: int = 256,
    context_length: int = 8192,
    poll_interval: float = 3.0,
    max_wait: float = float("inf"),
    request_timeout: float = 30.0,
) -> str:
    """Submit a prompt to the RunPod endpoint and get back a response string.

    The job is started with ``POST {ENDPOINT}/run`` and then polled through
    ``GET {ENDPOINT}/status/{job_id}`` until it reaches a terminal state.

    Args:
        prompt: Full prompt text sent to the model.
        max_tokens: Sent as ``options.num_predict`` (max tokens to generate).
        context_length: Sent as ``options.num_ctx`` (context window size).
        poll_interval: Seconds to sleep between two status requests.
        max_wait: Maximum number of seconds to keep polling; the default
            (infinity) polls until the job reaches a terminal state.
        request_timeout: Timeout in seconds for each individual HTTP request.

    Returns:
        The ``output["response"]`` field of the completed job.

    Raises:
        requests.HTTPError: If the endpoint answers with an HTTP error status.
        RuntimeError: If no job id is returned or the job ends in a terminal
            state other than ``COMPLETED``.
        TimeoutError: If the job is still running after ``max_wait`` seconds.
    """
    payload = {
        "input": {
            "prompt": prompt,
            "options": {
                "num_ctx": context_length,      # Context window size
                "num_predict": max_tokens       # Max tokens to generate
            }
        }
    }

    # Start job
    response = requests.post(
        f"{ENDPOINT}/run", headers=HEADERS, json=payload, timeout=request_timeout
    )
    response.raise_for_status()
    job_id = response.json().get("id")
    if not job_id:
        # Without an id the status URL would be ".../status/None" and the
        # polling loop below could never finish.
        raise RuntimeError(f"RunPod did not return a job id: {response.text[:200]}")
    print(f"[RunPod] Job started: {job_id}")

    # States after which the job will not change any more.
    terminal_states = ("COMPLETED", "FAILED", "CANCELLED", "TIMED_OUT")
    deadline = time.monotonic() + max_wait

    # Poll for status
    while True:
        status_response = requests.get(
            f"{ENDPOINT}/status/{job_id}", headers=HEADERS, timeout=request_timeout
        )
        status_response.raise_for_status()
        status_res = status_response.json()
        status = status_res.get("status")
        print(f"[RunPod] Status: {status}")
        if status in terminal_states:
            break
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"RunPod job {job_id} did not finish within {max_wait} seconds "
                f"(last status: {status})."
            )
        time.sleep(poll_interval)

    if status == "COMPLETED":
        return status_res["output"]["response"]
    if status == "FAILED":
        raise RuntimeError("RunPod job failed.")
    raise RuntimeError(f"RunPod job ended with status {status}.")


def clean_and_parse_json(raw_text: str):
    """Clean and parse model output into JSON.

    Surrounding whitespace, an optional Markdown code fence (with or without
    a ``json`` language tag, any letter case) and surrounding single quotes
    are removed before parsing. If parsing fails, backslash escapes in the
    text (for example escaped quotes) are resolved once and parsing is
    retried.

    Args:
        raw_text: Raw string returned by the model.

    Returns:
        The decoded JSON value (list, dict, str, number, bool or None).

    Raises:
        ValueError: If the text cannot be parsed as JSON even after
            unescaping.
    """
    cleaned = raw_text.strip()
    # Remove the fence as a prefix/suffix. str.strip("```json") would instead
    # strip any of the characters `, j, s, o, n from both ends and so damage
    # values such as `null`.
    cleaned = re.sub(r"^```(?:json)?", "", cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r"```$", "", cleaned)
    cleaned = cleaned.strip().strip("'").strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        try:
            # Handle escaped quotes. Encoding with backslashreplace first keeps
            # non-ASCII characters intact; decoding a str directly with
            # 'unicode_escape' would turn them into mojibake.
            unescaped = cleaned.encode("latin-1", "backslashreplace").decode("unicode_escape")
            return json.loads(unescaped)
        except Exception as e:
            raise ValueError("Could not parse JSON output") from e
        


In [143]:
def toc_prompt_new(toc_text: str):
    """Build the compact chapter-extraction prompt for a table of contents.

    The prompt consists of a single user turn followed by an opened model
    turn that is pre-filled with ``[``, so the text ends inside a JSON array.

    Args:
        toc_text: Table-of-contents text inserted verbatim into the prompt.

    Returns:
        The complete prompt string.
    """
    return f"""<start_of_turn>user
You are a precise document parser. Extract main chapters from this table of contents.

RULES:
1. Extract ONLY numbered chapters (1, 2, 3...), NOT subsections (1.1, 1.2...)
2. Use EXACT titles and page numbers from the document
3. Return ONLY valid JSON array, no explanations
4. If unclear, return empty array []

Expected format:
[{{"chapter_number": "1", "chapter_title": "Introduction", "start_page": 1, "end_page": 25}}]

Table of contents:
{toc_text}

Return JSON only:<end_of_turn>
<start_of_turn>model
["""


def toc_prompt2(toc_text: str):
    """Build the multi-turn chapter-extraction prompt for a table of contents.

    The prompt contains the instructions in a user turn, a model turn that
    stops right after an opening ``[``, a user turn asking to continue, and
    finally an opened model turn for the answer.

    Args:
        toc_text: Table-of-contents text inserted verbatim into the prompt.

    Returns:
        The complete prompt string.
    """
    return f"""<start_of_turn>user
You are a precise document parser that extracts structured information from table of contents. You NEVER hallucinate, invent, or make up information. You ONLY extract what is explicitly present in the provided text.

I need to extract main chapter information from this table of contents. Only extract numbered chapters, ignore subsections.

Here is the table of contents:
{toc_text}

CRITICAL RULES:
1. Extract ONLY main chapters that start with a number (1, 2, 3, etc.)
2. Do NOT extract subsections (like 1.1, 1.2, 2.1, etc.)
3. Use the EXACT chapter titles shown in the document
4. Use the EXACT page numbers shown in the document. The starting page of a chapter is the page where the chapter starts, and the end page is the page before the next chapter starts.
5. Handle both roman numerals (i, ii, iii, v, x) and arabic numerals (1, 25, 100)
6. Calculate end pages as: next chapter's start page minus 1
7. Return ONLY valid JSON - no explanations, no markdown formatting
8. If you cannot clearly identify chapters, return empty array []
9. Exclude chapters starting with 0

Return JSON array: [{{"chapter_number": "X", "chapter_title": "...", "start_page": X, "end_page": X}}]<end_of_turn>
<start_of_turn>model
I will carefully examine the table of contents and extract only the main chapters that are explicitly shown, using their exact titles and page numbers.

Looking at the provided table of contents, I can identify the following main chapters:

[<end_of_turn>
<start_of_turn>user
Continue with the complete JSON array.<end_of_turn>
<start_of_turn>model
"""


def toc_prompt3(toc_text: str) -> str:
    """Build the multi-turn chapter-extraction prompt with explicit end-page rules.

    The prompt contains the instructions in a user turn, a model turn that
    stops right after an opening ``[``, a user turn asking to continue, and
    finally an opened model turn for the answer. The rules ask for
    ``end_page`` to be the next chapter's start page minus 1, or ``null``
    for the last chapter.

    Args:
        toc_text: Table-of-contents text inserted verbatim into the prompt.

    Returns:
        The complete prompt string.
    """
    # The "CRITICAL RULES:" header is emitted exactly once.
    prompt = f"""<start_of_turn>user
You are a precise document parser that extracts structured information from table of contents. You NEVER hallucinate, invent, or make up information. You ONLY extract what is explicitly present in the provided text.

I need to extract main chapter information from this table of contents. Only extract numbered chapters, ignore subsections.

Here is the table of contents:
{toc_text}

CRITICAL RULES:
1. Extract ONLY main chapters that start with a number (1, 2, 3, etc.)
2. Do NOT extract subsections (like 1.1, 1.2, 2.1, etc.)
3. Use the EXACT chapter titles shown in the document
4. Page numbers MUST be calculated as follows:
   - The starting page is ALWAYS the exact page number shown for the chapter
   - The end page is ALWAYS calculated as: (next chapter's start page) minus 1
   - If there is no next chapter, leave end_page as null
5. Handle both roman numerals (i, ii, iii, v, x) and arabic numerals (1, 25, 100)
6. Return ONLY valid JSON - no explanations, no markdown formatting
7. If you cannot clearly identify chapters, return empty array []
8. Exclude chapters starting with 0
9. DOUBLE-CHECK your end_page calculations based on the next chapter's start page

Return JSON array in this exact format:
[{{"chapter_number": "X", "chapter_title": "...", "start_page": X, "end_page": X|null}}]<end_of_turn>
<start_of_turn>model
I will carefully examine the table of contents and extract only the main chapters that are explicitly shown, using their exact titles and page numbers. I will calculate end pages precisely as (next chapter's start page) minus 1.

Looking at the provided table of contents, I can identify the following main chapters:

[<end_of_turn>
<start_of_turn>user
Continue with the complete JSON array, ensuring end_page is always calculated correctly based on the next chapter's start page.<end_of_turn>
<start_of_turn>model
"""
    return prompt

In [147]:
# Dictionary mapping example keys to PDF paths (relative to the notebook directory).
# NOTE(review): this cell rebinds `examples`, `content_page_ranges`, `key` and
# `doc`, which an earlier cell of the notebook also defines.
examples = {
    "pdf_path1": "../data/mcelreath_2020_statistical-rethinking.pdf",
    "pdf_path2": "../data/Theory of Statistic.pdf",
    "pdf_path3": "../data/Deep Learning with Python.pdf",
    "pdf_path4": "../data/Natural_Image_Statistics.pdf",
    "pdf_path5": "../data/mml-book.pdf",
    "pdf_path6": "../data/Python-Testing-with-pytest-Simple,-Rapid,-Effective,-and-Scalable,-2nd-Edition-by-Brian-Okken_bibis.ir.pdf",
    "pdf_path7": "../data/Marcos Lopez de Prado - Advances in Financial Machine Learning-Wiley (2018).pdf",
    "pdf_path8": "../data/finetuning.pdf",
}

# Dictionary mapping example keys to page ranges to extract content from.
# The values are used directly as indices into `doc` below; `range` excludes
# its end value.
content_page_ranges = {
    "pdf_path1": range(5, 8),
    "pdf_path2": range(10, 17),
    "pdf_path3": range(7, 13),
    "pdf_path4": range(4, 13),
    "pdf_path5": range(2, 5),
    "pdf_path6": range(6, 10),
    "pdf_path7": range(13, 23),
    "pdf_path8": range(4, 9),
}

# Select example number; `key` is a notebook-level global that
# extract_font_info() also reads to pick its page range.
n_example = 7
key = f"pdf_path{n_example}"

# Open the PDF
# NOTE(review): `doc` is never closed in this cell.
doc = fitz.open(examples[key])

# Extract plain text from each page of the selected range, one string per page
chapters_content_list = []
for page_num in content_page_ranges[key]:
    page = doc[page_num]
    text = page.get_text("text")
    chapters_content_list.append(text)

# Join all text pages into a single string if needed
chapters_content = "\n".join(chapters_content_list)

print(chapters_content)  # or pass it to your model

Contents
About the Author
xxi
PREAMBLE
1
1
Financial Machine Learning as a Distinct Subject
3
1.1
Motivation, 3
1.2
The Main Reason Financial Machine Learning Projects Usually Fail, 4
1.2.1
The Sisyphus Paradigm, 4
1.2.2
The Meta-Strategy Paradigm, 5
1.3
Book Structure, 6
1.3.1
Structure by Production Chain, 6
1.3.2
Structure by Strategy Component, 9
1.3.3
Structure by Common Pitfall, 12
1.4
Target Audience, 12
1.5
Requisites, 13
1.6
FAQs, 14
1.7
Acknowledgments, 18
Exercises, 19
References, 20
Bibliography, 20
PART 1
DATA ANALYSIS
21
2
Financial Data Structures
23
2.1
Motivation, 23
ix

x
CONTENTS
2.2
Essential Types of Financial Data, 23
2.2.1
Fundamental Data, 23
2.2.2
Market Data, 24
2.2.3
Analytics, 25
2.2.4
Alternative Data, 25
2.3
Bars, 25
2.3.1
Standard Bars, 26
2.3.2
Information-Driven Bars, 29
2.4
Dealing with Multi-Product Series, 32
2.4.1
The ETF Trick, 33
2.4.2
PCA Weights, 35
2.4.3
Single Future Roll, 36
2.5
Sampling Features, 38
2.5.1
Sampling for Reduction, 38
2.5.2
Eve

In [148]:
# Span-based extraction for the selected example: collect text spans from the
# configured pages, merge spans that share a y-coordinate into lines, then
# drop unwanted lines and strip noise with TextCleaner.
font_info = extract_font_info(examples[key])
lines = extract_lines_from_font_info(font_info)
cleaner = TextCleaner()
processed_lines = cleaner.process(lines)

# Print the cleaned lines for visual inspection.
for line in processed_lines:
    print(line)

Contents
About the Author xxi
PREAMBLE 1
1 Financial Machine Learning as a Distinct Subject 3
Exercises ,  19
References ,  20
Bibliography ,  20
PART 1 DATA ANALYSIS 21
Exercises ,  40
References ,  41
3 Labeling 43
Exercises ,  55
Bibliography ,  56
4 Sample Weights 59
Exercises ,  72
References ,  73
Bibliography ,  73
5 Fractionally Differentiated Features 75
Exercises ,  88
References ,  89
Bibliography ,  89
PART 2 MODELLING 91
6 Ensemble Methods 93
Exercises ,  101
References ,  102
Bibliography ,  102
7 Cross-Validation in Finance 103
Exercises ,  110
Bibliography ,  111
8 Feature Importance 113
Exercises ,  127
References ,  127
9 Hyper-Parameter Tuning with Cross-Validation 129
Exercises ,  135
References ,  136
Bibliography ,  137
PART 3 BACKTESTING 139
10 Bet Sizing 141
Exercises ,  148
References ,  149
Bibliography ,  149
11 The Dangers of Backtesting 151
Exercises ,  158
References ,  158
Bibliography ,  159
12 Backtesting through Cross-Validation 161
Backtest Overfittin

In [149]:
# Step-by-step call: build the prompt from the cleaned TOC lines, run it on
# RunPod, then parse the model reply as JSON.
processed_lines_str = "\n".join(processed_lines)
toc_prompt_text = toc_prompt3(processed_lines_str)
# NOTE(review): `result` is bound to the same string as `raw_output`; it is not
# used in this cell.
raw_output = result = run_prompt(
    prompt=toc_prompt_text,
    max_tokens=1024, #1024,        # Enough for JSON array
    context_length=10000 #16384    # Large context for long TOC
)
json_output = clean_and_parse_json(raw_output)

[RunPod] Job started: 2d17d148-4689-4963-ae76-cf58f3039a00-e2
[RunPod] Status: IN_QUEUE
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: COMPLETED


In [150]:
# Show the parsed chapter list (a bare last expression is rendered as the cell output).
json_output

[{'chapter_number': '1',
  'chapter_title': 'Financial Machine Learning as a Distinct Subject',
  'start_page': 3,
  'end_page': 19},
 {'chapter_number': '3',
  'chapter_title': 'Labeling',
  'start_page': 43,
  'end_page': 55},
 {'chapter_number': '4',
  'chapter_title': 'Sample Weights',
  'start_page': 59,
  'end_page': 72},
 {'chapter_number': '5',
  'chapter_title': 'Fractionally Differentiated Features',
  'start_page': 75,
  'end_page': 88},
 {'chapter_number': '6',
  'chapter_title': 'Ensemble Methods',
  'start_page': 93,
  'end_page': 101},
 {'chapter_number': '7',
  'chapter_title': 'Cross-Validation in Finance',
  'start_page': 103,
  'end_page': 110},
 {'chapter_number': '8',
  'chapter_title': 'Feature Importance',
  'start_page': 113,
  'end_page': 127},
 {'chapter_number': '9',
  'chapter_title': 'Hyper-Parameter Tuning with Cross-Validation',
  'start_page': 129,
  'end_page': 135},
 {'chapter_number': '10',
  'chapter_title': 'Bet Sizing',
  'start_page': 141,
  'end_