In [16]:
import json
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib
import difflib


In [17]:
import fitz  # PyMuPDF

In [18]:

def extract_lines_with_features(pdf_path):
    """Extract every text line of a PDF together with simple layout features.

    The document is opened with PyMuPDF (``fitz``) and each page is read via
    ``page.get_text("dict")``. All spans of a line are concatenated into one
    string and summarised into a single record.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list of dicts, one per text line in document order, with the keys
        ``text`` (span texts joined, then stripped), ``font_size`` (most
        frequent span size on the line, ``0`` if the line has no spans),
        ``is_bold`` / ``is_italic`` (True if any span has the flag set),
        ``y_position`` (``line["bbox"][1]``) and ``page_number`` (1-based).

    Raises:
        Whatever ``fitz.open`` or the extraction raises; the document is
        closed before the exception propagates.
    """
    # Bit masks tested against span["flags"].
    bold_mask = 2**4
    italic_mask = 2**1

    doc = fitz.open(pdf_path)
    lines = []
    try:
        for page_num in range(len(doc)):
            page = doc[page_num]
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                # Blocks without a "lines" entry hold no text and are skipped.
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    line_text = ""
                    font_sizes = []
                    is_bold = False
                    is_italic = False
                    for span in line["spans"]:
                        line_text += span["text"]
                        font_sizes.append(span["size"])
                        if span["flags"] & bold_mask:
                            is_bold = True
                        if span["flags"] & italic_mask:
                            is_italic = True
                    # Mode of the span sizes: the size used by most spans.
                    if font_sizes:
                        font_size = max(set(font_sizes), key=font_sizes.count)
                    else:
                        font_size = 0
                    lines.append({
                        "text": line_text.strip(),
                        "font_size": font_size,
                        "is_bold": is_bold,
                        "is_italic": is_italic,
                        "y_position": line["bbox"][1],  # top y-coordinate
                        "page_number": page_num + 1
                    })
    finally:
        # Release the file handle even when extraction fails part-way through.
        doc.close()
    return lines

def label_lines(lines, json_path, similarity_threshold=0.8):
    """Attach a ground-truth label to every extracted line.

    The JSON file is expected to hold an optional ``"title"`` string and an
    optional ``"outline"`` list whose items carry ``"text"``, ``"page"`` and
    ``"level"``. A non-empty title is treated as a heading on page 1 with the
    label ``"title"``. Each line is compared (``difflib.SequenceMatcher``)
    with the headings on the same page; it takes the label of the most
    similar heading when that similarity exceeds ``similarity_threshold``,
    otherwise the label is ``"none"``.

    Args:
        lines: Iterable of dicts with at least ``"text"`` and
            ``"page_number"`` keys.
        json_path: Path of the ground-truth JSON file.
        similarity_threshold: Ratio that must be strictly exceeded for a
            line to count as a heading. Defaults to 0.8.

    Returns:
        A list of ``(line, label)`` tuples in the order of ``lines``.

    Raises:
        OSError: If the JSON file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an outline item lacks ``text``, ``page`` or ``level``.
    """
    # Read as UTF-8 explicitly rather than relying on the locale default,
    # which can mangle non-ASCII headings.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    title = data.get("title", "")
    outline = data.get("outline", [])

    # Group the headings by page once instead of re-filtering the full list
    # for every line. The title goes first so it wins ties on page 1.
    # Heading text is stripped so that stray surrounding whitespace in the
    # ground truth does not lower the similarity of an otherwise exact match.
    headings_by_page = {}
    if title:
        headings_by_page.setdefault(1, []).append((title.strip(), "title"))
    for item in outline:
        headings_by_page.setdefault(item["page"], []).append(
            (item["text"].strip(), item["level"])
        )

    labeled_lines = []
    for line in lines:
        text = line["text"].strip()
        label = "none"
        # An empty line can never be a heading; skipping it also prevents a
        # spurious perfect match against an empty heading text.
        candidates = headings_by_page.get(line["page_number"]) if text else None
        if candidates:
            ratios = [
                difflib.SequenceMatcher(None, text, heading_text).ratio()
                for heading_text, _ in candidates
            ]
            # max() returns the first maximal index, so the earliest heading
            # wins when several are equally similar.
            best = max(range(len(ratios)), key=ratios.__getitem__)
            if ratios[best] > similarity_threshold:
                label = candidates[best][1]
        labeled_lines.append((line, label))
    return labeled_lines

def compute_features(line):
    """Turn one extracted line record into a flat numeric feature dict.

    Args:
        line: Dict with the keys ``text``, ``font_size``, ``is_bold``,
            ``is_italic``, ``y_position`` and ``page_number``.

    Returns:
        A dict with ``font_size``, ``is_bold``, ``is_italic``, ``y_position``,
        ``text_length``, ``is_all_caps``, ``is_title_case`` and
        ``page_number``, in that order. Boolean inputs and the two casing
        checks are encoded as 0/1 integers.
    """
    content = line["text"]

    features = {}
    # Layout attributes copied from the record (flags become 0/1).
    features["font_size"] = line["font_size"]
    features["is_bold"] = int(bool(line["is_bold"]))
    features["is_italic"] = int(bool(line["is_italic"]))
    features["y_position"] = line["y_position"]
    # Attributes derived from the text itself.
    features["text_length"] = len(content)
    features["is_all_caps"] = int(content.isupper())
    features["is_title_case"] = int(content.istitle())
    features["page_number"] = line["page_number"]
    return features


In [19]:

# Paths to sample dataset
pdf_dir = "sample_dataset/pdfs"
json_dir = "sample_dataset/outputs"

# List the directory once and sort it: os.listdir returns entries in
# arbitrary order, so deriving both lists from one sorted listing keeps each
# PDF paired with its own JSON and makes the training-row order reproducible.
pdf_names = sorted(f for f in os.listdir(pdf_dir) if f.endswith('.pdf'))
pdf_files = [os.path.join(pdf_dir, f) for f in pdf_names]
# splitext swaps only the trailing extension; str.replace would also rewrite
# a ".pdf" that appears earlier in the file name.
json_files = [os.path.join(json_dir, os.path.splitext(f)[0] + '.json') for f in pdf_names]


In [20]:

# Collect labeled data: one (line, label) pair per text line of every PDF.
all_labeled_lines = []
for pdf_file, json_file in zip(pdf_files, json_files):
    lines = extract_lines_with_features(pdf_file)
    labeled = label_lines(lines, json_file)
    all_labeled_lines += labeled

# Prepare training data: feature dicts in X, matching labels in y.
X = [compute_features(pair[0]) for pair in all_labeled_lines]
y = [pair[1] for pair in all_labeled_lines]
# One row per line, one column per feature key.
df = pd.DataFrame(X)


In [21]:

# Encode the string labels as integers; the fitted encoder is kept so the
# integer predictions can be mapped back to label names later.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Train classifier (fixed random_state so repeated runs build the same forest)
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(df, y_encoded)

# Save model and label encoder
for artifact, target_path in ((clf, 'heading_classifier.pkl'), (le, 'label_encoder.pkl')):
    joblib.dump(artifact, target_path)

print("Training complete. Model and label encoder saved as 'heading_classifier.pkl' and 'label_encoder.pkl'.")

Training complete. Model and label encoder saved as 'heading_classifier.pkl' and 'label_encoder.pkl'.
