In [None]:
## Step 1: Define Features

import pandas as pd
from sklearn.model_selection import train_test_split

# 1) Load your CSV
df = pd.read_csv("output_dataset.csv")

# 2) Feature engineering
# Normalise the text column once and derive every feature from that single
# Series. Previously only `text_length` coerced to str; the other features read
# the raw column, so one missing value made `.str.match(...).astype(int)` raise
# (NaN cannot be cast to int) and made the uppercase-ratio lambda iterate over
# a float. Missing text is now treated as the empty string (length 0, all
# flags 0) instead of the 3-character string "nan".
text = df["text"].fillna("").astype(str)

df["text_length"]        = text.str.len()                          # number of characters
df["starts_with_number"] = text.str.match(r"^\d").astype(int)      # 1 if the first character is a digit
df["ends_with_colon"]    = text.str.endswith(":").astype(int)      # 1 if the last character is ":"
df["title_case"]         = text.str.istitle().astype(int)          # 1 if str.istitle() holds
# Fraction of characters that are uppercase; max(len, 1) avoids 0/0 on empty text.
df["uppercase_ratio"]    = text.apply(lambda t: sum(1 for c in t if c.isupper()) / max(len(t), 1))

# 3) Define feature sets
# Continuous columns — these are the ones the pipeline standardises.
numeric_feats = [
    "font_size",
    "x0",
    "y0",
    "text_length",
    "uppercase_ratio",
]
# 0/1 indicator columns — passed to the model unchanged.
binary_feats = [
    "is_bold",
    "is_italic",
    "starts_with_number",
    "ends_with_colon",
    "title_case",
]
# Full model input, numeric columns first, then the indicators.
features = [*numeric_feats, *binary_feats]


In [None]:
## Step 2: Split Data

# 4) Build the inputs and targets for the two tasks

# Task A — heading vs non-heading: every row of the dataset takes part.
X_bin, y_bin = df[features], df["is_heading"]

# Task B — heading-level classification: only rows labelled as headings.
headings = df.loc[df["is_heading"] == 1]
X_lvl, y_lvl = headings[features], headings["heading_level"]

# 5) Hold out 20% of each task's rows for testing; the fixed seed keeps the
#    split reproducible between runs.
split_kwargs = {"test_size": 0.2, "random_state": 42}
Xb_train, Xb_test, yb_train, yb_test = train_test_split(X_bin, y_bin, **split_kwargs)
Xl_train, Xl_test, yl_train, yl_test = train_test_split(X_lvl, y_lvl, **split_kwargs)

# Quick sanity check on the resulting (rows, columns) of each partition.
print("Shapes — heading detector:", Xb_train.shape, Xb_test.shape)
print("Shapes — level classifier:", Xl_train.shape, Xl_test.shape)


Shapes — heading detector: (809, 10) (203, 10)
Shapes — level classifier: (361, 10) (91, 10)


In [None]:
## Step 3: Build a scikit-learn Pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

from sklearn.base import clone  # imported here so this cell carries its own fix

# Preprocessor template: standardise the numeric columns and pass the 0/1
# indicator columns through unchanged. The template itself is never fitted —
# each pipeline below receives its own clone. Pipeline.fit() fits its step
# objects in place, so when both pipelines shared this single instance,
# fitting the level classifier re-fitted the scaler that the already-trained
# heading detector depends on (using heading-only rows). The heading detector
# written to disk then scaled its inputs differently from how its forest was
# trained.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_feats),
    ("bin", "passthrough",  binary_feats),
])

# Heading detector (Task A: heading vs non-heading)
heading_clf = Pipeline([
    ("prep", clone(preprocessor)),
    ("rf",   RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42)),
])
heading_clf.fit(Xb_train, yb_train)
print(classification_report(yb_test, heading_clf.predict(Xb_test)))

# Level classifier (Task B: heading level, trained on heading rows only)
level_clf = Pipeline([
    ("prep", clone(preprocessor)),
    ("rf",   RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)),
])
level_clf.fit(Xl_train, yl_train)
# zero_division=0 reports 0.00 for a class that is never predicted — the same
# numbers as the default — without emitting an UndefinedMetricWarning per metric.
print(classification_report(yl_test, level_clf.predict(Xl_test), zero_division=0))

# Save both fitted pipelines (preprocessing + model) next to the notebook.
joblib.dump(heading_clf, "heading_detector.joblib", compress=3)
joblib.dump(level_clf,   "level_classifier.joblib",   compress=3)


              precision    recall  f1-score   support

           0       0.83      0.86      0.84       100
           1       0.86      0.83      0.84       103

    accuracy                           0.84       203
   macro avg       0.84      0.84      0.84       203
weighted avg       0.84      0.84      0.84       203

              precision    recall  f1-score   support

          H1       0.75      0.50      0.60        12
          H2       0.82      0.95      0.88        39
          H3       0.89      0.87      0.88        39
          H4       0.00      0.00      0.00         1

    accuracy                           0.85        91
   macro avg       0.62      0.58      0.59        91
weighted avg       0.83      0.85      0.84        91



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


['level_classifier.joblib']

In [10]:
import os
from pathlib import Path

import joblib

# Directory that receives the exported model. It defaults to the original
# project location, so existing behaviour is unchanged; set the
# OUTLINE_MODELS_DIR environment variable to export somewhere else (another
# machine, CI) without editing this cell.
models_dir = Path(os.environ.get(
    "OUTLINE_MODELS_DIR",
    r"C:/Users/adamy/PycharmProjects/PythonProject/adobe_1a_outline_extractor/models",
))

# as_posix() keeps forward slashes, so the default path string (and the value
# joblib.dump returns for display) matches the previous hard-coded one.
joblib.dump(
    heading_clf,
    (models_dir / "heading_classifier.pkl").as_posix(),
    compress=3
)


['C:/Users/adamy/PycharmProjects/PythonProject/adobe_1a_outline_extractor/models/heading_classifier.pkl']

In [11]:
import os
from pathlib import Path

import joblib

# Directory that receives the exported model. It defaults to the original
# project location, so existing behaviour is unchanged; set the
# OUTLINE_MODELS_DIR environment variable to export somewhere else (another
# machine, CI) without editing this cell.
models_dir = Path(os.environ.get(
    "OUTLINE_MODELS_DIR",
    r"C:/Users/adamy/PycharmProjects/PythonProject/adobe_1a_outline_extractor/models",
))

# as_posix() keeps forward slashes, so the default path string matches the
# previous hard-coded one.
joblib.dump(
    level_clf,
    (models_dir / "level_classifier.pkl").as_posix(),
    compress=3
)
print("✅ Level classifier saved.")


✅ Level classifier saved.
