In [127]:
import fitz
import pandas as pd
import numpy as np
# Substrings whose presence in a font name marks the font as a bold/heavy weight.
BOLD_FONT_KEYWORDS = ["Bold", "Black", "Heavy", "Demi", "SemiBold"]


def isBold(font):
    """Return True when `font` contains any entry of BOLD_FONT_KEYWORDS.

    The check is a case-sensitive substring match on the font name.
    """
    return any(keyword in font for keyword in BOLD_FONT_KEYWORDS)

def clean_data(df):
    """Drop rows whose ``text`` repeats an earlier row, keeping the first one.

    The frame is modified in place (existing callers rely on that) and is
    also returned so the call can be used as an expression.

    A frame without a ``text`` column -- for example the empty frame produced
    when no PDF yielded any rows -- is returned unchanged instead of raising
    ``KeyError``.
    """
    if "text" not in df.columns:
        return df
    df.drop_duplicates(subset=["text"], inplace=True)
    return df


def add_layout_features(df, page_width=612, page_height=792):
    """Return a copy of `df`, sorted by page and y0, with layout features added.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per text block with at least the columns ``page``, ``x0``,
        ``x1``, ``y0``, ``font_size``, ``isbold`` and ``text``.
    page_width, page_height : float
        Page dimensions used to normalise positions. A single size is applied
        to every page.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the feature columns
        listed in ``feature_names`` below. An empty input yields an empty
        frame that still carries those columns.
    """
    feature_names = (
        "line_indent",
        "line_center_offset",
        "line_width_ratio",
        "y0_norm",
        "font_size_relative",
        "line_spacing_above",
        "is_all_caps",
        "is_title_case",
        "line_length_chars",
        "line_density",
        "relative_y0",
        "is_bold_and_large",
    )

    # An empty frame may not even have the 'page'/'y0' columns, so sorting it
    # would raise KeyError. Return the expected schema with zero rows instead.
    if df.empty:
        empty = df.copy()
        for name in feature_names:
            if name not in empty.columns:
                empty[name] = pd.Series(dtype="float64")
        return empty

    # sort_values returns a new frame, so the caller's frame is left untouched.
    df = df.sort_values(by=["page", "y0"])
    block_width = df["x1"] - df["x0"]

    # Horizontal position, relative to the left-most block on the same page.
    df["line_indent"] = df["x0"] - df.groupby("page")["x0"].transform("min")
    df["line_center_offset"] = ((df["x0"] + df["x1"]) / 2 - (page_width / 2)).abs()
    df["line_width_ratio"] = block_width / page_width
    df["y0_norm"] = df["y0"] / page_height

    # Font size relative to the median size on the same page.
    df["font_size_relative"] = df["font_size"] - df.groupby("page")["font_size"].transform("median")

    # Distance between consecutive y0 values on a page (0 for the first block).
    df["line_spacing_above"] = df.groupby("page")["y0"].diff().fillna(0)

    # NOTE(review): text without any letters (e.g. "1") also counts as all caps.
    df["is_all_caps"] = (df["text"].str.upper() == df["text"]).astype(int)
    df["is_title_case"] = df["text"].str.istitle().astype(int)
    df["line_length_chars"] = df["text"].str.len()
    # The epsilon avoids division by zero for zero-width blocks.
    df["line_density"] = df["line_length_chars"] / (block_width + 1e-5)
    # Same value as y0_norm; kept because the feature list names both columns.
    df["relative_y0"] = df["y0"] / page_height
    # "Large" means above the mean font size of the whole frame, not per page.
    is_large = df["font_size"] > df["font_size"].mean()
    df["is_bold_and_large"] = (is_large & df["isbold"].astype(bool)).astype(int)

    return df





def _block_to_record(block, page_num):
    """Collapse one PyMuPDF text block into a single record dict.

    The text of every non-blank span is joined with single spaces. Font name,
    font size and boldness come from the first non-blank span, and the
    bounding box is the block's own. Returns None when the block has no text.
    """
    first_span = None
    pieces = []
    for line in block["lines"]:
        for span in line["spans"]:
            span_text = span["text"].strip()
            if not span_text:
                continue
            if first_span is None:
                first_span = span
            pieces.append(span_text)

    if first_span is None:
        return None

    text = " ".join(pieces)
    x0, y0, x1, y1 = block["bbox"][:4]
    return {
        "text": text,
        "font": first_span["font"],
        "font_size": round(first_span["size"]),
        "word_count": len(text.split(" ")),
        "x0": x0,
        "y0": y0,
        "x1": x1,
        "y1": y1,
        "isbold": int(isBold(first_span["font"])),
        "page": int(page_num),
    }


def extract_sentences(pdf_path):
    """Extract one row per text block from a PDF and add layout features.

    Parameters
    ----------
    pdf_path : str or path-like
        PDF file to open with PyMuPDF (``fitz``).

    Returns
    -------
    pandas.DataFrame
        Block text, font, size, bounding box, boldness and zero-based page
        number, plus the columns added by ``add_layout_features``, with
        duplicate texts removed by ``clean_data``. When the document has no
        extractable text, an empty frame with only the base columns is
        returned.
    """
    base_columns = [
        "text", "font", "font_size", "word_count",
        "x0", "y0", "x1", "y1", "isbold", "page",
    ]

    sentences = []
    page_width = page_height = None
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=0):
            # NOTE(review): only the last page's size survives the loop and is
            # used to normalise every page; mixed-size documents are skewed.
            page_height = page.rect.height
            page_width = page.rect.width
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry carry no text spans; skip them.
                if "lines" not in block:
                    continue
                record = _block_to_record(block, page_num)
                if record is not None:
                    sentences.append(record)

    # No pages or no text: nothing to featurise, and the page size may be unset.
    if not sentences:
        return pd.DataFrame(columns=base_columns)

    data = pd.DataFrame(sentences)
    data = add_layout_features(data, page_width=page_width, page_height=page_height)
    return clean_data(data)
def label_blocks_from_outline(blocks_df, json_path, threshold=50):
    """Label each block by fuzzy-matching its text against a JSON outline.

    Parameters
    ----------
    blocks_df : pandas.DataFrame
        Blocks with ``text`` and ``page`` columns. A ``label`` column is added
        in place and the same frame is returned.
    json_path : str or path-like
        JSON file with an ``outline`` list whose entries have ``text``,
        ``level`` and ``page`` keys.
    threshold : float
        Minimum ``rapidfuzz.fuzz.ratio`` score for a block to take the level
        of its best-matching outline entry on the same page.

    Returns
    -------
    pandas.DataFrame
        `blocks_df` with ``label`` set to the matched level (as ``str``) or
        ``"other"``.
    """
    import json
    from rapidfuzz import fuzz

    with open(json_path, "r", encoding="utf-8") as handle:
        outline = json.load(handle)["outline"]

    # Group and strip the outline once instead of re-filtering it per row.
    entries_by_page = {}
    for entry in outline:
        entries_by_page.setdefault(entry["page"], []).append(
            (entry["text"].strip(), entry["level"])
        )

    def match_label(row):
        candidates = entries_by_page.get(row["page"])
        if candidates:
            # max() keeps the first entry on ties, i.e. outline order wins.
            best_level, best_score = max(
                ((level, fuzz.ratio(row["text"], text)) for text, level in candidates),
                key=lambda scored: scored[1],
            )
            if best_score >= threshold:
                return str(best_level)
        return "other"

    # apply(axis=1) on an empty frame returns a DataFrame, which cannot be
    # assigned to a single column; add an empty label column directly instead.
    if blocks_df.empty:
        blocks_df["label"] = pd.Series(dtype="object")
        return blocks_df

    blocks_df["label"] = blocks_df.apply(match_label, axis=1)
    return blocks_df
def prepare_training_data(pdf_files):
    """Build the labelled training frame from PDFs with sibling JSON outlines.

    For every path in `pdf_files`, the outline is expected next to the PDF
    with the same name and a ``.json`` suffix. PDFs without an outline are
    reported and skipped before any extraction work is done.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all labelled frames with duplicate texts removed by
        ``clean_data``, or an empty frame when nothing could be labelled.
    """
    import pathlib

    labelled_frames = []
    for pdf_file in pdf_files:
        print(f"Processing {pdf_file}...")
        json_path = pathlib.Path(pdf_file).with_suffix(".json")
        if not json_path.exists():
            print(f"Warning: JSON outline {json_path} not found. Skipping labeling.")
            continue
        blocks = extract_sentences(pdf_file)
        labelled_frames.append(label_blocks_from_outline(blocks, json_path))

    # Nothing labelled: return early, because an empty frame has no 'text'
    # column for the de-duplication step to work on.
    if not labelled_frames:
        return pd.DataFrame()

    # Concatenate once at the end; growing a frame inside the loop is quadratic.
    return clean_data(pd.concat(labelled_frames, ignore_index=True))

In [92]:
import pathlib
# Train on every PDF under ./data except the one held out for prediction.
# Filtering on the file name works on any OS and does not raise when the file
# is absent, unlike list.remove() with a hard-coded Windows path.
HELD_OUT_PDF = "E0H1CM114.pdf"
data_dir = pathlib.Path("./data")
# Sorted so that the concatenation order (and therefore which duplicate text
# survives de-duplication) is reproducible across runs.
pdf_files = sorted(str(path) for path in data_dir.glob("*.pdf") if path.name != HELD_OUT_PDF)
df = prepare_training_data(pdf_files)
# df.to_csv("E0CCG5S3122.csv")
# print(df.dtypes)
df

Processing data\combining-heuristics.pdf...
extracted data                                                   text  \
0                                                    1   
1           Combining heuristics and Exact Algorithms:   
2                                             A Review   
3           Hengameh Fakhravar Old Dominion University   
4                                     Hfakh001@odu.edu   
..                                                 ...   
113  [57] A. Plateau, D. Tachat, and P. Tolla, “A h...   
114  [58] G. R. Raidl, “An improved genetic algorit...   
115  [59] M. Vasquez and J. K. Hao, “A hybrid appro...   
116  [60] M. Vasquez and Y. Vimont, “Improved resul...   
117  [61] Tahami, H., Mirzazadeh, A., & Gholami-Qad...   

                       font  font_size  word_count          x0          y0  \
0    TimesNewRomanPS-BoldMT         10           1   54.000000  733.541626   
1         TimesNewRomanPSMT         24           5   91.344002   55.056004   
2         

Unnamed: 0,text,font,font_size,word_count,x0,y0,x1,y1,isbold,page,...,y0_norm,font_size_relative,line_spacing_above,is_all_caps,is_title_case,line_length_chars,line_density,relative_y0,is_bold_and_large,label
0,Combining heuristics and Exact Algorithms:,TimesNewRomanPSMT,24,5,91.344002,55.056004,550.659973,81.624001,0,0,...,0.069515,13.0,0.000000,0,0,42,0.091440,0.069515,0,other
1,A Review,TimesNewRomanPSMT,24,2,254.690002,82.675995,363.309998,109.243996,0,0,...,0.104389,13.0,27.619991,0,1,8,0.073651,0.104389,0,other
2,Hengameh Fakhravar Old Dominion University,TimesNewRomanPSMT,11,5,249.529999,109.583344,365.350006,147.124634,0,0,...,0.138363,0.0,26.907349,0,1,42,0.362632,0.138363,0,other
3,Hfakh001@odu.edu,TimesNewRomanPSMT,10,1,265.489990,147.625641,349.119995,158.651367,0,0,...,0.186396,-1.0,38.042297,0,0,16,0.191319,0.186396,0,other
4,Abstract Several different ways exist for appr...,TimesNewRomanPS-BoldItalicMT,10,136,54.000000,159.056335,308.059998,516.301392,1,0,...,0.200829,-1.0,11.430695,0,0,1030,4.054160,0.200829,1,other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1492,except Japan and South Korea,Times New Roman,12,5,174.979996,219.967972,321.694977,233.251968,0,6,...,0.277737,0.0,20.279968,0,0,28,0.190846,0.277737,0,other
1493,Signal Good coverage indoors on 850/900 MHz Un...,Times New Roman,12,24,53.040001,245.557999,556.825012,279.122040,0,6,...,0.310048,0.0,25.590027,0,0,166,0.329506,0.310048,0,other
1494,Handoff Hard Soft,Times New Roman,12,3,53.040001,291.398010,407.725006,304.682007,0,6,...,0.367927,0.0,45.840012,0,1,17,0.047930,0.367927,0,other
1495,Power consumption Less More,Times New Roman,12,4,53.040001,316.958008,413.920013,330.242004,0,6,...,0.400200,0.0,25.559998,0,0,27,0.074817,0.400200,0,other


In [9]:
print(df.head())
print(df.count())
# Missing values per column. The previous `df[df.isnull()].count()` masked the
# frame down to its null cells, so the non-null count was always zero.
print(df.isnull().sum())

                                                text  \
0         Combining heuristics and Exact Algorithms:   
1                                           A Review   
2         Hengameh Fakhravar Old Dominion University   
3                                   Hfakh001@odu.edu   
4  Abstract Several different ways exist for appr...   

                           font  font_size  word_count          x0  \
0             TimesNewRomanPSMT         24           5   91.344002   
1             TimesNewRomanPSMT         24           2  254.690002   
2             TimesNewRomanPSMT         11           5  249.529999   
3             TimesNewRomanPSMT         10           1  265.489990   
4  TimesNewRomanPS-BoldItalicMT         10         136   54.000000   

           y0          x1          y1  isbold  page  ...   y0_norm  \
0   55.056004  550.659973   81.624001       0     0  ...  0.069515   
1   82.675995  363.309998  109.243996       0     0  ...  0.104389   
2  109.583344  365.350006  147.1

In [128]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.utils import resample

# from sentence_transformers import SentenceTransformer

# --------- CONFIGURATION ---------
# Presumably a sentence-transformers model name; in the visible cells it is
# only referenced by commented-out embedding code -- TODO confirm it is needed.
EMBEDDING_MODEL = "paraphrase-multilingual-MiniLM-L12-v2"
# Path of the joblib bundle written by train_header_classifier and read back
# by the prediction and self-training cells.
MODEL_OUTPUT_PATH = "header_classifier_combined_new2.pkl"


def train_header_classifier(df):
    """Train the heading classifier on labelled layout features and save it.

    Steps:
      1. Upsample the non-"other" rows to twice the number of "other" rows.
      2. Drop rows with missing values and label-encode the font name.
      3. Fit a LightGBM classifier on the training split, then fit a logistic
         regression meta-model on its held-out class probabilities.
      4. Fit a StackingClassifier (LightGBM + logistic regression, with
         feature passthrough) on the training split.
      5. Dump a bundle with the keys ``model`` (the stacked model),
         ``meta_model``, ``font_encoder``, ``target_encoder`` and
         ``feature_columns`` to ``MODEL_OUTPUT_PATH``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column, a ``font`` column and every column
        named in ``feature_columns`` below (except ``font_name_encoded``,
        which is derived here). The caller's frame is not modified.

    Returns
    -------
    None
        The trained artefacts are written to disk. Nothing is written when no
        rows remain after dropping missing values.
    """
    # Separate majority and minority classes.
    df_majority = df[df["label"] == "other"]
    df_minority = df[df["label"] != "other"]

    # Upsample the heading rows (all non-"other" labels together).
    # NOTE(review): upsampling happens before the train/test split, so copies
    # of one row can land on both sides; the held-out split is therefore not
    # an unbiased evaluation set.
    df_minority_upsampled = resample(
        df_minority,
        replace=True,
        n_samples=(len(df_majority) * 2),  # or adjust as needed
        random_state=42,
    )
    df = pd.concat([df_majority, df_minority_upsampled]).dropna()
    if df.empty:
        # Return without waiting on input(), which would hang a Run-All.
        print("Warning: DataFrame is empty after dropping NaN values.")
        return

    # Encode font names; "unknown" is reserved for fonts first seen at
    # prediction time.
    font_encoder = LabelEncoder()
    df["font"] = df["font"].astype(str)
    font_encoder.fit(list(df["font"].unique()) + ["unknown"])
    df["font_name_encoded"] = font_encoder.transform(df["font"])

    feature_columns = [
        "page",
        "x0",
        "y0",
        "x1",
        "y1",
        "font_name_encoded",
        "font_size",
        "word_count",
        "isbold",
        "line_indent",
        "line_center_offset",
        "line_width_ratio",
        "y0_norm",
        "font_size_relative",
        "line_spacing_above",
        "is_all_caps",
        "is_title_case",
        "line_length_chars",
        "line_density",
        "relative_y0",
        "is_bold_and_large",
    ]

    X = df[feature_columns]
    y = df["label"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=42
    )

    model = lgb.LGBMClassifier(
        n_estimators=600,
        learning_rate=0.001,
        num_leaves=200,
        max_depth=8,
        min_child_samples=20,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=1.0,
        reg_lambda=1.0,
        class_weight="balanced",
        random_state=42,
    )

    # Both LabelEncoder and the LightGBM wrapper order classes by sorted label,
    # so encoded targets line up with the predict_proba columns.
    target_encoder = LabelEncoder()
    target_encoder.fit(list(y.unique()))

    model.fit(X_train, y_train)
    holdout_probs = model.predict_proba(X_test)

    # Fit the meta-model on the TRUE held-out labels. Fitting it on
    # argmax(holdout_probs) only teaches it to echo the base model.
    meta_model = LogisticRegression(max_iter=500)
    meta_model.fit(holdout_probs, target_encoder.transform(y_test))

    # Stacked model. StackingClassifier clones and refits both estimators, so
    # the `meta_model` fitted above stays a separate artefact in the bundle.
    stacked_model = StackingClassifier(
        estimators=[("lgb", model)],
        final_estimator=meta_model,
        passthrough=True,
        n_jobs=-1,
    )
    stacked_model.fit(X_train, y_train)

    joblib.dump(
        {
            "model": stacked_model,
            "meta_model": meta_model,
            "font_encoder": font_encoder,
            "target_encoder": target_encoder,
            "feature_columns": feature_columns,
        },
        MODEL_OUTPUT_PATH,
    )

    print(f"✅ Model saved as {MODEL_OUTPUT_PATH}")


# Train on the labelled frame `df` and write the model bundle to MODEL_OUTPUT_PATH.
train_header_classifier(df)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000484 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3249
[LightGBM] [Info] Number of data points in the train set: 3242, number of used features: 21
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=500).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


✅ Model saved as header_classifier_combined_new2.pkl


In [129]:
import joblib
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load trained components from the bundle written by train_header_classifier.
# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code; only load bundles produced by this notebook.
model_bundle = joblib.load(MODEL_OUTPUT_PATH)

# 'model' is the fitted StackingClassifier and 'meta_model' the separately
# fitted logistic regression; predict_headers below uses both.
model = model_bundle['model']
font_encoder = model_bundle['font_encoder']
target_encoder = model_bundle['target_encoder']
meta_model = model_bundle["meta_model"]
# embedding_model_name = model_bundle['embedding_model_name']
# Ordered list of feature column names the model was trained on.
feature_columns = model_bundle['feature_columns']
print(feature_columns)
# Load embedding model locally (disabled: the bundle stores no embedding model name)
# embedding_model = SentenceTransformer(embedding_model_name)


def predict_headers(pdf_path):
    """Predict a heading label for every text block of a PDF.

    Uses the module-level ``model``, ``meta_model``, ``font_encoder``,
    ``target_encoder`` and ``feature_columns`` loaded from the saved bundle.

    Parameters
    ----------
    pdf_path : str or path-like
        PDF to run through ``extract_sentences``.

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``page`` and ``predicted_label``; empty when the PDF
        yields no text blocks.
    """
    # Step 1: extract blocks and layout features from the PDF.
    df_new = extract_sentences(pdf_path)
    if df_new.empty:
        return pd.DataFrame(columns=["text", "page", "predicted_label"])

    # Step 2: encode font names with the trained encoder; fonts it has never
    # seen map to the reserved "unknown" class.
    df_new["font"] = df_new["font"].astype(str)
    is_known_font = df_new["font"].isin(font_encoder.classes_)
    df_new["font_name_encoded"] = font_encoder.transform(
        df_new["font"].where(is_known_font, "unknown")
    )

    # Step 3: select the training-time feature columns, in the same order.
    X_new = df_new[feature_columns]

    # Step 4: class probabilities from the saved model, then the meta-model
    # turns them into encoded class ids.
    # NOTE(review): `model` is the StackingClassifier, which already contains
    # a final estimator, so `meta_model` is a second stage on top of stacked
    # probabilities -- confirm this double stacking is intended.
    class_probs = model.predict_proba(X_new)
    meta_preds = meta_model.predict(class_probs)
    df_new["predicted_label"] = target_encoder.inverse_transform(meta_preds)
    return df_new[["text", "page", "predicted_label"]]



# Predict labels for the PDF that was excluded from the training set.
predictions_df = predict_headers("./data/E0H1CM114.pdf")

# See predictions: show only the blocks predicted to be something other than "other".
print(predictions_df[predictions_df['predicted_label']!='other'])

['page', 'x0', 'y0', 'x1', 'y1', 'font_name_encoded', 'font_size', 'word_count', 'isbold', 'line_indent', 'line_center_offset', 'line_width_ratio', 'y0_norm', 'font_size_relative', 'line_spacing_above', 'is_all_caps', 'is_title_case', 'line_length_chars', 'line_density', 'relative_y0', 'is_bold_and_large']
['page', 'x0', 'y0', 'x1', 'y1', 'font_name_encoded', 'font_size', 'word_count', 'isbold', 'line_indent', 'line_center_offset', 'line_width_ratio', 'y0_norm', 'font_size_relative', 'line_spacing_above', 'is_all_caps', 'is_title_case', 'line_length_chars', 'line_density', 'relative_y0', 'is_bold_and_large']
                                                  text  page predicted_label
0                 Ontario’s Libraries Working Together     0              H1
2    To Present a Proposal for Developing the Busin...     0              H1
5                            Ontario’s Digital Library     1              H1
6    A Critical Component for Implementing Ontario’...     1              H1

In [121]:
# Scratch check: `in` tests whether the left operand is an ELEMENT of the list,
# not whether it is a sub-list, so this evaluates to False.
[1,2,3] in [1,2,3,4,5,6,7,8,9]

False

In [125]:
import pandas as pd
import numpy as np
import joblib
from sklearn.utils import resample
from sentence_transformers import SentenceTransformer
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the model bundle once; the functions below read these module-level names.
# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code; only load bundles produced by this notebook.
model_bundle = joblib.load(MODEL_OUTPUT_PATH)
model = model_bundle['model']
# # semantic_model = model_bundle['semantic_model']
font_encoder = model_bundle['font_encoder']
# font_encoder = model_bundle['font_encoder']

# Maps encoded class ids back to label strings in generate_pseudo_labels.
target_encoder = model_bundle['target_encoder']
# # embedding_model_name = model_bundle['embedding_model_name']
# Ordered list of feature column names the model was trained on.
layout_features = model_bundle['feature_columns']
# embedding_model = SentenceTransformer(embedding_model_name)


# STEP 1: Function to Predict and Generate Pseudo-Labels (with Confidence Filtering)
def generate_pseudo_labels(pdf_path, confidence_threshold=0.5):
    """Predict labels for a PDF and keep only the confident predictions.

    Uses the module-level ``model``, ``font_encoder``, ``target_encoder`` and
    ``layout_features`` loaded from the saved bundle.

    Parameters
    ----------
    pdf_path : str or path-like
        PDF to run through ``extract_sentences``.
    confidence_threshold : float
        A block is kept when its highest class probability is at least this
        value.

    Returns
    -------
    pandas.DataFrame
        The confident rows of the extracted frame with a ``label`` column
        holding the predicted label; empty (but still with a ``label``
        column) when the PDF yields no text blocks.
    """
    df_new = extract_sentences(pdf_path)
    if df_new.empty:
        print(f"Pseudo-labeled 0 high-confidence sentences from {pdf_path}")
        return df_new.assign(label=pd.Series(dtype="object"))

    # Fonts the encoder has never seen map to the reserved "unknown" class.
    df_new["font"] = df_new["font"].astype(str)
    is_known_font = df_new["font"].isin(font_encoder.classes_)
    df_new["font_name_encoded"] = font_encoder.transform(
        df_new["font"].where(is_known_font, "unknown")
    )

    X_layout = df_new[layout_features]
    layout_probs = model.predict_proba(X_layout)

    # The most probable class is the pseudo-label and its probability is the
    # confidence.
    predicted_labels = target_encoder.inverse_transform(np.argmax(layout_probs, axis=1))
    high_conf_mask = layout_probs.max(axis=1) >= confidence_threshold

    df_pseudo = df_new[high_conf_mask].copy()
    df_pseudo["label"] = predicted_labels[high_conf_mask]

    print(f"Pseudo-labeled {len(df_pseudo)} high-confidence sentences from {pdf_path}")

    return df_pseudo


# STEP 2: Retrain Models on Combined Data
def retrain_hybrid_model(df):
    """Retrain a LightGBM layout model on `df` and save it as the new bundle.

    Uses the module-level ``font_encoder`` and ``layout_features`` from the
    loaded bundle.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled frame with ``font``, ``label`` and the layout feature
        columns. The caller's frame is not modified.

    Returns
    -------
    lightgbm.LGBMClassifier
        The newly fitted layout model, which is also the ``model`` entry of
        the bundle written to ``MODEL_OUTPUT_PATH``.
    """
    # Work on a copy so the caller's frame does not gain or lose columns.
    df = df.copy()

    # Fonts the existing encoder has never seen map to its "unknown" class.
    df['font'] = df['font'].astype(str)
    is_known_font = df['font'].isin(font_encoder.classes_)
    df['font_name_encoded'] = font_encoder.transform(
        df['font'].where(is_known_font, 'unknown')
    )

    X_layout = df[layout_features]
    y = df['label']

    # The held-out part of the split is not used for anything yet; the split
    # is kept so the model trains on the same rows as before.
    X_layout_train, _X_layout_test, y_train, _y_test = train_test_split(
        X_layout, y, stratify=y, random_state=42
    )

    layout_model = lgb.LGBMClassifier(
        n_estimators=500,
        max_depth=6,
        num_leaves=200,
        learning_rate=0.0001,
        class_weight='balanced'
    )
    layout_model.fit(X_layout_train, y_train)

    # Build the label encoder from the fitted model's classes so that
    # argmax(predict_proba) indices map back to the right label strings.
    label_encoder = LabelEncoder().fit(layout_model.classes_)

    # Save the model that was just trained. Saving the old global `model`, as
    # before, silently discarded the retraining.
    # NOTE(review): the bundle has no 'meta_model' entry, which the
    # predict_headers loader cell reads -- add one if that cell should load it.
    joblib.dump({
        'model': layout_model,
        'font_encoder': font_encoder,
        'target_encoder': label_encoder,
        'feature_columns': layout_features
    }, MODEL_OUTPUT_PATH)

    print("✅ Retrained model saved.")
    return layout_model


# STEP 3: Recursive Self-Training Loop
def recursive_self_train(initial_df, unlabeled_pdfs, iterations=3, confidence_threshold=0.9):
    """Grow the training set with confident pseudo-labels and retrain.

    For each iteration and each PDF, confident predictions from
    ``generate_pseudo_labels`` are appended to the training frame,
    ``train_header_classifier`` retrains and saves the bundle, and the bundle
    is reloaded into the module-level ``model``, ``font_encoder``,
    ``target_encoder`` and ``layout_features``. Without the reload the next
    PDF would still be labelled by the old model.

    Parameters
    ----------
    initial_df : pandas.DataFrame
        Labelled seed data; it is copied, not modified.
    unlabeled_pdfs : iterable of str or path-like
        PDFs to pseudo-label.
    iterations : int
        Number of passes over `unlabeled_pdfs`.
    confidence_threshold : float
        Starting confidence cut-off. It is raised by 0.05 after each pass
        while it is still at most 0.9.

    Returns
    -------
    pandas.DataFrame
        The seed data plus every accepted pseudo-labelled row. The frame is
        not resampled; class balancing is done inside
        ``train_header_classifier``.
    """
    global model, font_encoder, target_encoder, layout_features

    combined_df = initial_df.copy()

    for i in range(iterations):
        print(f"\n🔁 Iteration {i+1}/{iterations}")
        for pdf_path in unlabeled_pdfs:
            try:
                pseudo_df = generate_pseudo_labels(pdf_path, confidence_threshold=confidence_threshold)
            except Exception as exc:
                # Best effort: report the failing PDF and move on, without
                # swallowing KeyboardInterrupt/SystemExit like a bare except.
                print(f"Skipping {pdf_path}: {exc!r}")
                continue

            # NOTE(review): a PDF seen in an earlier iteration is appended
            # again, so its rows may be duplicated across iterations.
            combined_df = pd.concat([combined_df, pseudo_df], ignore_index=True)

            # Retrain on the un-resampled frame. The trainer upsamples
            # internally; resampling here as well would permanently replace
            # the real rows with bootstrap duplicates.
            train_header_classifier(combined_df)

            # Pick up the freshly saved bundle for the next PDF.
            bundle = joblib.load(MODEL_OUTPUT_PATH)
            model = bundle["model"]
            font_encoder = bundle["font_encoder"]
            target_encoder = bundle["target_encoder"]
            layout_features = bundle["feature_columns"]

        if confidence_threshold<=0.9:
            confidence_threshold+= 0.05  # Adjust confidence threshold for next iteration
            print(f"Updated confidence threshold: {confidence_threshold:.2f}")

    print("\n✅ Recursive self-training completed.")
    return combined_df


In [126]:
# Load your true labeled data
import pathlib
# Train on every PDF under ./data except the one held out for prediction.
# Filtering on the file name works on any OS and does not raise when the file
# is absent, unlike list.remove() with a hard-coded Windows path.
HELD_OUT_PDF = "E0H1CM114.pdf"
data_dir = pathlib.Path("./data")
pdf_files = sorted(str(path) for path in data_dir.glob("*.pdf") if path.name != HELD_OUT_PDF)
df = prepare_training_data(pdf_files)
initial_df = df

# List of unlabeled PDFs to self-train on. Sorted before slicing so the same
# three files are picked on every run.
unlabeled_pdfs = sorted(pathlib.Path("./Pdf").glob("*.pdf"))[:3]

# Run recursive self-training
recursive_self_train(
    initial_df,
    unlabeled_pdfs,
    iterations=1,
    confidence_threshold=0.6  # Can be tuned
)

Processing data\combining-heuristics.pdf...
extracted data                                                   text  \
0                                                    1   
1           Combining heuristics and Exact Algorithms:   
2                                             A Review   
3           Hengameh Fakhravar Old Dominion University   
4                                     Hfakh001@odu.edu   
..                                                 ...   
113  [57] A. Plateau, D. Tachat, and P. Tolla, “A h...   
114  [58] G. R. Raidl, “An improved genetic algorit...   
115  [59] M. Vasquez and J. K. Hao, “A hybrid appro...   
116  [60] M. Vasquez and Y. Vimont, “Improved resul...   
117  [61] Tahami, H., Mirzazadeh, A., & Gholami-Qad...   

                       font  font_size  word_count          x0          y0  \
0    TimesNewRomanPS-BoldMT         10           1   54.000000  733.541626   
1         TimesNewRomanPSMT         24           5   91.344002   55.056004   
2         

KeyboardInterrupt: 

In [None]:
# Predict labels for one PDF from the ./pdf folder.
predictions_df = predict_headers("./pdf/672YOEHR4MIARPSZVTI7ERPCKKA3FNIY.pdf")

# See predictions: show only the blocks predicted to be something other than "other".









print(predictions_df[predictions_df['predicted_label']!='other'])

['page', 'x0', 'y0', 'x1', 'y1', 'font_name_encoded', 'font_size', 'word_count', 'isbold', 'line_indent', 'line_center_offset', 'line_width_ratio', 'y0_norm', 'font_size_relative', 'line_spacing_above', 'is_all_caps', 'is_title_case', 'line_length_chars', 'line_density', 'relative_y0', 'is_bold_and_large']
                                                 text  page  predicted_label
0   Frequently Asked Questions: Financial Aid for ...     0                4
1                          Financial Aid for Students     0                4
2   Guides students through the process of locatin...     0                4
3   - The basics: getting started - Student aid an...     0                4
4                     The basics: getting started TOP     0                4
..                                                ...   ...              ...
74                            Repaying your loans TOP     5                1
75  After college, the federal government has ways...     5                4