In [2]:
import pytesseract
import fitz
import os
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
import pyttsx3
import requests
from sklearn.preprocessing import LabelEncoder

# Tell pytesseract where the Tesseract OCR executable lives (a Windows install path).
# NOTE(review): no OCR call is visible in this section of the file; confirm this
# configuration is still needed and adjust the path on non-Windows machines.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_text_from_pdf(pdf_file):
    """Extract every text span of a PDF together with simple layout features.

    Each span reported by PyMuPDF's ``page.get_text("dict")`` becomes one row.

    Args:
        pdf_file: Path of the PDF to open with ``fitz.open``.

    Returns:
        A ``pandas.DataFrame`` with one row per span and the columns
        ``id`` (1-based running number), ``text``, ``chars`` (length of the
        text), ``width``, ``height``, ``area``, ``char_size`` (font size),
        ``pos_x``/``pos_y`` (top-left corner of the bounding box), ``aspect``
        (width / height) and ``font_style`` (font name). The columns are
        present even when the document contains no text spans.
    """
    columns = [
        'id', 'text', 'chars', 'width', 'height',
        'area', 'char_size', 'pos_x', 'pos_y', 'aspect', 'font_style',
    ]
    rows = []

    # The context manager closes the document even if extraction fails midway.
    with fitz.open(pdf_file) as pdf_document:
        for page in pdf_document:
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry carry no text spans and are skipped.
                for line in block.get("lines", []):
                    for span in line["spans"]:
                        x0, y0, x1, y1 = span["bbox"]
                        width = x1 - x0
                        height = y1 - y0
                        text_segment = span["text"]
                        rows.append({
                            'id': len(rows) + 1,
                            'text': text_segment,
                            'chars': len(text_segment),
                            'width': width,
                            'height': height,
                            'area': width * height,
                            'char_size': span["size"],
                            'pos_x': x0,
                            'pos_y': y0,
                            # A degenerate zero-height box has no defined aspect
                            # ratio; use NaN instead of raising ZeroDivisionError.
                            'aspect': width / height if height else float('nan'),
                            'font_style': span["font"],
                        })

    # Passing ``columns`` keeps the schema stable for documents without text.
    return pd.DataFrame(rows, columns=columns)

def preprocess_text(text):
    """Normalize text: lowercase it, strip symbols, and collapse whitespace.

    Only alphanumeric and whitespace characters survive; every run of
    whitespace is reduced to a single space and the ends are trimmed.
    """
    kept_chars = [ch for ch in text.lower() if ch.isspace() or ch.isalnum()]
    words = ''.join(kept_chars).split()
    return ' '.join(words)

def train_model(dataframe):
    """Fine-tune ``bert-base-uncased`` to predict the ``layout`` label of each row.

    The ``text`` column is normalized with ``preprocess_text``, the ``layout``
    column is integer-encoded with a ``LabelEncoder``, and the model is trained
    for three epochs with batches of 16. The fine-tuned model and tokenizer are
    saved to ``./model``.

    Args:
        dataframe: DataFrame with a ``text`` column (strings) and a ``layout``
            column (class labels).

    Returns:
        A ``(model, tokenizer, label_encoder)`` tuple. The model is returned in
        evaluation mode so that callers do not run inference with dropout on.

    Raises:
        ValueError: If ``dataframe`` has no rows.
    """
    if len(dataframe) == 0:
        # Fail early with a clear message instead of an obscure tokenizer error
        # or a division by zero when averaging the loss over zero batches.
        raise ValueError("Cannot train on an empty dataframe: no text rows were provided.")

    texts = dataframe['text'].apply(preprocess_text).tolist()
    labels = dataframe['layout'].tolist()

    label_encoder = LabelEncoder()
    # Cross-entropy requires int64 class indices; make the dtype explicit.
    encoded_labels = torch.tensor(label_encoder.fit_transform(labels), dtype=torch.long)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=len(label_encoder.classes_)
    )
    # With a single class (num_labels == 1) the model would otherwise fall back
    # to a regression loss; pin the task type so class-index labels always work.
    model.config.problem_type = 'single_label_classification'

    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], encoded_labels)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

    # NOTE(review): recent transformers releases deprecate their own AdamW in
    # favour of torch.optim.AdamW - confirm the installed version still exports it.
    optimizer = AdamW(model.parameters(), lr=5e-5)
    epochs = 3

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for input_ids, attention_mask, label in dataloader:
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=label)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

    # Leave the model ready for inference (dropout disabled).
    model.eval()

    model.save_pretrained('./model')
    tokenizer.save_pretrained('./model')

    return model, tokenizer, label_encoder

def classify_sections(model, tokenizer, label_encoder, texts, batch_size=32):
    """Predict a layout label for each text with a fine-tuned classifier.

    Inference runs in evaluation mode (dropout disabled) and in mini-batches so
    that long documents do not have to fit into a single forward pass. The
    model's previous train/eval mode is restored afterwards.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer callable returning a mapping of tensors.
        label_encoder: Fitted encoder used to map class indices back to labels.
        texts: Iterable of strings to classify.
        batch_size: Number of texts per forward pass (must be >= 1).

    Returns:
        The decoded labels, one per input text, as returned by
        ``label_encoder.inverse_transform``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)
    if not texts:
        # Nothing to tokenize; let the encoder produce its own empty result.
        return label_encoder.inverse_transform([])

    # Run on whatever device the model currently lives on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    was_training = model.training
    model.eval()  # without this, dropout makes the predictions stochastic
    predicted_ids = []
    try:
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch = texts[start:start + batch_size]
                inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
                inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
                logits = model(**inputs).logits
                predicted_ids.append(torch.argmax(logits, dim=1).cpu())
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    return label_encoder.inverse_transform(torch.cat(predicted_ids).numpy())

def filter_text(dataframe, labels_to_keep):
    """Return the rows of ``dataframe`` whose ``layout`` value is in ``labels_to_keep``."""
    keep_mask = dataframe['layout'].isin(labels_to_keep)
    return dataframe.loc[keep_mask]

def text_to_speech(text, output_file):
    """Synthesize ``text`` into an audio file using pyttsx3.

    A freshly initialized engine is asked to save the spoken text to
    ``output_file``; ``runAndWait`` then blocks until the queued command
    has been processed.
    """
    speaker = pyttsx3.init()
    speaker.save_to_file(text, output_file)
    speaker.runAndWait()

def main(pdf_file, output_file='output.mp3', header_font_size=12):
    """Turn a PDF into an audio file: extract, label, classify, filter, speak.

    Spans are first labelled heuristically by font size, a classifier is
    trained on those labels and re-applied to the same spans, and the text of
    the retained sections is sent to the text-to-speech engine.

    Args:
        pdf_file: Path of the PDF to convert.
        output_file: Path of the audio file to write.
        header_font_size: Spans with a font size strictly greater than this
            value are initially labelled ``'Header'``; all others ``'Text'``.

    Raises:
        ValueError: If no text span could be extracted from ``pdf_file``.
    """
    dataframe = extract_text_from_pdf(pdf_file)
    if dataframe.empty:
        # Without any span there is nothing to train on or to read aloud;
        # stop here instead of failing deep inside the training code.
        raise ValueError(
            f"No extractable text found in {pdf_file!r}; "
            "the PDF may contain only images and no text layer."
        )

    # Bootstrap labels from the font size; the classifier is trained on these.
    dataframe['layout'] = [
        'Header' if size > header_font_size else 'Text'
        for size in dataframe['char_size']
    ]

    model, tokenizer, label_encoder = train_model(dataframe)

    texts = dataframe['text'].apply(preprocess_text).tolist()
    labels = classify_sections(model, tokenizer, label_encoder, texts)
    dataframe['layout'] = labels

    important_sections = ['Header', 'Text']
    filtered_dataframe = filter_text(dataframe, important_sections)

    text_to_speech(' '.join(filtered_dataframe['text']), output_file)

    print(f"Audio book created: {output_file}")

# Script entry point: run the pipeline on a hard-coded PDF path.
# NOTE(review): the path is relative, so it presumably has to sit in the
# current working directory - confirm before running elsewhere.
if __name__ == "__main__":
    main("2303.16727v2.pdf")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Average Loss: 0.0129
Epoch 2/3, Average Loss: 0.0036
Epoch 3/3, Average Loss: 0.0034
Audio book created: output.mp3
