In [None]:
import layoutparser as lp
import cv2

# Load the model with the local path
# model = lp.Detectron2LayoutModel(
#     config_path='model/config.yaml',
#     model_path='model/model_final.pth',
#     label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
#     extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8]
# )

import layoutparser as lp
# Build the layout detector from a LayoutParser catalog entry (`lp://...`)
# rather than from the local config/weights shown in the commented-out code.
model = lp.Detectron2LayoutModel(
            config_path ='lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config', # Catalog path of the model config
            label_map   ={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}, # Class index -> label; the code below compares `block.type` against these names
            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8] # Optional override; presumably the minimum detection score kept - confirm against Detectron2 config docs
        )

In [None]:
import fitz  # PyMuPDF
import os
import layoutparser as lp
import cv2
import pytesseract
from pytesseract import Output
from PIL import Image, ImageDraw, ImageFont
import re
import numpy as np
import pickle
import pandas as pd
from fuzzywuzzy import fuzz
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set up logging: emit INFO and above, each record formatted as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Convert PDF pages to images
def render_pdf_pages_to_images(pdf_path, image_folder, zoom=2):
    """Render every page of a PDF to a PNG file.

    Pages are written to ``image_folder`` as ``output_page_<n>.png`` where
    ``n`` starts at 1 and follows the page order of the document.

    Args:
        pdf_path: Path of the PDF opened with PyMuPDF.
        image_folder: Destination directory; created when it does not exist.
        zoom: Scale factor applied to both axes when rasterising a page.

    Returns:
        None. Failures are logged instead of raised (best-effort rendering).
    """
    document = None
    try:
        os.makedirs(image_folder, exist_ok=True)

        # The scaling matrix is the same for every page, so build it once.
        mat = fitz.Matrix(zoom, zoom)

        document = fitz.open(pdf_path)
        for page_number, page in enumerate(document, start=1):
            pix = page.get_pixmap(matrix=mat)
            image_filename = f"{image_folder}/output_page_{page_number}.png"
            pix.save(image_filename)
            logging.info(f"Saved {image_filename}")
    except Exception as e:
        logging.error(f"Error rendering PDF pages to images: {e}")
    finally:
        # Release the file handle even when a page failed to render.
        if document is not None:
            document.close()

# Combine font information extracted by PyMuPDF and Tesseract
def extract_combined_font_info(page, image_path):
    """Merge font attributes from PyMuPDF spans with Tesseract word data.

    Every PyMuPDF span whose text has a fuzzy ratio above 80 with some
    Tesseract word is emitted once, combined with the first such word.

    Args:
        page: PyMuPDF page; read through ``page.get_text("dict")``.
        image_path: Path of the rendered image of the same page, read with
            OpenCV and passed to Tesseract.

    Returns:
        A list of dicts with the keys ``bold``, ``italic``, ``size`` and
        ``text`` (the PyMuPDF span text). An empty list is returned when the
        image cannot be read, when Tesseract output lacks a required key, or
        when any exception occurs (the error is logged).

    NOTE(review): ``size`` is the maximum of the PyMuPDF span size and the
    Tesseract box height; these look like different units (font size vs.
    image pixels) - confirm this is intended.
    """
    # Bit values of the span "flags" field as documented by PyMuPDF:
    # 1 = superscript, 2 = italic, 4 = serifed, 8 = monospaced, 16 = bold.
    italic_flag = 2
    bold_flag = 16

    try:
        # Extract font info using PyMuPDF
        font_info_pymupdf = []
        for block in page.get_text("dict")["blocks"]:
            for line in block.get("lines", []):  # image blocks carry no "lines"
                for span in line["spans"]:
                    flags = span["flags"]
                    font_info_pymupdf.append({
                        "bold": flags & bold_flag != 0,
                        "italic": flags & italic_flag != 0,
                        "size": span["size"],
                        "text": span["text"]
                    })

        # Extract font info using Tesseract
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals an unreadable/missing file by returning None.
            logging.error(f"Error extracting combined font info: could not read image {image_path}")
            return []
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        data = pytesseract.image_to_data(img_rgb, output_type=Output.DICT)

        # Check if the necessary keys are in the Tesseract output
        for key in ('level', 'text', 'height'):
            if key not in data:
                logging.error(f"Key '{key}' not found in Tesseract output")
                return []

        has_font_names = 'font' in data
        font_info_tesseract = []
        for i, (raw_text, height) in enumerate(zip(data['text'], data['height'])):
            text = raw_text.strip()
            if not text:
                continue
            font_name = data['font'][i] if has_font_names else ''
            font_info_tesseract.append({
                "bold": 'Bold' in font_name,
                "italic": 'Italic' in font_name,
                "size": int(height),
                "text": text
            })

        # Combine results: first sufficiently similar Tesseract word wins.
        combined_font_info = []
        for pymupdf_info in font_info_pymupdf:
            for tess_info in font_info_tesseract:
                if fuzz.ratio(pymupdf_info['text'], tess_info['text']) > 80:
                    combined_font_info.append({
                        "bold": pymupdf_info["bold"] or tess_info["bold"],
                        "italic": pymupdf_info["italic"] or tess_info["italic"],
                        "size": max(pymupdf_info["size"], tess_info["size"]),
                        "text": pymupdf_info["text"]
                    })
                    break
        return combined_font_info
    except Exception as e:
        logging.error(f"Error extracting combined font info: {e}")
        return []

# Check if a block is a potential caption
def is_potential_caption(block, figure_blocks, distance=20, word_limit=50):
    """Decide whether a text block should be treated as a caption.

    The checks run in this order:

    1. Text containing a copyright sign is always a caption.
    2. A 'Title' block that fuzzily matches a known back-matter heading is
       never a caption.
    3. Text starting with a figure/table/box label or one of the listed
       front/back-matter words is a caption.
    4. A block with fewer than ``word_limit`` words that has an edge within
       ``distance`` of the opposite edge of any figure block is a caption.

    Args:
        block: Object exposing ``text``, ``type`` and ``coordinates``
            (x0, y0, x1, y1).
        figure_blocks: Iterable of objects exposing ``coordinates``.
        distance: Maximum edge gap for the proximity rule.
        word_limit: Exclusive upper bound on the word count for the
            proximity rule.

    Returns:
        True when the block is a potential caption, otherwise False.
    """
    content = block.text

    if "©" in content:
        return True

    if block.type == 'Title':
        back_matter_headings = ("funding", "conflict of interest", "supplementary materials", "declaration",
                                "acknowledgments", "data availability", "author contributions", "publisher's note", "appendix")
        lowered = content.lower()
        if any(fuzz.partial_ratio(lowered, heading) > 50 for heading in back_matter_headings):
            return False

    caption_prefix = r'^(Fig\.|Figure|Table|Box)\s*\d+|^(supplementary\w*|appendix\w*|fund\w*|conflict of interest\w*|data availability\w*|publi\w*|abbreviation\w*|author\w*|the author|copyright|correspond\w*|assess\w*|email\w*|tel\w*|open access|keywords|key words|address\w*|receive\w*|review\w*)'
    if re.match(caption_prefix, content, re.IGNORECASE) is not None:
        return True

    for figure in figure_blocks:
        fig_left, fig_top, fig_right, fig_bottom = figure.coordinates
        blk_left, blk_top, blk_right, blk_bottom = block.coordinates

        # Gaps between facing edges: above, below, left of and right of the figure.
        edge_gaps = (
            fig_top - blk_bottom,
            fig_bottom - blk_top,
            fig_left - blk_right,
            fig_right - blk_left,
        )
        if not any(abs(gap) <= distance for gap in edge_gaps):
            continue
        if len(block.text.strip().split()) < word_limit:
            return True

    return False

# Sort blocks by coordinates with a tolerance of 50
TOLERANCE = 50


def sort_blocks_by_coordinates(blocks, tolerance=TOLERANCE):
    """Order blocks column by column, top to bottom inside each column.

    Blocks are first ordered by their left x-coordinate. Walking that order,
    a block joins the current column while its x lies within ``tolerance`` of
    the x of the block that opened the column; otherwise it opens a new
    column. Each column is then ordered by the top y-coordinate.

    Args:
        blocks: Sequence of objects exposing ``coordinates`` whose first two
            items are the x and y used for ordering.
        tolerance: Maximum x difference to the column's first block.

    Returns:
        A new list with the blocks in reading order; ``[]`` for empty input.
    """
    if not blocks:
        return []

    def left_edge(item):
        return item.coordinates[0]

    def top_edge(item):
        return item.coordinates[1]

    by_x = sorted(blocks, key=left_edge)

    # The left-most block always opens the first column.
    columns = [[by_x[0]]]
    column_x = left_edge(by_x[0])

    for candidate in by_x[1:]:
        candidate_x = left_edge(candidate)
        if abs(candidate_x - column_x) <= tolerance:
            columns[-1].append(candidate)
        else:
            columns.append([candidate])
            column_x = candidate_x

    ordered = []
    for column in columns:
        ordered.extend(sorted(column, key=top_edge))
    return ordered

def draw_box(image, layout, box_width=3, show_element_id=True):
    """Draw a red outline, and optionally an ID label, for each layout element.

    Args:
        image: Array accepted by ``PIL.Image.fromarray``; the result is
            converted with ``COLOR_RGB2BGR``, so RGB channel order is
            presumably expected - confirm against callers.
        layout: Iterable of elements exposing ``coordinates`` and, optionally,
            an ``id`` attribute.
        box_width: Outline width passed to ``ImageDraw.rectangle``.
        show_element_id: When true, elements that have an ``id`` attribute get
            a white-on-red "ID: <id>" label just above their top-left corner.

    Returns:
        The annotated image as an array converted from RGB to BGR.
    """
    canvas = Image.fromarray(image)
    pen = ImageDraw.Draw(canvas)

    for element in layout:
        box = element.coordinates
        pen.rectangle(box, outline="red", width=box_width)

        if not (show_element_id and hasattr(element, 'id')):
            continue

        label_font = ImageFont.load_default()
        label = f"ID: {element.id}"
        left, top, right, bottom = label_font.getbbox(label)
        label_width = right - left
        label_height = bottom - top

        # The label sits directly above the box, aligned with its left edge.
        label_top = box[1] - label_height
        pen.rectangle([box[0], label_top, box[0] + label_width, box[1]], fill="red")
        pen.text((box[0], label_top), label, fill="white", font=label_font)

    return cv2.cvtColor(np.array(canvas), cv2.COLOR_RGB2BGR)

# Modified function to extract abstract from the first pages
def extract_abstract_from_first_pages(images, model, output_folder):
    """Find the abstract text and the paper title on the given page images.

    Each image is run through ``model.detect``; every detected block is
    OCR'ed and given an ``id`` equal to its detection index. The abstract is
    then searched for in three steps:

    1. Blocks whose id follows a 'Title' block reading "abstract", up to an
       "introduction" title or the first empty block.
    2. If nothing has been collected so far: blocks whose top edge lies
       within 20 px of a block that fuzzily matches a section keyword.
    3. If still empty after all pages: the first of the first five 'Text'
       blocks (over all pages, in page order) with more than 50 words.

    The blocks used in steps 1 and 2 are written to
    ``<output_folder>/abstract_info.csv``.

    Args:
        images: Sequence of page images accepted by ``model.detect``.
        model: Layout model exposing ``detect(image)``.
        output_folder: Existing directory for ``abstract_info.csv``.

    Returns:
        ``(abstract_text, first_title)``; ``first_title`` is the first
        'Title' block that is not "abstract" (newlines replaced by spaces) or
        None. On any exception the error is logged and ``("", None)`` is
        returned.
    """
    try:
        ocr_agent = lp.TesseractAgent(languages='eng')
        abstract_text = ""
        first_title = None
        abstract_block_info = []
        # 'Text' block contents of every processed page, kept for step 3 so
        # the fallback is not limited to whichever page was processed last.
        text_block_contents = []

        keywords = ["aim", "aims", "background", "purpose", "purposes", "introduction", "objective", "objectives", 
                    "method", "methods", "material","materials", "materials and methods", "introduction",
                    "result","results", "conclusion", "conclusions", "discussion", "areas covered", "expert opinion", "background and objectives"]

        for page_number, image in enumerate(images):
            layout = model.detect(image)
            if layout is None:
                raise ValueError("Model detection returned None")

            for idx, block in enumerate(layout):
                block.id = idx
                segment_image = block.pad(left=5, right=5, top=5, bottom=5).crop_image(image)
                text = ocr_agent.detect(segment_image).strip()
                block.set(text=text, inplace=True)

            text_block_contents.extend(block.text.strip() for block in layout if block.type == 'Text')

            abstract_block = None

            # Step 1: Look for an "Abstract" title block
            for block in layout:
                if block.type == 'Title' and block.text.strip().lower() == "abstract":
                    abstract_block = block
                    break

            if abstract_block is not None and abstract_block.id is not None:
                # Collect the blocks detected after the "Abstract" title block
                for block in layout:
                    if block.id is not None and block.id > abstract_block.id:
                        if block.type == 'Title' and block.text.strip().lower() == "introduction":
                            break  # Stop at the "Introduction" title block
                        abstract_text += block.text.strip() + " "
                        abstract_block_info.append({
                            "Page Name": f"Page {page_number + 1}",
                            "Block ID": block.id,
                            "Text": block.text.strip()
                        })
                        if len(block.text.strip()) == 0:
                            break

            if not abstract_text.strip():
                # Step 2: Keyword matching
                keyword_blocks = []
                for block in layout:
                    text = block.text.strip().lower()
                    if any(fuzz.partial_ratio(text, keyword) > 80 for keyword in keywords):
                        keyword_blocks.append(block)

                # Sort keyword blocks by y-coordinate and collect nearby blocks
                # within 20 pixels. A block close to several keyword blocks is
                # taken once only, so its text is not duplicated.
                collected_ids = set()
                for keyword_block in sorted(keyword_blocks, key=lambda b: b.coordinates[1]):
                    for block in layout:
                        if block.id in collected_ids:
                            continue
                        if abs(block.coordinates[1] - keyword_block.coordinates[1]) <= 20:
                            collected_ids.add(block.id)
                            abstract_text += block.text.strip() + " "
                            abstract_block_info.append({
                                "Page Name": f"Page {page_number + 1}",
                                "Block ID": block.id,
                                "Text": block.text.strip()
                            })

            if not first_title:
                # Look for the first title that is not "Abstract"
                for block in layout:
                    if block.type == 'Title' and block.text.strip().lower() != "abstract":
                        first_title = block.text.strip().replace('\n', ' ')
                        break

        # Remove the word "abstract" from the abstract text: whole word only,
        # in any capitalisation (headings usually read "Abstract"/"ABSTRACT").
        abstract_text = re.sub(r'\babstract\b', '', abstract_text, flags=re.IGNORECASE).strip()

        if not abstract_text:
            # Step 3: Heuristic method
            for text in text_block_contents[:5]:  # Assuming the abstract appears in the first five blocks
                if len(text.split()) > 50:
                    abstract_text = text
                    break

        # Save abstract block information to a CSV file
        df_abstract_info = pd.DataFrame(abstract_block_info)
        df_abstract_info.to_csv(os.path.join(output_folder, 'abstract_info.csv'), index=False)

        return abstract_text.strip(), first_title
    except Exception as e:
        logging.error(f"Error extracting abstract: {e}")
        return "", None

# Generate a table of title information
def generate_title_info_table(sorted_blocks, font_info_dict, output_folder):
    """Build a table describing every 'Title' block and its font attributes.

    For each title the first font-info entry of the same page whose text is
    contained in the title text supplies ``Bold``, ``Italic`` and ``Size``;
    without a match (or without font info for that page) the defaults
    False / False / 0 are used.

    Args:
        sorted_blocks: Iterable of blocks exposing ``type``, ``text``,
            ``page_number`` and ``id``.
        font_info_dict: Mapping of page number to a list of dicts with the
            keys ``text``, ``bold``, ``italic`` and ``size``. Pages may be
            missing.
        output_folder: Unused; kept so existing callers keep working.

    Returns:
        A DataFrame with the columns Text, Page Number, Block ID, Bold,
        Italic and Size, sorted by page number and block id. The columns are
        present even when there are no title blocks.
    """
    columns = ["Text", "Page Number", "Block ID", "Bold", "Italic", "Size"]

    title_data = []
    for block in sorted_blocks:
        if block.type != 'Title':
            continue

        page_number = block.page_number
        text = block.text.strip()

        # Pages whose font extraction failed have no entry at all.
        page_font_info = font_info_dict.get(page_number, [])
        # A blank span text is a substring of everything, so it is skipped.
        font_info = next(
            (info for info in page_font_info if info['text'].strip() and info['text'] in text),
            None,
        ) or {}

        title_data.append({
            "Text": text,
            "Page Number": page_number,
            "Block ID": block.id,
            "Bold": font_info.get("bold", False),
            "Italic": font_info.get("italic", False),
            "Size": font_info.get("size", 0)
        })

    # Explicit columns keep sort_values working when no title was found.
    df = pd.DataFrame(title_data, columns=columns)
    df = df.sort_values(by=["Page Number", "Block ID"]).reset_index(drop=True)
    return df

def filter_useless_title_blocks(df, title_keywords, output_folder):
    """Drop title rows that fuzzily match an unwanted keyword and save the rest.

    A row is dropped when the partial ratio between its stripped, lower-cased
    'Text' and any entry of ``title_keywords`` exceeds 90. The remaining rows
    are written to ``<output_folder>/useful_title_block.csv``.

    Args:
        df: DataFrame with a 'Text' column.
        title_keywords: Keywords compared against each title.
        output_folder: Directory receiving the CSV file.

    Returns:
        The filtered DataFrame.
    """
    def matches_unwanted_keyword(raw_text):
        return any(
            fuzz.partial_ratio(raw_text.strip().lower(), keyword) > 90
            for keyword in title_keywords
        )

    unwanted = df['Text'].apply(matches_unwanted_keyword)
    df_filtered = df[~unwanted]

    csv_path = os.path.join(output_folder, 'useful_title_block.csv')
    df_filtered.to_csv(csv_path, index=False)
    return df_filtered

# Find reference title
def find_reference_title(df):
    """Return the row of the first well-known section heading in ``df``.

    The candidate headings are tried in a fixed priority order
    (introduction, background, conclusion, conclusions, references); for each
    candidate the rows are scanned top to bottom and the first row whose
    lower-cased 'Text' contains the candidate wins.

    Args:
        df: DataFrame with a 'Text' column of strings.

    Returns:
        The matching row as produced by ``DataFrame.iterrows`` or None when
        no row matches. A match is also logged at INFO level.
    """
    candidates = ("introduction", "background", "conclusion", "conclusions", "references")

    match = next(
        (
            record
            for needle in candidates
            for _, record in df.iterrows()
            if needle in record['Text'].lower()
        ),
        None,
    )
    if match is None:
        return None

    logging.info(f"Reference title found: {match['Text']}")
    return match

# Update title info table and mark similar titles
def mark_similar_titles(df, reference_title, paper_title):
    """Add the 'Similar' and 'New ID' columns to the title table in place.

    'Similar' holds, per row:

    * ``"title"`` when the row text equals ``paper_title``;
    * ``"Ref"`` when the row equals ``reference_title``;
    * True/False depending on whether bold, italic and size (within 1) agree
      with ``reference_title``;
    * False for every non-title row when ``reference_title`` is None.

    'New ID' is the row index plus one.

    Args:
        df: Title table with the columns Text, Bold, Italic and Size.
        reference_title: Row (Series) used as formatting reference, or None.
        paper_title: Text of the paper title.

    Returns:
        The same DataFrame. Both columns are present even when ``df`` is
        empty or an error occurred (the error is logged and the missing
        column is filled with False / sequential ids).
    """
    def is_similar_row(row1, row2):
        return (
            row1['Bold'] == row2['Bold'] and
            row1['Italic'] == row2['Italic'] and
            abs(row1['Size'] - row2['Size']) <= 1
        )

    def classify(row):
        if row['Text'] == paper_title:
            return "title"
        if reference_title is None:
            return False
        if row.equals(reference_title):
            return "Ref"
        return bool(is_similar_row(row, reference_title))

    try:
        # A list keeps the assignment well defined for an empty table, where
        # DataFrame.apply(axis=1) would not yield a Series.
        df['Similar'] = [classify(row) for _, row in df.iterrows()]

        for index, row in df.iterrows():
            if row['Similar']:  # "title", "Ref" or True
                logging.info(f"Row {index}: {row['Text']} | Bold: {row['Bold']} | Italic: {row['Italic']} | Size: {row['Size']} | Similar: {row['Similar']}")
                logging.info(f"Section heading found: {row['Text']}")

        df['New ID'] = df.index + 1
        return df
    except Exception as e:
        logging.error(f"Error marking similar titles: {e}")
        # Later pipeline steps read both columns, so make sure they exist.
        if 'Similar' not in df.columns:
            df['Similar'] = False
        if 'New ID' not in df.columns:
            df['New ID'] = range(1, len(df) + 1)
        return df

# Extract text from images
def extract_text_from_images(image_folder, output_folder, model, pdf_path):
    """Run layout detection and OCR on every rendered page of one PDF.

    Steps:

    1. Extract abstract and paper title from ``output_page_1.png`` and
       ``output_page_2.png`` (whichever of them can be read).
    2. For every ``output_page_<n>.png`` in ``image_folder``: collect font
       info, detect and OCR the layout, keep Text/Title/List blocks that are
       not inside a Figure/Table block, order them, split them into captions
       and body blocks, and save an annotated image to
       ``<output_folder>/images_with_boxes``. Pages run on a thread pool.
    3. Write ``text/captions.txt`` and, when an abstract was found,
       ``text/Abstract.txt`` and ``pickle/Abstract.pkl``.

    Args:
        image_folder: Directory holding the rendered page images.
        output_folder: Root directory for all outputs of this PDF.
        model: Layout model exposing ``detect(image)``.
        pdf_path: Path of the source PDF (opened for font information).

    Returns:
        ``(all_text_blocks, font_info_dict, abstract_text, first_title)``
        where the blocks are sorted by (page number, block id) and
        ``font_info_dict`` maps page numbers to font-info lists. On an
        unexpected error the error is logged and ``([], {}, "", None)`` is
        returned. A page that fails is logged and contributes nothing.
    """
    import threading  # stdlib; imported here so the block stays self-contained

    document = None
    try:
        ocr_agent = lp.TesseractAgent(languages='eng')
        all_text_blocks = []
        all_captions = []
        font_info_dict = {}

        # Extract abstract and title. A one-page PDF has no second image and
        # cv2.imread returns None for it, which must not abort the whole run.
        leading_images = []
        for leading_name in ('output_page_1.png', 'output_page_2.png'):
            leading_image = cv2.imread(os.path.join(image_folder, leading_name))
            if leading_image is not None:
                leading_images.append(leading_image[..., ::-1])  # reverse channel order
        if leading_images:
            abstract_text, first_title = extract_abstract_from_first_pages(leading_images, model, output_folder)
        else:
            logging.warning(f"No readable leading page images in {image_folder}")
            abstract_text, first_title = "", None

        # Only files following the renderer's naming scheme are pages; stray
        # files (e.g. without any digits) are ignored instead of crashing.
        page_name_pattern = re.compile(r'^output_page_(\d+)\.png$')
        page_images = []
        for image_name in os.listdir(image_folder):
            name_match = page_name_pattern.match(image_name)
            if name_match:
                page_images.append((int(name_match.group(1)), image_name))
        page_images.sort()

        boxes_folder = os.path.join(output_folder, 'images_with_boxes')
        os.makedirs(boxes_folder, exist_ok=True)

        # Load PDF once
        document = fitz.open(pdf_path)
        # PyMuPDF documents its objects as not safe for concurrent use from
        # several threads, so every access to the document is serialised.
        pdf_lock = threading.Lock()

        def process_page(page_number, image_name):
            """Process one page; returns (page_number, text blocks, captions, font info)."""
            try:
                image_path = os.path.join(image_folder, image_name)
                image = cv2.imread(image_path)
                if image is None:
                    raise ValueError(f"Could not read image {image_path}")
                image = image[..., ::-1]

                with pdf_lock:
                    page = document.load_page(page_number - 1)
                    font_info = extract_combined_font_info(page, image_path)
                if not font_info:
                    logging.warning(f"Font info extraction failed for page {page_number}")

                layout = model.detect(image)
                if layout is None:
                    raise ValueError("Model detection returned None")

                for block in layout:
                    segment_image = block.pad(left=5, right=5, top=5, bottom=5).crop_image(image)
                    text = ocr_agent.detect(segment_image)
                    block.set(text=text, inplace=True)
                    block.page_number = page_number

                text_blocks = lp.Layout([b for b in layout if b.type in ['Text', 'Title', 'List']])
                figure_blocks = lp.Layout([b for b in layout if b.type in ['Figure', 'Table']])
                text_blocks = lp.Layout([b for b in text_blocks if not any(b.is_in(b_fig) for b_fig in figure_blocks)])

                sorted_blocks = sort_blocks_by_coordinates(text_blocks)
                for idx, block in enumerate(sorted_blocks):
                    block.id = idx  # Directly set id attribute

                captions = []
                filtered_text_blocks = []
                for block in sorted_blocks:
                    if is_potential_caption(block, figure_blocks):
                        captions.append(block)
                    else:
                        filtered_text_blocks.append(block)

                image_with_boxes = draw_box(image, sorted_blocks)
                output_image_filename = os.path.join(boxes_folder, f"{os.path.splitext(image_name)[0]}_with_boxes.png")
                cv2.imwrite(output_image_filename, image_with_boxes)

                return page_number, filtered_text_blocks, captions, font_info

            except Exception as e:
                logging.error(f"Error processing page {page_number}: {e}")
                return page_number, [], [], []

        # Use multithreading to process each page in parallel
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(process_page, page_number, image_name) for page_number, image_name in page_images]
            for future in as_completed(futures):
                # The page number is returned explicitly: a page may yield
                # font info but neither text blocks nor captions.
                page_number, filtered_text_blocks, captions, font_info = future.result()
                all_text_blocks.extend(filtered_text_blocks)
                all_captions.extend(captions)
                if font_info:
                    font_info_dict[page_number] = font_info

        os.makedirs(os.path.join(output_folder, 'text'), exist_ok=True)
        os.makedirs(os.path.join(output_folder, 'pickle'), exist_ok=True)

        # Futures complete in arbitrary order; restore document order.
        all_captions.sort(key=lambda b: (b.page_number, b.id))
        all_text_blocks.sort(key=lambda b: (b.page_number, b.id))

        with open(os.path.join(output_folder, 'text', 'captions.txt'), 'w', encoding='utf-8') as caption_file:
            for caption in all_captions:
                caption_file.write(caption.text + "\n")

        if abstract_text:
            with open(os.path.join(output_folder, 'text', 'Abstract.txt'), 'w', encoding='utf-8') as abstract_file:
                abstract_file.write(abstract_text)
            with open(os.path.join(output_folder, 'pickle', 'Abstract.pkl'), 'wb') as abstract_pkl_file:
                pickle.dump(abstract_text, abstract_pkl_file)

        return all_text_blocks, font_info_dict, abstract_text, first_title
    except Exception as e:
        logging.error(f"Error extracting text from images: {e}")
        return [], {}, "", None
    finally:
        # The executor has finished by now, so no worker still uses the PDF.
        if document is not None:
            document.close()

# Extract and save sections
def extract_and_save_sections(sorted_blocks, df, output_folder):
    """Group body text under section headings and save one file per section.

    A 'Title' block opens a new section when its text starts with a plain
    section number (e.g. "2 Methods") or when it equals, ignoring case, a
    title marked True or "Ref" in ``df['Similar']``. A title starting with a
    sub-section number (e.g. "2.1 ...") is appended to the current section
    instead. Every non-title block is appended to the current section.

    Each section is written to ``<output_folder>/pickle/<name>.pkl`` and
    ``<output_folder>/text/<name>.txt`` where ``<name>`` is the heading with
    spaces and characters that are unsafe in file names replaced by "_".

    Args:
        sorted_blocks: Blocks in reading order exposing ``type`` and ``text``.
        df: Title table; headings are read from its 'Text' and 'Similar'
            columns. A table without 'Similar' contributes no headings.
        output_folder: Root output directory of the current PDF.

    Returns:
        A list of ``(section heading, pickle file name)`` tuples in the order
        in which the sections were first opened.
    """
    # Build the heading lookup once instead of once per block.
    if 'Similar' in df.columns:
        marked = df.loc[df['Similar'].isin([True, 'Ref']), 'Text']
        heading_texts = {title.lower() for title in marked if isinstance(title, str)}
    else:
        heading_texts = set()

    sections = {}
    current_section = None

    for block in sorted_blocks:
        if block.type == 'Title':
            text = block.text.strip()
            if re.match(r'^\d+(\.\d+)?', text):
                if re.match(r'^\d+\.\d+', text):
                    # Sub-section heading: keep it inside the running section.
                    if current_section:
                        sections[current_section].append(text)
                else:
                    current_section = text
                    sections[current_section] = []
            elif text.lower() in heading_texts:
                current_section = text
                sections[current_section] = []

        if current_section and block.type != 'Title':
            sections[current_section].append(block.text.strip())

    def safe_name(name):
        # OCR'ed headings may contain newlines or characters such as ':' and
        # '?' that are invalid in file names on common file systems.
        return re.sub(r'[\\/:*?"<>|\x00-\x1f ]', '_', name)

    pickle_folder = os.path.join(output_folder, 'pickle')
    text_folder = os.path.join(output_folder, 'text')
    os.makedirs(pickle_folder, exist_ok=True)
    os.makedirs(text_folder, exist_ok=True)

    section_order = []
    for section, texts in sections.items():
        section_text = " ".join(texts).replace('\n', ' ')
        stem = safe_name(section)
        filename = f"{stem}.pkl"
        section_order.append((section, filename))
        with open(os.path.join(pickle_folder, filename), 'wb') as f:
            pickle.dump(section_text, f)
        with open(os.path.join(text_folder, f"{stem}.txt"), 'w', encoding='utf-8') as section_file:
            section_file.write(section_text)
        logging.info(f"Saved {section} to {filename}")

    return section_order

# Generate CSV file
def generate_csv(output_root_folder, df_data):
    """Write ``all_papers.csv`` with one row per processed PDF.

    Each row holds the PDF name, the article title, the abstract file name
    and the section pickle file names in the columns ``section 1``,
    ``section 2``, ... The file ``Abstract.pkl`` is never listed as a
    section, and the numbering stays contiguous when it is skipped.

    Args:
        output_root_folder: Directory receiving ``all_papers.csv``; created
            when missing.
        df_data: Iterable of dicts with the keys ``pdf_name``,
            ``paper_title``, ``abstract`` and ``section_order`` (a list of
            ``(section, filename)`` tuples).
    """
    rows = []
    for data in df_data:
        row = {'PDF name': data['pdf_name'], 'article': data['paper_title'], 'abstract': data['abstract']}
        # Filter first, number afterwards, so a skipped entry leaves no gap.
        section_files = [filename for _, filename in data['section_order'] if filename != 'Abstract.pkl']
        for position, filename in enumerate(section_files, start=1):
            row[f'section {position}'] = filename
        rows.append(row)

    if output_root_folder:
        os.makedirs(output_root_folder, exist_ok=True)

    df = pd.DataFrame(rows)
    df.to_csv(os.path.join(output_root_folder, 'all_papers.csv'), index=False)

# Process all PDFs in a folder
def process_pdfs_in_folder(pdf_folder, output_root_folder, model):
    """Run the whole extraction pipeline for every PDF in a folder.

    For each ``*.pdf`` file (extension matched case-insensitively, files
    processed in sorted order) a sub-folder named after the PDF is created
    below ``output_root_folder``; pages are rendered, text and titles are
    extracted, sections are saved and ``title_blocks_info.csv`` is written.
    Finally ``all_papers.csv`` summarises all successfully processed PDFs.

    A PDF that fails is logged and skipped; the remaining PDFs and the
    summary CSV are still produced.

    Args:
        pdf_folder: Directory scanned (non-recursively) for PDF files.
        output_root_folder: Root directory for all outputs.
        model: Layout model passed on to the extraction functions.
    """
    try:
        pdf_files = sorted(name for name in os.listdir(pdf_folder) if name.lower().endswith('.pdf'))
    except OSError as e:
        logging.error(f"Error processing PDFs in folder: {e}")
        return

    df_data = []
    for pdf_file in pdf_files:
        try:
            pdf_path = os.path.join(pdf_folder, pdf_file)
            pdf_name = os.path.splitext(pdf_file)[0]
            pdf_output_folder = os.path.join(output_root_folder, pdf_name)

            for folder in ['images', 'images_with_boxes', 'text', 'pickle']:
                os.makedirs(os.path.join(pdf_output_folder, folder), exist_ok=True)

            image_folder = os.path.join(pdf_output_folder, 'images')
            render_pdf_pages_to_images(pdf_path, image_folder)

            all_text_blocks, font_info_dict, abstract_text, first_title = extract_text_from_images(
                image_folder, pdf_output_folder, model, pdf_path)

            df = generate_title_info_table(all_text_blocks, font_info_dict, pdf_output_folder)

            reference_title = find_reference_title(df)

            df = mark_similar_titles(df, reference_title, first_title)
            df.to_csv(os.path.join(pdf_output_folder, 'title_blocks_info.csv'), index=False)

            section_order = extract_and_save_sections(all_text_blocks, df, pdf_output_folder)

            # Sort section_order by New ID in df; for a repeated title the
            # first occurrence counts, unknown sections go last.
            new_id_by_text = {}
            if 'New ID' in df.columns:
                for title_text, new_id in zip(df['Text'], df['New ID']):
                    new_id_by_text.setdefault(title_text, new_id)
            sorted_section_order = sorted(
                section_order,
                key=lambda item: new_id_by_text.get(item[0], float('inf')),
            )

            df_data.append({
                'pdf_name': pdf_name,
                'paper_title': first_title,
                # Abstract.pkl only exists when an abstract was extracted.
                'abstract': 'Abstract.pkl' if abstract_text else '',
                'section_order': sorted_section_order
            })
        except Exception as e:
            # Keep going: one broken PDF must not discard the whole batch.
            logging.error(f"Error processing {pdf_file}: {e}")

    try:
        generate_csv(output_root_folder, df_data)
    except Exception as e:
        logging.error(f"Error processing PDFs in folder: {e}")

# Usage example
# Folder that is scanned for files ending in '.pdf'.
pdf_folder = 'Diabetes PDFs'
# Root folder for the results: one sub-folder per PDF plus all_papers.csv.
output_root_folder = 'Diabetes PDFs Outputs'

# Runs the full pipeline at module level using the `model` created earlier in this file.
process_pdfs_in_folder(pdf_folder, output_root_folder, model)
