# **Installing Required Packages**

**YOLO MODEL**

In [None]:
!pip install ultralyticsplus==0.0.28 ultralytics==8.0.43

In [None]:
!pip install torch==1.13.1 sahi==0.10.5 transformers==4.31.0

In [None]:
!pip install pybboxes==0.1.6

In [None]:
!pip install torchvision

In [None]:
!pip install transformers

In [None]:
!pip install keras --upgrade

In [None]:
!sudo apt-get install poppler-utils

**TF-ID Model**

In [None]:
!pip install --upgrade tensorflow

In [None]:
!pip install torch

In [None]:
!pip install einops flash_attn timm

**PDF**

In [None]:
!pip install PyMuPDF pdf2image

# **Loading Model**

**Our Model**

In [None]:
from keras.models import load_model
import pickle

# Locations of the trained classifier and the preprocessing objects saved with it
model_path = './model_last_train.keras'
tokenizer_path = './tokenizer_last_train.pickle'
label_encoder_path = './label_encoder_last_train.pickle'

# Restore the pickled tokenizer and label encoder.
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)
with open(label_encoder_path, 'rb') as handle:
    label_encoder = pickle.load(handle)

# Restore the Keras model itself
model_1 = load_model(model_path)

**YOLO Model**

In [None]:
from ultralyticsplus import YOLO, render_result

# Table detector loaded by its model id
model_2 = YOLO('foduucom/table-detection-and-extraction')

# Inference settings, applied in one go through the model's override table
model_2.overrides.update({
    'conf': 0.25,           # NMS confidence threshold
    'iou': 0.45,            # NMS IoU threshold
    'agnostic_nms': False,  # NMS class-agnostic
    'max_det': 1000,        # maximum number of detections per image
})

**TF-ID Model**

In [None]:
import os
from transformers import AutoProcessor, AutoModelForCausalLM

# TF-ID checkpoint used by this notebook. Other available ids:
#   "yifeihu/TF-ID-large"             (recommended: use large models for better performance)
#   "yifeihu/TF-ID-large-no-caption"  (recommended: use large models for better performance)
#   "yifeihu/TF-ID-base-no-caption"
model_id = "yifeihu/TF-ID-base"

# trust_remote_code=True lets transformers run the model code shipped with the checkpoint
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model_3 = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

def image_model(img):
    """Run the "<OD>" task of the global TF-ID model on one image.

    Args:
        img: Image object exposing `width` and `height`; it is handed to the
            global `processor` together with the "<OD>" prompt.

    Returns:
        Whatever `processor.post_process_generation` produces for the "<OD>" task.
    """
    task = "<OD>"

    encoded = processor(text=task, images=img, return_tensors="pt")

    # Deterministic beam search (no sampling), capped at 1024 new tokens
    output_ids = model_3.generate(
        input_ids=encoded["input_ids"],
        pixel_values=encoded["pixel_values"],
        max_new_tokens=1024,
        do_sample=False,
        num_beams=3
    )

    # Special tokens are kept because the post-processing step parses them
    decoded = processor.batch_decode(output_ids, skip_special_tokens=False)[0]

    return processor.post_process_generation(decoded, task="<OD>", image_size=(img.width, img.height))


# **Results**

In [None]:
#PDF PAGE TO IMAGE
from pdf2image import convert_from_path

def pdf_page_to_image(pdf_path, page_number):
    """Render a single PDF page with pdf2image and return it.

    Args:
        pdf_path: Path of the PDF file.
        page_number: Page to render; used as both `first_page` and `last_page`.

    Returns:
        The first (only) element of the list returned by `convert_from_path`.
    """
    rendered_pages = convert_from_path(
        pdf_path,
        first_page=page_number,
        last_page=page_number,
    )
    return rendered_pages[0]

In [None]:
#Model for figures
def tf_id_detection(image, model, processor):
    """Run the "<OD>" task on `image` and return the "<OD>" entry of the result.

    Args:
        image: Image object exposing `width` and `height`.
        model: NOTE(review): currently unused -- generation always goes through
            the global `model_3`, whatever is passed here.
        processor: Processor used to encode the prompt and image, decode the
            generated ids and post-process the decoded text.

    Returns:
        The value stored under "<OD>" in the post-processed annotation.
    """
    task = "<OD>"
    encoded = processor(text=task, images=image, return_tensors="pt")

    # Deterministic beam search (no sampling), capped at 1024 new tokens
    output_ids = model_3.generate(
        input_ids=encoded["input_ids"],
        pixel_values=encoded["pixel_values"],
        max_new_tokens=1024,
        do_sample=False,
        num_beams=3
    )

    decoded = processor.batch_decode(output_ids, skip_special_tokens=False)[0]
    parsed = processor.post_process_generation(
        decoded, task="<OD>", image_size=(image.width, image.height)
    )
    return parsed["<OD>"]

# **PDF**

In [None]:
import fitz
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
import re, os

# Path of the PDF analysed by this cell; it is passed to extract_text_from_pdf and pdf_page_to_image below.
PATH_TO_PDF = 'pdf_name.pdf'

def remove_non_characters(text):
    """Return `text` with everything except ASCII/Turkish letters and whitespace removed."""
    # Negated character class: any character NOT listed here is dropped
    not_letter_or_space = re.compile(r'[^a-zA-ZçÇğĞıİöÖşŞüÜ\s]')
    return not_letter_or_space.sub('', text)

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page and locate the references page of a PDF.

    Args:
        pdf_path: Path of the PDF, opened with PyMuPDF (`fitz`).

    Returns:
        A tuple `(text_list, references_index)`. `text_list` holds the text of
        each page in document order. `references_index` is the 1-based number
        of the last page whose first five non-blank lines contain one of the
        reference headings, or 0 when no such page is found. The first page
        of the document is never examined.
    """
    search_words = ["kaynaklar", "literatür listesi", "kaynaklar dizini", "kaynakça", "referanslar", "bibliyografya"]

    # The context manager closes the document even when text extraction raises.
    with fitz.open(pdf_path) as pdf_document:
        text_list = [
            pdf_document.load_page(page_num).get_text()
            for page_num in range(len(pdf_document))
        ]

    # Walk backwards so the last matching page wins; the already extracted text
    # is reused instead of loading every page a second time.
    for page_num in range(len(text_list) - 1, 0, -1):
        lines = [line for line in text_list[page_num].split('\n') if line.strip()]
        for line in lines[:5]:
            heading = remove_non_characters(line).strip()
            # str.lower() maps 'İ' to 'i' + U+0307 (combining dot), which would
            # stop upper-case headings such as "KAYNAKLAR DİZİNİ" from matching.
            heading = heading.replace('İ', 'i').lower()
            if heading in search_words:
                return text_list, page_num + 1

    return text_list, 0


# Classify every page of PATH_TO_PDF with the text model, then re-check pages
# labelled 1 in the body of the document with the two detection models.
new_texts, references_index = extract_text_from_pdf(PATH_TO_PDF)
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded_sequences = pad_sequences(new_sequences, maxlen=200, padding='post', truncating='post')

# NOTE(review): applying sigmoid here assumes model_1 outputs logits -- confirm.
predictions = tf.nn.sigmoid(model_1.predict(new_padded_sequences)).numpy()
predicted_labels = (predictions > 0.5).astype(int)
predicted_labels = label_encoder.inverse_transform(predicted_labels.flatten())

# Calculating beginning and end of the page range that gets re-checked
length = len(predicted_labels)
print(length)
start = min(int(length * 0.2), 20)

# If the references page is not found (or lies in the first half), the range ends at 80% of the pages
if references_index == 0 or references_index < int(length * 0.5):
    end = int(length * 0.8)

    # Appendices: set trailing pages to 1 until a page already labelled 1 is reached.
    # The lower bound keeps the index from running past the start of the array.
    i = length - 1
    while i >= 0 and predicted_labels[i] != 1:
        predicted_labels[i] = 1
        i -= 1

else:
    # Found: the references page and everything after it is labelled 1
    end = references_index - 1
    for i in range(end, length):
        predicted_labels[i] = 1


print(f'length:{length}, start: {start}, end: {end + 1}')

# Re-check pages labelled 1 inside [start, end): a detection resets the label to 0
for i in range(start, end):
    if predicted_labels[i] == 1:
        image_to_check = pdf_page_to_image(PATH_TO_PDF, i + 1)
        # Call the YOLO table detector with this page
        if len(model_2.predict(image_to_check)[0].boxes) != 0:
            predicted_labels[i] = 0
        # If nothing was found, fall back to the TF-ID model (model_3, not the YOLO model)
        elif len(tf_id_detection(image_to_check, model_3, processor)['bboxes']) != 0:
            predicted_labels[i] = 0

# 1-based numbers of the pages that are still labelled 1
output_list = [page_index + 1 for page_index, label in enumerate(predicted_labels) if label == 1]

print(f"Toplam sayfa sayısı: {len(predicted_labels)}")
print(output_list)

# **Prediction PDFs**

In [None]:
import fitz
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
import re, os

# Directory whose entries are processed one by one by the loop at the end of this cell.
PATH_TO_PDF = 'folder_path'

def remove_non_characters(text):
    """Return `text` with everything except ASCII/Turkish letters and whitespace removed."""
    # Negated character class: any character NOT listed here is dropped
    not_letter_or_space = re.compile(r'[^a-zA-ZçÇğĞıİöÖşŞüÜ\s]')
    return not_letter_or_space.sub('', text)

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page and locate the references page of a PDF.

    Args:
        pdf_path: Path of the PDF, opened with PyMuPDF (`fitz`).

    Returns:
        A tuple `(text_list, references_index)`. `text_list` holds the text of
        each page in document order. `references_index` is the 1-based number
        of the last page whose first five non-blank lines contain one of the
        reference headings, or 0 when no such page is found. The first page
        of the document is never examined.
    """
    search_words = ["kaynaklar", "literatür listesi", "kaynaklar dizini", "kaynakça", "referanslar", "bibliyografya"]

    # The context manager closes the document even when text extraction raises.
    with fitz.open(pdf_path) as pdf_document:
        text_list = [
            pdf_document.load_page(page_num).get_text()
            for page_num in range(len(pdf_document))
        ]

    # Walk backwards so the last matching page wins; the already extracted text
    # is reused instead of loading every page a second time.
    for page_num in range(len(text_list) - 1, 0, -1):
        lines = [line for line in text_list[page_num].split('\n') if line.strip()]
        for line in lines[:5]:
            heading = remove_non_characters(line).strip()
            # str.lower() maps 'İ' to 'i' + U+0307 (combining dot), which would
            # stop upper-case headings such as "KAYNAKLAR DİZİNİ" from matching.
            heading = heading.replace('İ', 'i').lower()
            if heading in search_words:
                return text_list, page_num + 1

    return text_list, 0

# Batch variant: run the page classification on every PDF found in PATH_TO_PDF.
# Entries are sorted so the output order is the same on every run.
for pdf_file in sorted(os.listdir(PATH_TO_PDF)):
    # os.listdir also returns sub-directories and other files (e.g. .ipynb_checkpoints)
    if not pdf_file.lower().endswith('.pdf'):
        continue

    pdf_path = os.path.join(PATH_TO_PDF, pdf_file)
    # Tell the reader which file the following output belongs to
    print(f'PDF: {pdf_file}')

    new_texts, references_index = extract_text_from_pdf(pdf_path)
    new_sequences = tokenizer.texts_to_sequences(new_texts)
    new_padded_sequences = pad_sequences(new_sequences, maxlen=200, padding='post', truncating='post')

    # NOTE(review): applying sigmoid here assumes model_1 outputs logits -- confirm.
    predictions = tf.nn.sigmoid(model_1.predict(new_padded_sequences)).numpy()
    predicted_labels = (predictions > 0.5).astype(int)
    predicted_labels = label_encoder.inverse_transform(predicted_labels.flatten())

    # Calculating beginning and end of the page range that gets re-checked
    length = len(predicted_labels)
    print(length)
    start = min(int(length * 0.2), 20)

    # If the references page is not found (or lies in the first half), the range ends at 80% of the pages
    if references_index == 0 or references_index < int(length * 0.5):
        end = int(length * 0.8)

        # Appendices: set trailing pages to 1 until a page already labelled 1 is reached.
        # The lower bound keeps the index from running past the start of the array.
        i = length - 1
        while i >= 0 and predicted_labels[i] != 1:
            predicted_labels[i] = 1
            i -= 1

    else:
        # Found: the references page and everything after it is labelled 1
        end = references_index - 1
        for i in range(end, length):
            predicted_labels[i] = 1


    print(f'length:{length}, start: {start}, end: {end + 1}')

    # Re-check pages labelled 1 inside [start, end): a detection resets the label to 0
    for i in range(start, end):
        if predicted_labels[i] == 1:
            image_to_check = pdf_page_to_image(pdf_path, i + 1)
            # Call the YOLO table detector with this page
            if len(model_2.predict(image_to_check)[0].boxes) != 0:
                predicted_labels[i] = 0
            # If nothing was found, fall back to the TF-ID model (model_3, not the YOLO model)
            elif len(tf_id_detection(image_to_check, model_3, processor)['bboxes']) != 0:
                predicted_labels[i] = 0

    # 1-based numbers of the pages that are still labelled 1
    output_list = [page_index + 1 for page_index, label in enumerate(predicted_labels) if label == 1]

    print(f"Toplam sayfa sayısı: {len(predicted_labels)}")
    print(output_list)