In [13]:
import csv
import json
import math
import re
import textwrap

import fitz
import layoutparser as lp
import pandas as pd
from pdfminer.high_level import extract_pages
from pdfminer.layout import *

# Path of the PDF that the extraction cells below operate on.
pdf_path = "civil-code.pdf"

In [14]:
def extract_sentences_with_starting_words(pdf_path, starting_words):
    """Collect the passages of a PDF that begin with one of ``starting_words``.

    The text of every page is scanned line by line. A line opens a new passage
    when the raw line (leading whitespace included) starts with one of the
    starting words, compared case-insensitively, and its first character is
    uppercase. When several words match, the earliest one in
    ``starting_words`` wins. Every following non-blank line is appended to the
    open passage, separated by a single space, until a blank line closes it.
    An open passage carries over from one page to the next.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.
        starting_words: Sequence of words that may open a passage.

    Returns:
        A list with one inner list per starting word, in the same order as
        ``starting_words``; each inner list holds the passages opened by that
        word, in document order.
    """
    word_sentences = [[] for _ in range(len(starting_words))]
    # Lower-case the words once instead of once per line and per word.
    lowered_words = [word.lower() for word in starting_words]
    # Index of the starting word whose passage is currently open, if any.
    current_word = None

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            for line in page.get_text().splitlines():
                stripped = line.strip()
                if not stripped:
                    # A blank line always closes the open passage.
                    current_word = None
                    continue

                # Only a line whose very first character is uppercase may
                # open a new passage.
                if line[0].isupper():
                    lowered_line = line.lower()
                    matched = next(
                        (
                            index
                            for index, word in enumerate(lowered_words)
                            if lowered_line.startswith(word)
                        ),
                        None,
                    )
                    if matched is not None:
                        current_word = matched
                        word_sentences[matched].append(stripped)
                        continue

                # Not a heading: extend the open passage, or drop the line
                # when no passage is open.
                if current_word is not None:
                    word_sentences[current_word][-1] += ' ' + stripped
    finally:
        # Release the document even when reading a page fails.
        doc.close()
    return word_sentences

In [85]:
# Heading words that open a passage in the document.
starting_words = ["Titre", "Title", "Chapitre", "Chapter", "Section", "Sous-section", "Article"]
# One list of passages per heading word, in the same order as starting_words.
word_sentences = extract_sentences_with_starting_words(pdf_path, starting_words)
# Flatten into (heading word, passage) pairs, grouped by heading word.
l1 = [
    (word, sentence)
    for word, sentences in zip(starting_words, word_sentences)
    for sentence in sentences
]

In [104]:
def extract_text_by_fontsize(pdf_url):
    """Record the lines of a PDF at which the leading font or size changes.

    Only the first object of each line inside an ``LTTextBoxHorizontal`` is
    inspected. When it is an ``LTChar`` whose font name or rounded-up size
    differs from those of the last recorded line, the line is recorded.

    Args:
        pdf_url: PDF location handed to ``extract_pages``.

    Returns:
        A list of ``[text, font name, font size, x]`` lists. ``text`` is the
        line text without its last character (presumably the trailing
        newline), the size is rounded up to an integer and ``x`` is the first
        bbox coordinate of the leading character.
    """
    font_attr = []
    # (font name, size) of the most recently recorded line.
    last_style = (None, None)

    for page_layout in extract_pages(pdf_url):
        text_boxes = (
            element
            for element in page_layout
            if isinstance(element, LTTextBoxHorizontal)
        )
        for box in text_boxes:
            for line in box:
                # Only the first object of the line decides its style.
                lead = next(iter(line), None)
                if not isinstance(lead, LTChar):
                    continue

                style = (lead.fontname, math.ceil(lead.size))
                if style == last_style:
                    continue

                font_attr.append(
                    [line.get_text()[:-1], style[0], style[1], lead.bbox[0]]
                )
                last_style = style
    return font_attr

# Run the font-based extraction on the configured PDF.
pdf_text = extract_text_by_fontsize(pdf_path)


In [128]:
def classify_text(font_name, font_size, x, *, title_size=14, title_x=100,
                  section_size=13, paragraph_size=8):
    """Classify a line from its font name, font size and horizontal position.

    The rules are checked in order and the first match wins:

    1. "Title": bold font and (size above ``title_size`` or ``x`` above
       ``title_x``).
    2. "Section": bold font and size above ``section_size``.
    3. "Paragraph": size above ``paragraph_size``.
    4. "Footer": everything else.

    A font counts as bold when its name contains the substring "Bold". All
    comparisons are strict (``>``).

    Args:
        font_name: Name of the font of the line.
        font_size: Size of the font of the line.
        x: Horizontal position of the line.
        title_size: Size threshold for "Title".
        title_x: Horizontal-position threshold for "Title".
        section_size: Size threshold for "Section".
        paragraph_size: Size threshold for "Paragraph".

    Returns:
        One of "Title", "Section", "Paragraph" or "Footer".
    """
    is_bold = "Bold" in font_name
    if is_bold and (font_size > title_size or x > title_x):
        return "Title"
    if is_bold and font_size > section_size:
        return "Section"
    if font_size > paragraph_size:
        return "Paragraph"
    return "Footer"

def process_pdf(pdf_path, line_attrs=None):
    """Classify the font-change lines of a PDF.

    Args:
        pdf_path: Path of the PDF. It is passed to ``extract_text_by_fontsize``
            when ``line_attrs`` is not given.
        line_attrs: Optional records of the form
            ``[text, font name, font size, x]``. Pass a result that was already
            computed by ``extract_text_by_fontsize`` to avoid parsing the PDF a
            second time.

    Returns:
        A list of ``(category, text)`` tuples, one per record, where the
        category comes from ``classify_text``.
    """
    if line_attrs is None:
        # Derive the records from the requested file rather than from a
        # notebook-level global, so the argument is actually honoured.
        line_attrs = extract_text_by_fontsize(pdf_path)

    classified = []
    for line_attr in line_attrs:
        text, font_name, font_size, x = line_attr[:4]
        classified.append((classify_text(font_name, font_size, x), text))
    return classified

In [129]:
# Classify the font-change lines as (category, text) pairs and display them.
l2 = process_pdf(pdf_path)
l2

[('Title', 'Code civil'),
 ('Section', 'Article 1'),
 ('Paragraph',
  "Les lois et, lorsqu'ils sont publiés au Journal officiel de la République française, les actes administratifs"),
 ('Section', 'Article 2'),
 ('Paragraph',
  "La loi ne dispose que pour l'avenir ; elle n'a point d'effet rétroactif."),
 ('Section', 'Article 3'),
 ('Paragraph',
  'Les lois de police et de sûreté obligent tous ceux qui habitent le territoire.'),
 ('Section', 'Article 4'),
 ('Paragraph',
  "Le juge qui refusera de juger, sous prétexte du silence, de l'obscurité ou de l'insuffisance de la loi, pourra être"),
 ('Section', 'Article 5'),
 ('Footer',
  'Code civil - Dernière modification le 21 mai 2023 - Document généré le 22 mai 2023'),
 ('Paragraph',
  'Il est défendu aux juges de prononcer par voie de disposition générale et réglementaire sur les causes qui leur'),
 ('Section', 'Article 6'),
 ('Paragraph',
  "On ne peut déroger, par des conventions particulières, aux lois qui intéressent l'ordre public et 

In [130]:
def comp(l1, l2):
    """Merge two lists of ``(category, sentence)`` pairs.

    Every entry of ``l1`` is kept, in order. An entry of ``l2`` is appended
    after them when its sentence does not occur as a sentence in ``l1``; the
    category is not part of the comparison. Entries of ``l2`` are not
    de-duplicated against each other.

    Args:
        l1: Iterable of ``(category, sentence)`` pairs that are always kept.
        l2: Iterable of ``(category, sentence)`` pairs to add when new.

    Returns:
        A new list; neither input is modified.
    """
    # A set gives constant-time membership tests instead of scanning a list
    # once per entry of l2.
    known_sentences = {sentence for _, sentence in l1}

    extra = [
        (category, sentence)
        for category, sentence in l2
        if sentence not in known_sentences
    ]
    return list(l1) + extra

# Merge both extractions: all of l1 first, then the l2 entries whose text is not already in l1.
yos = comp(l1, l2)

In [131]:
# Show only the entries that came from l2: comp() puts all of l1 first, so
# they start at index len(l1) whatever the size of the document.
yos[len(l1):]

[('Title', 'Code civil'),
 ('Paragraph',
  "Les lois et, lorsqu'ils sont publiés au Journal officiel de la République française, les actes administratifs"),
 ('Paragraph',
  "La loi ne dispose que pour l'avenir ; elle n'a point d'effet rétroactif."),
 ('Paragraph',
  'Les lois de police et de sûreté obligent tous ceux qui habitent le territoire.'),
 ('Paragraph',
  "Le juge qui refusera de juger, sous prétexte du silence, de l'obscurité ou de l'insuffisance de la loi, pourra être"),
 ('Section', 'Article 5'),
 ('Footer',
  'Code civil - Dernière modification le 21 mai 2023 - Document généré le 22 mai 2023'),
 ('Paragraph',
  'Il est défendu aux juges de prononcer par voie de disposition générale et réglementaire sur les causes qui leur'),
 ('Paragraph',
  "On ne peut déroger, par des conventions particulières, aux lois qui intéressent l'ordre public et les bonnes"),
 ('Paragraph',
  'Le mariage et la filiation adoptive emportent les mêmes effets, droits et obligations reconnus par les 

In [132]:
def make_map(suchi):
    """Index ``(category, text)`` pairs by their position.

    Args:
        suchi: Iterable of ``(category, text)`` pairs.

    Returns:
        A dict mapping each 0-based position to
        ``{"Category": category, "Text": text}``.
    """
    return {
        position: {"Category": category, "Text": text}
        for position, (category, text) in enumerate(suchi)
    }

# Position-indexed mapping of the merged entries.
naksha = make_map(yos)


In [133]:
# Persist the mapping as JSON (the integer positions become string keys).
# Explicit UTF-8 with ensure_ascii=False stores the accented French text
# as-is instead of as \uXXXX escapes.
with open("output.json", "w", encoding="utf-8") as outfile:
    json.dump(naksha, outfile, indent=2, ensure_ascii=False)