In [1]:
import fitz
from pdfminer.high_level import extract_pages
from pdfminer.layout import *
import math
import json
import random

# Relative path of the PDF that the cells below pass to the extraction functions.
pdf_path = "civil-code.pdf"

In [4]:
def extract_sentences_with_starting_words(pdf_path, starting_words):
    """Collect blocks of PDF text that begin with one of ``starting_words``.

    The text of every page is scanned line by line. A line opens a new block
    when it starts (case-insensitively) with a starting word AND its very first
    character is upper-case; the first matching word in ``starting_words``
    wins. Following non-blank lines are appended to that block, separated by a
    single space, until a blank line is met. Non-blank lines seen while no
    block is open are ignored. The open block is carried over page boundaries.

    Args:
        pdf_path: Path handed to ``fitz.open``.
        starting_words: Sequence of prefix strings to look for.

    Returns:
        A list with one inner list per starting word (same order), each
        holding the block strings opened by that word, in document order.
    """
    word_sentences = [[] for _ in starting_words]
    lowered_words = [word.lower() for word in starting_words]
    # Index (into starting_words) of the block being extended, or None.
    current_word = None

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            for line in page.get_text().splitlines():
                stripped = line.strip()
                if not stripped:
                    # A blank line closes whatever block is in progress.
                    current_word = None
                    continue

                # Matching uses the unstripped line, so an indented line can
                # never open a block (its first character is whitespace).
                lowered_line = line.lower()
                for i, prefix in enumerate(lowered_words):
                    if lowered_line.startswith(prefix) and line[0].isupper():
                        current_word = i
                        word_sentences[i].append(stripped)
                        break
                else:
                    # No starting word matched: treat as a continuation line.
                    if current_word is not None:
                        word_sentences[current_word][-1] += ' ' + stripped
    finally:
        # Release the document even if reading a page fails.
        doc.close()

    return word_sentences

In [5]:
# Heading keywords (French and English spellings) that open a block in the PDF text.
starting_words = ["Titre", "Title", "Chapitre", "Chapter", "Section", "Sous-section", "Article"]

# One list of extracted blocks per keyword, in the same order as starting_words.
word_sentences = extract_sentences_with_starting_words(pdf_path, starting_words)

# Flatten into (keyword, block_text) pairs, grouped by keyword in list order.
l1 = [
    (word, sentence)
    for word, sentences in zip(starting_words, word_sentences)
    for sentence in sentences
]

In [6]:
def concat_strings(input_list):
    """Merge consecutive items that share the same attributes into one entry.

    Each item must unpack into ``(text, font, size, float_val)``. Adjacent
    items whose ``(font, size, float_val)`` are equal have their texts joined
    with single spaces. Items with equal attributes that are not adjacent stay
    separate.

    Returns:
        A list of ``[text, font, size, float_val]`` lists, with ``text``
        stripped of surrounding whitespace. A run whose accumulated text is
        the empty string is dropped.
    """
    merged = []
    buffer = ""     # text accumulated for the run in progress
    run_key = None  # attributes of the run in progress

    def emit():
        # Flush the run in progress, skipping it when nothing was accumulated.
        if buffer:
            merged.append([buffer.strip(), *run_key])

    for text, font, size, float_val in input_list:
        key = (font, size, float_val)
        if key == run_key:
            buffer = buffer + " " + text
            continue
        emit()
        buffer, run_key = text, key

    emit()  # the final run is still pending once the loop ends
    return merged


def extract_text_by_fontsize(pdf_url):
    """Extract text lines with the font attributes of their first character.

    Every ``LTTextLine`` found directly inside a text container of a page
    contributes one row ``[text, fontname, size, x]`` where ``text`` is the
    line's text minus its last character, ``size`` is the first character's
    size rounded up to an integer and ``x`` is the left edge of that
    character's bounding box. Lines whose first object is not an ``LTChar``
    contribute nothing. The rows are then merged by ``concat_strings``.

    Args:
        pdf_url: PDF location handed to ``extract_pages``.

    Returns:
        The list produced by ``concat_strings`` for the collected rows.
    """
    font_attr = []

    for page_layout in extract_pages(pdf_url):
        for element in page_layout:
            # Pages can also hold non-text layout objects (rules, rectangles,
            # images) that cannot be iterated; skip them instead of raising
            # TypeError in the loop below.
            if not isinstance(element, LTTextContainer):
                continue

            for line in element:
                if not isinstance(line, LTTextLine):
                    continue

                # Only the first object of the line is inspected, so one row
                # at most is recorded per line.
                first = next(iter(line), None)
                if not isinstance(first, LTChar):
                    continue

                font_attr.append([
                    line.get_text()[:-1],  # drop the final character of the line text
                    first.fontname,
                    math.ceil(first.size),
                    first.bbox[0],
                ])

    return concat_strings(font_attr)

# Merged [text, fontname, size, x] rows for the whole PDF; input to process_pdf below.
pdf_text = extract_text_by_fontsize(pdf_path)


In [26]:
def classify_text(previous_category, font_name, font_size, x, rng=None):
    """Randomly draw a category for a text run given the previous category.

    The draw happens in two steps: with probability
    ``P(Paragraph | previous) * state_probs["Paragraph"]`` the result is
    "Paragraph"; otherwise "Title" or "Section" is chosen with weights
    ``P(category | previous) * state_probs[category]``.

    NOTE(review): ``font_name``, ``font_size`` and ``x`` are accepted but do
    not influence the result; the label depends only on ``previous_category``
    and the random draw.
    NOTE(review): the Paragraph weight is used as an absolute probability,
    whereas the Title/Section weights are renormalised only among themselves.
    Confirm this asymmetry is intended.

    Args:
        previous_category: "Title", "Section" or "Paragraph".
        font_name: Unused.
        font_size: Unused.
        x: Unused.
        rng: Optional source of randomness exposing ``random()`` and
            ``choices()`` (for example ``random.Random(seed)``). Defaults to
            the module-level ``random`` functions. Pass a seeded instance to
            make the labelling reproducible.

    Returns:
        One of "Title", "Section" or "Paragraph".

    Raises:
        KeyError: If ``previous_category`` is not one of the known categories.
    """
    if rng is None:
        rng = random

    # transition_probs[previous][next]: weight of moving from one category to another.
    transition_probs = {
        "Title": {"Title": 0.1, "Section": 0.7, "Paragraph": 0.2},
        "Section": {"Title": 0.1, "Section": 0.1, "Paragraph": 0.8},
        "Paragraph": {"Title": 0.2, "Section": 0.4, "Paragraph": 0.4},
    }

    # Weight attached to each category regardless of the previous one.
    state_probs = {
        "Title": 0.15,
        "Section": 0.425,
        "Paragraph": 0.425,
    }

    # Normalise the row for the previous category so its weights sum to 1.
    row = transition_probs[previous_category]
    prob_sum = sum(row.values())
    probs = {category: weight / prob_sum for category, weight in row.items()}

    paragraph_prob = probs["Paragraph"] * state_probs["Paragraph"]
    if rng.random() < paragraph_prob:
        return "Paragraph"

    # Otherwise pick among the remaining categories, weighted and renormalised.
    non_paragraphs = ["Title", "Section"]
    non_paragraph_probs = [probs[category] * state_probs[category] for category in non_paragraphs]
    non_paragraph_probs_sum = sum(non_paragraph_probs)
    non_paragraph_probs = [prob / non_paragraph_probs_sum for prob in non_paragraph_probs]
    return rng.choices(non_paragraphs, weights=non_paragraph_probs)[0]


def process_pdf(pdf_text):
    """Label every row of ``pdf_text`` with a category from ``classify_text``.

    Each row is indexed as ``[text, font_name, font_size, x]``. Rows are
    classified in order, and each row's category is fed to the next call as
    the previous category, starting from "Title".

    Returns:
        A list of ``(category, text)`` tuples, one per input row.
    """
    labelled = []
    state = "Title"  # category assumed to precede the first row

    for row in pdf_text:
        state = classify_text(state, row[1], row[2], row[3])
        labelled.append((state, row[0]))

    return labelled


In [27]:
# Label every extracted text run. classify_text draws from `random` and no seed
# is set in the visible cells, so the labels displayed below can differ between runs.
l2 = process_pdf(pdf_text)
l2

[('Section', 'Code civil'),
 ('Section',
  "Titre préliminaire : De la publication, des effets et de l'application des lois en général"),
 ('Title', 'Article 1'),
 ('Title',
  "Les lois et, lorsqu'ils sont publiés au Journal officiel de la République française, les actes administratifs entrent en vigueur à la date qu'ils fixent ou, à défaut, le lendemain de leur publication. Toutefois, l'entrée en vigueur de celles de leurs dispositions dont l'exécution nécessite des mesures d'application est reportée à la date d'entrée en vigueur de ces mesures. En cas d'urgence, entrent en vigueur dès leur publication les lois dont le décret de promulgation le prescrit et les actes administratifs pour lesquels le Gouvernement l'ordonne par une disposition spéciale. Les dispositions du présent article ne sont pas applicables aux actes individuels."),
 ('Section', 'Article 2'),
 ('Paragraph',
  "La loi ne dispose que pour l'avenir ; elle n'a point d'effet rétroactif."),
 ('Section', 'Article 3'),
 ('Ti

In [28]:
def comp(l1, l2):
    """Return ``l1`` followed by the entries of ``l2`` whose text is new.

    Both arguments are sequences of ``(category, sentence)`` pairs. An entry
    of ``l2`` is appended when its sentence does not occur as the sentence of
    any entry of ``l1``; categories are not compared. Only ``l1`` is
    consulted, so a sentence repeated inside ``l2`` is appended every time.
    Neither input is modified.

    Returns:
        A new list of ``(category, sentence)`` pairs.
    """
    # A set gives constant-time membership checks instead of scanning a list
    # of every l1 sentence for each l2 entry.
    known_sentences = {sentence for _, sentence in l1}

    combined_sentences = list(l1)
    combined_sentences.extend(
        (category, sentence)
        for category, sentence in l2
        if sentence not in known_sentences
    )
    return combined_sentences

# Note the argument order: the classified runs (l2) form the base list, and the
# keyword-matched entries of l1 are appended only when their text is not already in l2.
yos = comp(l2, l1)

In [29]:
# Display the first 7293 combined entries.
# NOTE(review): 7293 is a hard-coded cutoff whose origin is not shown in this notebook — confirm.
yos[:7293]

[('Section', 'Code civil'),
 ('Section',
  "Titre préliminaire : De la publication, des effets et de l'application des lois en général"),
 ('Title', 'Article 1'),
 ('Title',
  "Les lois et, lorsqu'ils sont publiés au Journal officiel de la République française, les actes administratifs entrent en vigueur à la date qu'ils fixent ou, à défaut, le lendemain de leur publication. Toutefois, l'entrée en vigueur de celles de leurs dispositions dont l'exécution nécessite des mesures d'application est reportée à la date d'entrée en vigueur de ces mesures. En cas d'urgence, entrent en vigueur dès leur publication les lois dont le décret de promulgation le prescrit et les actes administratifs pour lesquels le Gouvernement l'ordonne par une disposition spéciale. Les dispositions du présent article ne sont pas applicables aux actes individuels."),
 ('Section', 'Article 2'),
 ('Paragraph',
  "La loi ne dispose que pour l'avenir ; elle n'a point d'effet rétroactif."),
 ('Section', 'Article 3'),
 ('Ti

In [30]:
def make_map(suchi):
    """Index ``(category, text)`` pairs by their position.

    Returns:
        A dict mapping each 0-based position in ``suchi`` to
        ``{"Category": category, "Text": text}``.
    """
    return {
        position: {"Category": category, "Text": text}
        for position, (category, text) in enumerate(suchi)
    }

# Position-indexed mapping of the first 7293 combined entries.
naksha = make_map(yos[:7293])


In [32]:
# Write the mapping as indented UTF-8 JSON. ensure_ascii=False keeps accented
# characters literal instead of \uXXXX escapes; the integer keys of naksha are
# serialised as JSON strings.
with open("markov_output.json", "w", encoding = 'utf-8') as outfile:
    json.dump(naksha, outfile,indent=2, ensure_ascii=False)