In [None]:
import fitz
import re
import json

In [None]:
# Matches a footnote-style reference such as "(1) DO L 114 de 27.4.2006, p. 9.":
# a parenthesised number, "DO L" or "DO C", an issue number, a d.m.yyyy date and a
# page number. The inner (...) is an unescaped, capturing group around the "DO ..." part.
# NOTE(review): presumably these are Official Journal citations printed in page footers — confirm.
footer_pattern = r"\(\d+\) (DO [LC] \d+ de \d{1,2}\.\d{1,2}\.\d{4}, p\. \d+\.)"
# Zero-width match at the start of a line that consists solely of "Artículo <digits>".
# Needs re.MULTILINE so that ^ and $ anchor on individual lines.
article_split_pattern = r"^(?=Artículo \d+$)"
# A newline preceded by a lowercase ASCII letter or a comma and followed by a lowercase
# ASCII letter that is not itself followed by ")" (so markers such as "a)" are left alone).
# [a-z] is ASCII-only, so breaks adjacent to accented letters are not matched.
wrong_new_lines_pattern = r"(?<=[a-z,])\n(?=[a-z](?!\)))"
# A newline sitting between a word character and a closing parenthesis, e.g. "1\n)".
# (\d is already covered by \w inside the character class.)
parentheses_enum_fix = r"(?<=[\w\d])\n(?=\))"
# Zero-width match at the start of a line that is just "<digits>)" followed by a newline.
# Needs re.MULTILINE for the same reason as article_split_pattern.
parentheses_split_pattern = r"^(?=[\d]+\)$\n)"

In [None]:
def clean_source_text(source_text: str):
    """Normalise raw extracted text by applying a fixed sequence of regex substitutions.

    The substitutions run in this order:

    1. line breaks matched by ``wrong_new_lines_pattern`` become a single space;
    2. text matched by ``footer_pattern`` becomes a newline;
    3. step 1 is repeated (presumably to catch breaks exposed by step 2);
    4. newlines matched by ``parentheses_enum_fix`` are removed.

    Args:
        source_text: The text to clean.

    Returns:
        The text after all four substitutions.
    """
    substitutions = (
        (wrong_new_lines_pattern, " "),
        (footer_pattern, "\n"),
        (wrong_new_lines_pattern, " "),
        (parentheses_enum_fix, ""),
    )
    cleaned = source_text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned


def split_by_article(source_text: str) -> dict:
    """Split text into per-article chunks keyed ``articulo_1``, ``articulo_2``, ...

    The text is cut at every position matched by ``article_split_pattern``
    (applied with ``re.MULTILINE``). The chunk before the first match is
    discarded and the remaining chunks are stripped of surrounding whitespace.

    Keys reflect the order of appearance, not a number parsed from the heading.

    Args:
        source_text: The text to split.

    Returns:
        A dict mapping ``"articulo_<n>"`` (n starting at 1) to the chunk text.
    """
    chunks = re.split(article_split_pattern, source_text, flags=re.MULTILINE)
    # chunks[0] is whatever precedes the first article heading (the preamble); skip it.
    return {
        f"articulo_{position}": chunk.strip()
        for position, chunk in enumerate(chunks[1:], start=1)
    }


def split_by_parentheses_enum_list(source_text: str):
    """Split text into the items of a ``1)``, ``2)``, ... enumeration.

    The text is cut at every position matched by ``parentheses_split_pattern``
    (applied with ``re.MULTILINE``). The chunk before the first match is
    discarded and the remaining chunks are stripped of surrounding whitespace.

    Args:
        source_text: The text to split.

    Returns:
        A dict mapping a 0-based position (int) to the item text.
    """
    pieces = re.split(parentheses_split_pattern, source_text, flags=re.MULTILINE)
    # pieces[0] is the text before the first "N)" marker; skip it.
    return dict(enumerate(piece.strip() for piece in pieces[1:]))

# Original document

In [None]:
# Read the text of every page, then concatenate it with a blank-line separator
# in front of each page (so the result also starts with "\n\n").
with fitz.open('../data/directiva_residuos/2008_98_ce_boetxt.pdf') as doc:
    page_texts_2008 = [page.get_text() for page in doc]
out_text_2008 = "".join("\n\n" + page_text for page_text in page_texts_2008)

## Dev area

In [None]:
# Join lines that were broken mid-sentence by the PDF text extraction.
fix_n_text_2008 = re.sub(
    pattern=wrong_new_lines_pattern,
    repl=' ',
    string=out_text_2008,
)

In [None]:
# First block is preamble
articles_2008 = re.split(article_split_pattern, fix_n_text_2008, flags=re.MULTILINE)
articles_2008 = [article.strip() for article in articles_2008]

In [None]:
# Number of chunks produced by the split, counting the preamble at index 0.
len(articles_2008)

In [None]:
# Spot-check one split chunk. Index 0 is the preamble, so index 5 is the fifth
# article chunk. (This cell was duplicated verbatim; one copy is enough.)
print(articles_2008[5])

## Module calls

In [None]:
# dense calls
out_text_2008_clean = clean_source_text(out_text_2008)
articles_2008 = split_by_article(out_text_2008_clean)

In [None]:
# split_by_article already drops the preamble and returns {"articulo_N": text}.
# The previous `articles_2008[1:]` raised on a fresh run because a dict cannot be
# sliced, so the mapping is simply copied here.
articles_2008_dict = dict(articles_2008)

In [None]:
# Persist the article mapping as JSON in the same data directory as the source PDF.
with open('../data/directiva_residuos/articles_2008.json', "w") as out_handle:
    out_handle.write(json.dumps(articles_2008_dict))

# Modifications document

In [None]:
# Read the text of every page, then concatenate it with a blank-line separator
# in front of each page (so the result also starts with "\n\n").
with fitz.open('../data/directiva_residuos/2018_851_boetxt.pdf') as doc:
    page_texts_2018 = [page.get_text() for page in doc]
out_text_2018 = "".join("\n\n" + page_text for page_text in page_texts_2018)

## Dev area

In [None]:
# First join lines broken mid-sentence, then glue a ")" that landed on its own
# line back onto the preceding token (e.g. "1\n)" -> "1)").
fix_n_text_2018 = re.sub(
    parentheses_enum_fix,
    '',
    re.sub(wrong_new_lines_pattern, ' ', out_text_2018),
)

In [None]:
# First block is preamble
articles_2018 = re.split(article_split_pattern, fix_n_text_2018, flags=re.MULTILINE)
articles_2018 = [article.strip() for article in articles_2018]

In [None]:
# Number of chunks produced by the split, counting the preamble at index 0.
len(articles_2018)

In [None]:
# First block is preamble
mods_2018 = re.split(parentheses_split_pattern, articles_2018[1], flags=re.MULTILINE)
mods_2018 = [modifications.strip() for modifications in mods_2018]

In [None]:
# Number of pieces produced by the split, counting the leading text at index 0.
len(mods_2018)

In [None]:
# Spot-check one piece of the split.
# NOTE(review): index 31 looks like an arbitrary sample; it fails if the split yields fewer pieces.
print(mods_2018[31])

## Module calls

In [None]:
# dense calls
out_text_2018_clean = clean_source_text(out_text_2018)
articles_2018 = split_by_article(out_text_2018_clean)
mods_2018 = split_by_parentheses_enum_list(articles_2018["articulo_1"])

In [None]:
# split_by_parentheses_enum_list already drops the leading chunk and returns
# {0: text, 1: text, ...}. The previous `mods_2018[1:]` raised on a fresh run
# because a dict cannot be sliced, so the mapping is simply copied here.
mods_2018_dict = dict(mods_2018)

In [None]:
# Persist the modifications mapping as JSON in the same data directory as the source PDF.
# The integer keys are written as strings, as JSON object keys always are.
with open('../data/directiva_residuos/mods_2018.json', "w") as out_handle:
    out_handle.write(json.dumps(mods_2018_dict))