In [1]:
# Import the XML, pandas and akn_to_owl helpers used to process the Akoma Ntoso file
# 19410716_041U0633_VIGENZA_20220922.xml (the file itself is parsed in the next cell)
import xml.etree.ElementTree as ET
import pandas as pd
from json import dumps
from akn_to_owl.parser import extract_articles, extract_chapters, transform_intro_points, get_intro, get_point
from akn_to_owl.parser import split_art_para_p
from akn_to_owl.functions import extract_text, get_insertions_and_references
import re



In [2]:
# Parse the Akoma Ntoso XML file and keep its root element for the extraction steps below
root = ET.parse('data/akn/19410716_041U0633_VIGENZA_20220922.xml').getroot()

In [3]:
# Build the chapter table from the document root, save it as CSV (without the
# index column) and preview the first rows
chapters = extract_chapters(root)
chapters.to_csv('data/csv/chapters.csv', index=False)
chapters.head()

Unnamed: 0,article_id,chapter_id,chapter_heading
0,art_1,chp_I,DISPOSIZIONI SUL DIRITTO DI AUTORE CAPO I Ope...
1,art_2,chp_I,DISPOSIZIONI SUL DIRITTO DI AUTORE CAPO I Ope...
2,art_3,chp_I,DISPOSIZIONI SUL DIRITTO DI AUTORE CAPO I Ope...
3,art_4,chp_I,DISPOSIZIONI SUL DIRITTO DI AUTORE CAPO I Ope...
4,art_5,chp_I,DISPOSIZIONI SUL DIRITTO DI AUTORE CAPO I Ope...


In [4]:
# Collect the articles from the document root; extract_paragraphs iterates over them below
articles = extract_articles(root)

In [5]:
def build_line(article_id, paragraph_id, p_id, p_text, insertions, references):
    """Bundle the data describing one ``p`` element into a row dictionary.

    Args:
        article_id: Identifier of the article the element belongs to.
        paragraph_id: Identifier of the enclosing paragraph.
        p_id: Identifier of the ``p`` element itself.
        p_text: Text of the element; stored under the ``'text'`` key.
        insertions: Insertions found in the element.
        references: References found in the element.

    Returns:
        dict: The arguments keyed as ``article_id``, ``paragraph_id``,
        ``p_id``, ``text``, ``insertions`` and ``references`` (in that order).
    """
    # A 'list' entry was sketched for this row at some point but is not emitted.
    return dict(
        article_id=article_id,
        paragraph_id=paragraph_id,
        p_id=p_id,
        text=p_text,
        insertions=insertions,
        references=references,
    )

In [6]:
def get_references(paragraph):
    """Collect every ``ref`` element found anywhere below ``paragraph``.

    Args:
        paragraph: An ``xml.etree.ElementTree.Element`` to search (the
            element itself is not considered, only its descendants).

    Returns:
        list[tuple]: One ``(eId, href, text)`` tuple per ``ref`` element, in
        document order. ``eId`` or ``href`` is ``None`` when the element does
        not carry that attribute; ``text`` is whatever ``extract_text``
        returns for the element.
    """
    ref_path = ".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}ref"
    # Use .get() rather than attrib[...] so that a reference without an eId
    # (or href) is still reported instead of aborting the run with a KeyError.
    return [
        (reference.get('eId'), reference.get('href'), extract_text(reference))
        for reference in paragraph.findall(ref_path)
    ]

        


In [7]:
def get_insertions(paragraph):
    """Collect every ``ins`` element found anywhere below ``paragraph``.

    Args:
        paragraph: An ``xml.etree.ElementTree.Element`` to search (the
            element itself is not considered, only its descendants).

    Returns:
        list[tuple]: One ``(eId, text)`` tuple per ``ins`` element, in
        document order. ``eId`` is ``None`` when the element has no such
        attribute; ``text`` is whatever ``extract_text`` returns for it.
    """
    ins_path = ".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}ins"
    # Use .get() rather than attrib[...] so that an insertion without an eId
    # is still reported instead of aborting the run with a KeyError.
    return [
        (insertion.get('eId'), extract_text(insertion))
        for insertion in paragraph.findall(ins_path)
    ]


In [8]:
def extract_p(article_id, paragraph_id, paragraph):
    """Turn every ``p`` element found below ``paragraph`` into a row dictionary.

    The ``p`` elements are numbered ``p_1``, ``p_2``, ... in the order in
    which ``findall`` returns them (all descendants, not only direct children).

    Args:
        article_id: Identifier of the enclosing article, copied into each row.
        paragraph_id: Identifier of the paragraph, copied into each row.
        paragraph: The paragraph ``Element`` whose ``p`` descendants are read.

    Returns:
        list: One ``build_line`` result per ``p`` element; empty when the
        paragraph contains none.
    """
    p_path = ".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p"
    return [
        build_line(
            article_id,
            paragraph_id,
            f"p_{position}",
            extract_text(p_element),
            get_insertions(p_element),
            get_references(p_element),
        )
        for position, p_element in enumerate(paragraph.findall(p_path), start=1)
    ]

In [9]:
# Extract the paragraphs from the articles and return them as a dataframe with one row per p element
def extract_paragraphs(articles):
    """Build a DataFrame with one row per ``p`` element of the given articles.

    Paragraph identifiers are assigned as follows:

    * An article with exactly one ``paragraph`` descendant: the paragraph is
      always called ``para_1`` (its own ``eId``, if any, is ignored).
    * Otherwise each paragraph is visited in document order. A paragraph whose
      text is exactly ``((`` or ``))`` is skipped. A paragraph with an ``eId``
      takes the part after the first ``__`` of that ``eId`` as its id; one
      without an ``eId`` gets ``para_<n>``, where ``n`` is one more than the
      number of the last paragraph that was assigned an id.

    Note that the resulting ``(article_id, paragraph_id)`` pairs are not
    guaranteed to be unique: a counter-derived id can coincide with the
    explicit ``eId`` of a later paragraph.

    Args:
        articles: Iterable of article ``Element`` objects, each carrying an
            ``eId`` attribute.

    Returns:
        pandas.DataFrame: Columns ``article_id``, ``paragraph_id``, ``p_id``,
        ``text``, ``insertions`` and ``references``, with a fresh 0..n-1 index.

    Raises:
        KeyError: If an article has no ``eId`` attribute.
        IndexError, ValueError: If a paragraph ``eId`` does not have the
            ``<article>__para_<number>[-suffix]`` shape.
    """
    paragraph_tag = "{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}paragraph"
    columns = ['article_id', 'paragraph_id', 'p_id', 'text', 'insertions', 'references']

    # Rows are accumulated in a plain list and converted once at the end;
    # growing a DataFrame with pd.concat inside the loop is quadratic.
    records = []

    for article in articles:
        article_id = article.attrib['eId']
        paragraphs = article.findall(".//" + paragraph_tag)

        # A lone paragraph is always para_1.
        # Still missing here: the p level, and whether the p is a list or not.
        if len(paragraphs) == 1:
            records.extend(extract_p(article_id, "para_1", paragraphs[0]))
            continue

        # Number of the last paragraph that received an id in this article
        paragraph_counter = 0
        for paragraph in paragraphs:
            # Skip paragraphs that consist only of an opening or closing double parenthesis
            if extract_text(paragraph) in ("((", "))"):
                continue

            paragraph_eid = paragraph.get('eId')
            if paragraph_eid is not None:
                paragraph_id = paragraph_eid.split('__')[1]
                # Re-sync the counter with the explicit number; for ids such as
                # "para_2-bis" only the part before the dash is used.
                # @TODO: This is not working properly - it would be better to
                # evaluate the various cases (bis, ter, quater)
                paragraph_counter = int(paragraph_id.split('_')[1].split('-')[0])
            else:
                paragraph_counter += 1
                paragraph_id = "para_" + str(paragraph_counter)

            records.extend(extract_p(article_id, paragraph_id, paragraph))

    return pd.DataFrame(records, columns=columns)



In [10]:
# Extract the paragraphs of all articles into a dataframe (one row per p element)
paragraphs = extract_paragraphs(articles)

In [11]:
# Wrap the extracted paragraphs in a dataframe named df. extract_paragraphs already
# returns a dataframe with one p element per row, so no reshaping happens here.
# These rows are the level at which the text is meant to be semantically connected
# to the Copyright ontology.
df = pd.DataFrame(paragraphs)


In [12]:
# Show the first 50 rows of the dataframe (each row is one p element)
df.head(50)

Unnamed: 0,article_id,paragraph_id,p_id,text,insertions,references
0,art_1,para_1,p_1,Sono protette ai sensi di questa legge le oper...,"[(ins_1, ((, nonche' le banche di dati che per...","[(content__ref_1, /akn/it/act/legge/stato/1978..."
1,art_2,para_1,p_1,In particolare sono comprese nella protezione:...,"[(ins_2, ((26)))]",[]
2,art_2,para_2,p_1,-------------------,[],[]
3,art_2,para_2,p_2,AGGIORNAMENTO (26),[],[]
4,art_2,para_2,p_3,"La L. 12 dicembre 2002, n. 273 , ha disposto (...",[],"[(2, /akn/it/act/legge/stato/2002-12-12/273/!m..."
5,art_3,para_1,p_1,"Le opere collettive, costituite dalla riunione...",[],[]
6,art_4,para_1,p_1,Senza pregiudizio dei diritti esistenti sull'o...,[],[]
7,art_5,para_1,p_1,Le disposizioni di questa legge non si applica...,[],[]
8,art_6,para_1,p_1,Il titolo originario dell'acquisto del diritto...,[],[]
9,art_7,para_1,p_1,E' considerato autore dell'opera collettiva ch...,[],[]


In [61]:
# Number of rows in df, i.e. the number of p elements that were extracted
len(df)

988

In [62]:
# Check that every combination of article_id, paragraph_id and p_id identifies a single row
key_columns = ['article_id', 'paragraph_id', 'p_id']

# Rows per key, largest first. The result is kept in a variable because only the
# last expression of a cell is displayed; as a bare expression it would be discarded.
rows_per_key = df.groupby(key_columns).count().sort_values(by='text', ascending=False).head(50)

# Show all the rows whose article_id, paragraph_id and p_id also occur on another row
df[df.duplicated(subset=key_columns, keep=False)].sort_values(by=key_columns)



Unnamed: 0,article_id,paragraph_id,p_id,text,insertions,references
551,art_102-bis,para_6,p_1,((22)),[],[]
552,art_102-bis,para_6,p_1,Il diritto esclusivo del costitutore sorge al ...,[],[]
237,art_68,para_3,p_1,((53)),[],[]
238,art_68,para_3,p_1,Fermo restando il divieto di riproduzione di s...,[],[]


In [63]:
# Save df as an intermediate CSV file, leaving out the index column
df.to_csv('data/csv/temporary_df.csv', index=False)