In [140]:
# Load required xml libraries and the file called 19410716_041U0633_VIGENZA_20220922.xml
# The file is in the akoma ntoso format
import xml.etree.ElementTree as ET
import pandas as pd
from json import dumps
from akn_to_owl.parser import extract_articles, extract_paragraphs, extract_chapters, transform_intro_points
from akn_to_owl.parser import split_art_para_p
from akn_to_owl.functions import extract_text, get_insertions_and_references
import re



In [2]:
root = ET.parse('data/akn/19410716_041U0633_VIGENZA_20220922.xml').getroot()

In [3]:
#chapters = extract_chapters(root)
#chapters.to_csv('data/csv/chapters.csv', index=False)
#chapters.head()

In [4]:
articles = extract_articles(root)

In [151]:
# Processes the articles got as a consequence of the extract_articles function
# And launches it to the next function, process_paragraph
# It returns a list of paragraphs with an unique id associated to the XML structure
# Of the AKN file
def extract_paragraphs(articles):
    paragraph_list = {
        'article_id': [],
        'paragraph_id': [],
        'p_id': [],
        'text': [],
        'list': [],

    }
    
    for article in articles:
        article_id = article.attrib['eId']
        
        # Check how many paragraphs are in the article. 
        # If there is only one, then the id is para_1
        if len(article.findall(".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}paragraph")) == 1:
            
            # Assign the paragraph_id
            paragraph_id = "para_1"
            
            # Extract the paragraph
            paragraph = article.find(".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}paragraph")
            
            # Verify whether the paragraph has a list tag inside
            if paragraph.find(".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}list") is not None:
                
                # Check whether there are multiple p tags inside the paragraph - this should be false for all lists
                if len(paragraph.findall(".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p")) == 1:
                    
                    # Assign the p_id
                    p_id = "p_1"
                    
                    # Extract the text
                    p_text = extract_text(paragraph.find(".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p"))

                    # Add the values to the paragraph_list
                    paragraph_list['article_id'].append(article_id)
                    paragraph_list['paragraph_id'].append(paragraph_id)
                    paragraph_list['p_id'].append(p_id)
                    paragraph_list['text'].append(p_text)
                    paragraph_list['list'].append(True)
                    continue
                else:
                    p_counter = 0
                    for p in paragraph.iter('{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p'):
                        
                        # Increase the counter
                        p_counter += 1
                        p_id = "p_" + str(p_counter)
                        p_text = extract_text(p)

                        # Add the values to the paragraph_list
                        paragraph_list['article_id'].append(article_id)
                        paragraph_list['paragraph_id'].append(paragraph_id)
                        paragraph_list['p_id'].append(p_id)
                        paragraph_list['text'].append(p_text)
                        paragraph_list['list'].append(True)
                        continue


            else:

                # Check whether there are multiple p tags inside the paragraph
                if len(paragraph.findall(".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p")) == 1:
                    
                    # Assign the p_id
                    p_id = "p_1"
                    
                    # Extract the text
                    p_text = extract_text(paragraph.find(".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p"))

                    # Add the values to the paragraph_list
                    paragraph_list['article_id'].append(article_id)
                    paragraph_list['paragraph_id'].append(paragraph_id)
                    paragraph_list['p_id'].append(p_id)
                    paragraph_list['text'].append(p_text)
                    paragraph_list['list'].append(False)
                    continue
                else:
                    p_counter = 0
                    for p in paragraph.iter('{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p'):
                        
                        # Increase the counter
                        p_counter += 1
                        p_id = "p_" + str(p_counter)
                        p_text = extract_text(p)

                        # Add the values to the paragraph_list
                        paragraph_list['article_id'].append(article_id)
                        paragraph_list['paragraph_id'].append(paragraph_id)
                        paragraph_list['p_id'].append(p_id)
                        paragraph_list['text'].append(p_text)
                        paragraph_list['list'].append(False)  
                        continue
                        
        # If there are multiple paragraphs, then loop through them
        else:
            # Loop through all the paragraphs
            # Set up a counter for the paragraphs
            paragraph_counter = 0
            for paragraph in article.iter('{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}paragraph'):
                
                # If the paragraph is empty or has only a double parenthesis, then continue with the next paragraph
                if extract_text(paragraph) == "((" or extract_text(paragraph) == "))":
                    # Continue with the next paragraph
                    continue
                
                # Increase the counter
                paragraph_counter += 1
                
                # If the paragraph has an eId, then use it
                if paragraph.get('eId') is not None:
                    
                    paragraph_id = paragraph.attrib['eId'].split('__')[1]
                    
                    # @ Todo: there are duplicates for when a number that links to an aggiornamento ex: ((88)) is within the text.
                    # This creates conflicts when assigning the paragraph_id.
                    # It needs to be solved. 
                    
                    
                    # if the paragraph has a list tag inside explore
                    if paragraph.find(".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}list") is not None:
                        # Check whether there are multiple p tags inside the paragraph
                        if len(paragraph.findall(".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p")) == 1:
                            
                            # Assign the p_id
                            p_id = "p_1"
                            
                            # Extract the text
                            p_text = extract_text(paragraph.find(".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p"))

                            # Add the values to the paragraph_list
                            paragraph_list['article_id'].append(article_id)
                            paragraph_list['paragraph_id'].append(paragraph_id)
                            paragraph_list['p_id'].append(p_id)
                            paragraph_list['text'].append(p_text)
                            paragraph_list['list'].append(True)
                            continue
                        else:
                            p_counter = 0
                            for p in paragraph.iter('{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p'):
                                
                                # Increase the counter
                                p_counter += 1
                                p_id = "p_" + str(p_counter)
                                p_text = extract_text(p)

                                # Add the values to the paragraph_list
                                paragraph_list['article_id'].append(article_id)
                                paragraph_list['paragraph_id'].append(paragraph_id)
                                paragraph_list['p_id'].append(p_id)
                                paragraph_list['text'].append(p_text)
                                paragraph_list['list'].append(True)   
                                continue
                    else:
                        # If there are exactly one p tag inside the paragraph, then use it
                        if len(paragraph.findall(".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p")) == 1:
                            p_id = "p_1"
                            # add the article_id and paragraph_id to the paragraph_list
                            paragraph_list['article_id'].append(article_id)
                            paragraph_list['paragraph_id'].append(paragraph_id)
                            paragraph_list['p_id'].append(p_id)
                            p_text = extract_text(paragraph.find(".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p"))
                            paragraph_list['text'].append(p_text)
                            paragraph_list['list'].append(False)
                            continue
                        else:
                            p_counter = 0
                            for p in paragraph.iter('{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p'):
                                p_counter += 1
                                p_id = "p_" + str(p_counter)
                                # add the article_id and paragraph_id to the paragraph_list
                                paragraph_list['article_id'].append(article_id)
                                paragraph_list['paragraph_id'].append(paragraph_id)
                                paragraph_list['p_id'].append(p_id)
                                p_text = extract_text(p)
                                paragraph_list['text'].append(p_text)
                                paragraph_list['list'].append(False)
                                continue
                # If the paragraph does not have an eId, then some elements needs to be verified
                else:                    
                    paragraph_counter += 1
                    paragraph_id = "para_" + str(paragraph_counter)

                    # If there are exactly one p tag inside the paragraph, then use it
                    if len(paragraph.findall(".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p")) == 1:
                        p_id = "p_1"
                        # add the article_id and paragraph_id to the paragraph_list
                        paragraph_list['article_id'].append(article_id)
                        paragraph_list['paragraph_id'].append(paragraph_id)
                        paragraph_list['p_id'].append(p_id)
                        p_text = extract_text(paragraph.find(".//{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p"))
                        paragraph_list['text'].append(p_text)
                        paragraph_list['list'].append(False)
                        continue
                    else:
                        p_counter = 0
                        for p in paragraph.iter('{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}p'):
                            p_counter += 1
                            p_id = "p_" + str(p_counter)
                            # add the article_id and paragraph_id to the paragraph_list
                            paragraph_list['article_id'].append(article_id)
                            paragraph_list['paragraph_id'].append(paragraph_id)
                            paragraph_list['p_id'].append(p_id)
                            p_text = extract_text(p)
                            paragraph_list['text'].append(p_text)
                            paragraph_list['list'].append(False)
                            continue
            

    return paragraph_list



In [152]:
# Get the articles from the XML root - parse Akoma Ntoso XML file

paragraphs = extract_paragraphs(articles)

error: bad escape \K at position 3

In [148]:
# Further transform the articles to get a dataframe with each p element in a row, while preserving all the needed information
# For semantically connecting each row at the p level to the Copyright ontology
df = pd.DataFrame(paragraphs)


In [149]:
df.head(50)

Unnamed: 0,article_id,paragraph_id,p_id,text,list
0,art_1,para_1,p_1,Sono protette ai sensi di questa legge le oper...,False
1,art_2,para_2,p_1,In particolare sono comprese nella protezione:...,False
2,art_2,para_4,p_1,-------------------,False
3,art_2,para_4,p_2,AGGIORNAMENTO (26),False
4,art_2,para_4,p_3,"La L. 12 dicembre 2002, n. 273 , ha disposto (...",False
5,art_3,para_1,p_1,"Le opere collettive, costituite dalla riunione...",False
6,art_4,para_1,p_1,Senza pregiudizio dei diritti esistenti sull'o...,False
7,art_5,para_1,p_1,Le disposizioni di questa legge non si applica...,False
8,art_6,para_1,p_1,Il titolo originario dell'acquisto del diritto...,False
9,art_7,para_1,p_1,E' considerato autore dell'opera collettiva ch...,False


In [150]:
# Check that for every combination of article_id, paragraph_id and p_id there is only one text
df.groupby(['article_id', 'paragraph_id', 'p_id']).count().sort_values(by='text', ascending=False).head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text,list
article_id,paragraph_id,p_id,Unnamed: 3_level_1,Unnamed: 4_level_1
art_68,para_5,p_1,2,2
art_102-bis,para_7,p_1,2,2
art_1,para_1,p_1,1,1
art_64-quinquies,para_1,p_5,1,1
art_64-quater,para_1,p_3,1,1
art_64-quater,para_1,p_4,1,1
art_64-quater,para_2,p_1,1,1
art_64-quater,para_2,p_2,1,1
art_64-quater,para_2,p_3,1,1
art_64-quater,para_2,p_4,1,1


In [129]:
df.head(40)

Unnamed: 0,article_id,paragraph_id,p_id,text,list
0,art_1,para_1,p_1,Sono protette ai sensi di questa legge le oper...,False
1,art_2,para_1,p_1,In particolare sono comprese nella protezione:...,False
2,art_2,para_2,p_1,-------------------,False
3,art_2,para_2,p_2,AGGIORNAMENTO (26),False
4,art_2,para_2,p_3,"La L. 12 dicembre 2002, n. 273 , ha disposto (...",False
5,art_3,para_1,p_1,"Le opere collettive, costituite dalla riunione...",False
6,art_4,para_1,p_1,Senza pregiudizio dei diritti esistenti sull'o...,False
7,art_5,para_1,p_1,Le disposizioni di questa legge non si applica...,False
8,art_6,para_1,p_1,Il titolo originario dell'acquisto del diritto...,False
9,art_7,para_1,p_1,E' considerato autore dell'opera collettiva ch...,False


In [132]:
df.to_csv('data/csv/temporary_df.csv', index=False)

In [7]:
# Save the colummns of the dataframe paragraph_id and p_id in a new dataframe
akoma_ntoso_key = df[['paragraph_id', 'p_id']]

# Save the dataframe as a csv file
akoma_ntoso_key.to_csv('data/csv/akoma_ntoso_key.csv', index=False)


In [8]:
# Show the lines with (( and )) in the text
df[df['text'].str.contains('\(\(')]



Unnamed: 0,article_id,paragraph_id,p_id,point_id,intro_id,text,insertions,references,ref_id,ins_id
0,art_1,art_1__para_1,art_1art_1__para_1__p_1,,,Sono protette ai sensi di questa legge le oper...,"[((, nonche' le banche di dati che per la scel...","[legge 20 giugno 1978, n. 399]",[/akn/it/act/legge/stato/1978-06-20/399/!main],[ins_1]
1,art_2,art_2__para_1,art_2art_2__para_1__p_1,,,In particolare sono comprese nella protezione:...,[((26))],[],[],[ins_2]
15,art_12-bis,art_12-bis__para_1,art_12-bisart_12-bis__para_1__p_1,,,"((Salvo patto contrario, il datore di lavoro e...","[((Salvo patto contrario, il datore di lavoro ...",[],[],[ins_3]
16,art_12-ter,art_12-ter__para_1,art_12-terart_12-ter__para_1__p_1,,,"((Salvo patto contrario, qualora un'opera di d...","[((Salvo patto contrario, qualora un'opera di ...",[],[],[ins_4]
19,art_15,art_15__para_1,art_15art_15__para_1__p_1,,,"Il diritto esclusivo di eseguire, rappresentar...",[((Non e' considerata pubblica la recitazione ...,[],[],[ins_6]
...,...,...,...,...,...,...,...,...,...,...
952,art_191,art_191__para_1,art_191art_191__para_1__p_1,,,Il Comitato e' composto: \n a) di un president...,[(( Societa' italiana degli autori ed editori ...,[],[],[ins_255]
954,art_193,art_193__para_1,art_193art_193__para_1__p_1,,,Il Comitato puo' essere convocato: a) in aduna...,[((Partecipano all'adunanza generale tutti i m...,[],[],[ins_256]
969,art_198,art_198__para_1,art_198art_198__para_1__p_1,,,Nel bilancio di previsione del Ministero della...,[((5))],[],[],[ins_258]
985,art_204,art_204__para_1,art_204art_204__para_1__p_1,,,A decorrere dall'entrata in vigore di questa l...,[(( Societa' italiana degli autori ed editori ...,[],[],[ins_260]


In [25]:
# Fixes the misalignment between the paragraph_id and the p_id columns, ## TODO: fix the function name and content
df = split_art_para_p(df, 'paragraph_id', 'p_id', "corrected_id")

# Rename the column p_id to xml_id and the column test_id to p_id
df = df.rename(columns={'p_id': 'original_id'})

# Rename the column corrected_id to p_id
df = df.rename(columns={'corrected_id': 'p_id'})

# reorder the columns of the dataframe
df = df[['article_id', 'paragraph_id', 'p_id', 'original_id','intro_id', 'point_id',  'text', 'insertions', 'references', 'ins_id', 'ref_id']]

In [26]:
df.head(30)

Unnamed: 0,article_id,paragraph_id,p_id,original_id,intro_id,point_id,text,insertions,references,ins_id,ref_id
0,art_1,art_1__para_1,art_1__para_1__p_1,art_1__para_1__p_1,,,Sono protette ai sensi di questa legge le oper...,"[((, nonche' le banche di dati che per la scel...","[legge 20 giugno 1978, n. 399]",[ins_1],[/akn/it/act/legge/stato/1978-06-20/399/!main]
1,art_2,art_2__para_1,art_2__para_1__p_1,art_2__para_1__p_1,,,In particolare sono comprese nella protezione:...,[((26))],[],[ins_2],[]
2,art_2,art_2__para_2,art_2__para_2__p_1,art_2__para_2__p_1,,,-------------------,[],[],[],[]
3,art_2,art_2__para_2,art_2__para_2__p_2,art_2__para_2__p_2,,,AGGIORNAMENTO (26),[],[],[],[]
4,art_2,art_2__para_2,art_2__para_2__p_3,art_2__para_2__p_3,,,"La L. 12 dicembre 2002, n. 273 , ha disposto (...",[],"[L. 12 dicembre 2002, n. 273, articolo 2, nume...",[],"[/akn/it/act/legge/stato/2002-12-12/273/!main,..."
5,art_3,art_3__para_1,art_3__para_1__p_1,art_3__para_1__p_1,,,"Le opere collettive, costituite dalla riunione...",[],[],[],[]
6,art_4,art_4__para_1,art_4__para_1__p_1,art_4__para_1__p_1,,,Senza pregiudizio dei diritti esistenti sull'o...,[],[],[],[]
7,art_5,art_5__para_1,art_5__para_1__p_1,art_5__para_1__p_1,,,Le disposizioni di questa legge non si applica...,[],[],[],[]
8,art_6,art_6__para_1,art_6__para_1__p_1,art_6__para_1__p_1,,,Il titolo originario dell'acquisto del diritto...,[],[],[],[]
9,art_7,art_7__para_1,art_7__para_1__p_1,art_7__para_1__p_1,,,E' considerato autore dell'opera collettiva ch...,[],[],[],[]


In [27]:
# Trnasforms the bullet lists in a machine-readable format, coherent with the rest of the database schema
df = transform_intro_points(df)

In [28]:
df.head(30)

Unnamed: 0,article_id,paragraph_id,p_id,original_id,intro,intro_id,point,point_id,text,insertions,references,ins_id,ref_id
0,art_1,art_1__para_1,art_1__para_1__p_1,art_1__para_1__p_1,False,,False,,Sono protette ai sensi di questa legge le oper...,"[((, nonche' le banche di dati che per la scel...","[legge 20 giugno 1978, n. 399]",[ins_1],[/akn/it/act/legge/stato/1978-06-20/399/!main]
1,art_2,art_2__para_1,art_2__para_1__p_1,art_2__para_1__p_1,False,,False,,In particolare sono comprese nella protezione:...,[((26))],[],[ins_2],[]
2,art_2,art_2__para_2,art_2__para_2__p_1,art_2__para_2__p_1,False,,False,,-------------------,[],[],[],[]
3,art_2,art_2__para_2,art_2__para_2__p_2,art_2__para_2__p_2,False,,False,,AGGIORNAMENTO (26),[],[],[],[]
4,art_2,art_2__para_2,art_2__para_2__p_3,art_2__para_2__p_3,False,,False,,"La L. 12 dicembre 2002, n. 273 , ha disposto (...",[],"[L. 12 dicembre 2002, n. 273, articolo 2, nume...",[],"[/akn/it/act/legge/stato/2002-12-12/273/!main,..."
5,art_3,art_3__para_1,art_3__para_1__p_1,art_3__para_1__p_1,False,,False,,"Le opere collettive, costituite dalla riunione...",[],[],[],[]
6,art_4,art_4__para_1,art_4__para_1__p_1,art_4__para_1__p_1,False,,False,,Senza pregiudizio dei diritti esistenti sull'o...,[],[],[],[]
7,art_5,art_5__para_1,art_5__para_1__p_1,art_5__para_1__p_1,False,,False,,Le disposizioni di questa legge non si applica...,[],[],[],[]
8,art_6,art_6__para_1,art_6__para_1__p_1,art_6__para_1__p_1,False,,False,,Il titolo originario dell'acquisto del diritto...,[],[],[],[]
9,art_7,art_7__para_1,art_7__para_1__p_1,art_7__para_1__p_1,False,,False,,E' considerato autore dell'opera collettiva ch...,[],[],[],[]


In [29]:
# Create a new dataframe with the rows where either intro or points is True. These are the rows that can be mapped to lists
pointed_list = df[(df['intro'] == True) | (df['point'] == True)]

# Save it to csv
pointed_list.to_csv('data/csv/pointed_list.csv', index=False)

#pointed_list.head(10)

In [13]:
# Show me the rows where the value of the paragraph column is repeated, and neither the intro nor the point column is True
df[(df['paragraph_id'].duplicated(keep=False)) & (df['intro'] != True) & (df['point'] != True)]

# Assign the value of a new column called update as true to the rows where the value of the paragraph column is repeated, and neither the intro nor the point column is True
df.loc[(df['paragraph_id'].duplicated(keep=False)) & (df['intro'] != True) & (df['point'] != True), 'update'] = True

# Assign the rest of the rows to False
df.loc[df['update'] != True, 'update'] = False

In [14]:
# Find the indices where 'p_id' and 'original_id' are different
indices = df[df['p_id'] != df['original_id']].index

# Get the rows before and after each difference
rows = []
for index in indices:
    #if index > 0:
    #rows.append(df.iloc[index - 1])  # Row before difference
    rows.append(df.iloc[index])  # Row with the difference
    #if index < len(df) - 1:
        #rows.append(df.iloc[index + 1])  # Row after difference

# Create a new DataFrame from the selected rows
result_df = pd.DataFrame(rows)
result_df.head(30)


Unnamed: 0,article_id,paragraph_id,p_id,original_id,intro,intro_id,point,point_id,text,insertions,references,ins_id,ref_id,update
17,art_13,art_13__para_1,art_13__para_1__p_1,art_13__para_2__p_1,False,,False,,Il diritto esclusivo di riprodurre ha per ogge...,[],[],[],[],False
26,art_15-bis,art_15-bis__para_2-bis,art_15-bis__para_2-bis__p_1,art_15-bis__para_3__p_1,False,,False,,Agli organizzatori di spettacoli dal vivo alle...,[],[],[],[],False
27,art_15-bis,art_15-bis__para_2-ter,art_15-bis__para_2-ter__p_1,art_15-bis__para_4__p_1,False,,False,,Con decreto del Ministro dei beni e delle atti...,[((e gli altri organismi di gestione collettiv...,[],"[ins_7, ins_8, ins_9]",[],False
37,art_16-ter,art_16-ter__para_1,art_16-ter__para_1__p_1,art_16-ter__para_2__p_1,True,,False,,"Ai fini della presente legge per ""ritrasmissio...",[],[],[],[],False
38,art_16-ter,art_16-ter__para_1,art_16-ter__para_1__p_2,art_16-ter__para_2__p_2,False,art_16-ter__para_1,True,art_16-ter__para_1.__point_a,e' effettuata da un soggetto diverso dall'orga...,[],[],[],[],False
39,art_16-ter,art_16-ter__para_1,art_16-ter__para_1__p_3,art_16-ter__para_2__p_3,False,art_16-ter__para_1,True,art_16-ter__para_1.__point_b,e' effettuata su un servizio di accesso a inte...,[],[],[],[],False
40,art_16-ter,art_16-ter__para_2,art_16-ter__para_2__p_1,art_16-ter__para_3__p_1,False,,False,,La ritrasmissione di programmi televisivi o ra...,[],[],[],[],False
41,art_16-ter,art_16-ter__para_3,art_16-ter__para_3__p_1,art_16-ter__para_4__p_1,False,,False,,I titolari del diritto d'autore e dei diritti ...,[],[],[],[],False
42,art_16-ter,art_16-ter__para_4,art_16-ter__para_4__p_1,art_16-ter__para_5__p_1,False,,False,,Gli organismi di gestione collettiva di cui al...,[],[articolo 8 del decreto legislativo 15 marzo 2...,[],[/akn/it/act/decretoLegislativo/stato/2017-03-...,False
43,art_16-ter,art_16-ter__para_5,art_16-ter__para_5__p_1,art_16-ter__para_6__p_1,False,,False,,Quando i titolari del diritto non hanno trasfe...,[],[],[],[],False


In [15]:

# Show me the rows that contain only (( or )) in the text, and have the paragraph_id finishing with para_1
df[(df['text'] == '))') & (df['paragraph_id'].str.endswith('para_1'))]


Unnamed: 0,article_id,paragraph_id,p_id,original_id,intro,intro_id,point,point_id,text,insertions,references,ins_id,ref_id,update


In [17]:
# Show me where p_id and original_id are different, and the rows before and after each difference


# From this is visible that the discrepancy are due either by the presence of (( or )) in the text ex: line 18, art. 13
# Other noise in the text (Aggiornamenti)
# Or by the presence of a new paragraph in the text, which can be semantically meaningful

# On the other side, at line 28, art_15-bis, we spot a mistake in the original paragraph_id, as it is marked as par. 2 and they should be 3. and 4.

# Art 16-ter are wrong as per case one, and this influences the list

# Save the list of different paragraph_id in a list
list_of_different_paragraph_id = df[(df['text'] == '((') & (df['paragraph_id'].str.endswith('para_1'))]['article_id'].tolist()

# Get the rows where the article_id is in the dataframe
df[df['article_id'].isin(list_of_different_paragraph_id)].to_csv("data/csv/ins_parenthesis", index=False)


test = df[df['article_id'].isin(list_of_different_paragraph_id)]

# filter parenthesis by having either point or intro as true
test[(test['intro'] == False) & (test['point'] == False)]

# Loop through the dataframe
#for i, row in test.iterrows()
    # if the paragraph_id ends with para_1, the content of the text column is ((
        # if the value of the column article_id is the same as the next row
        # Delete the row iterating and set para_1 






Unnamed: 0,article_id,paragraph_id,p_id,original_id,intro,intro_id,point,point_id,text,insertions,references,ins_id,ref_id,update


In [26]:
# Show the rows that contain only (( or )) in the text column and the two rows below

df[(df['text'] == '((') | (df['text'] == '))')].head(3)




#df[(df['text'] == '((') | (df['text'] == '))')]



Unnamed: 0,article_id,paragraph_id,p_id,intro,intro_id,point,point_id,xml_id,text,insertions,references,ins_id,ref_id,update
17,art_13,art_13__para_1,art_13__para_1__p_1,False,,False,,art_13__para_1__p_1,((,[],[],[],[],True
19,art_13,art_13__para_3,art_13__para_3__p_1,False,,False,,art_13__para_3__p_1,)),[],[],[],[],False
39,art_16-ter,art_16-ter__para_1,art_16-ter__para_1__p_1,False,,False,,art_16-ter__para_1__p_1,((,[],[],[],[],True


In [25]:
# Aggiornamenti are the pieces of text with a misaligned paragraph_id and xml_id at para level
# which are not intro nor point (so they have intro and point set to False, nor have a p element composed exclusively by (( or ))

#test = split_id(df, 'p_id')

# Get the rows where neither the intro or the point is True, these are the pieces of text that can be mapped to # AGGIORNAMENTO
aggiornamenti = df[df['update'] == True]  
aggiornamenti.to_csv('data/csv/aggiornamenti.csv', index=False)
aggiornamenti.head(30)





Unnamed: 0,article_id,paragraph_id,p_id,intro,intro_id,point,point_id,xml_id,text,insertions,references,ins_id,ref_id,update
2,art_2,art_2__para_2,art_2__para_2__p_1,False,,False,,art_2__para_2__p_1,-------------------,[],[],[],[],True
3,art_2,art_2__para_2,art_2__para_2__p_2,False,,False,,art_2__para_2__p_2,AGGIORNAMENTO (26),[],[],[],[],True
4,art_2,art_2__para_2,art_2__para_2__p_3,False,,False,,art_2__para_2__p_3,"La L. 12 dicembre 2002, n. 273 , ha disposto (...",[],"[L. 12 dicembre 2002, n. 273, articolo 2, nume...",[],"[/akn/it/act/legge/stato/2002-12-12/273/!main,...",True
17,art_13,art_13__para_1,art_13__para_1__p_1,False,,False,,art_13__para_1__p_1,((,[],[],[],[],True
18,art_13,art_13__para_1,art_13__para_1__p_1,False,,False,,art_13__para_2__p_1,Il diritto esclusivo di riprodurre ha per ogge...,[],[],[],[],True
30,art_15-bis,art_15-bis__para_5,art_15-bis__para_5__p_1,False,,False,,art_15-bis__para_5__p_1,---------------,[],[],[],[],True
31,art_15-bis,art_15-bis__para_5,art_15-bis__para_5__p_2,False,,False,,art_15-bis__para_5__p_2,AGGIORNAMENTO (49),[],[],[],[],True
32,art_15-bis,art_15-bis__para_5,art_15-bis__para_5__p_3,False,,False,,art_15-bis__para_5__p_3,"Il D.L. 16 ottobre 2017, n. 148 , convertito c...",[],"[D.L. 16 ottobre 2017, n. 148, L. 4 dicembre 2...",[],[/akn/it/act/decretoLegge/stato/2017-10-16/148...,True
39,art_16-ter,art_16-ter__para_1,art_16-ter__para_1__p_1,False,,False,,art_16-ter__para_1__p_1,((,[],[],[],[],True
51,art_16-quater,art_16-quater__para_1,art_16-quater__para_1__p_1,False,,False,,art_16-quater__para_1__p_1,((,[],[],[],[],True


In [53]:
# Can I achieve a granularity at sentence level?

# Then assign the ontology at sentence level. with specific expressions at word level, as instances of that specific ontology.

In [54]:
#Merge the two dataframes, articles_text and articles_chapter, on the column eId, and save the result in a new dataframe called df_merged

df_merged = pd.merge(chapters, df, on='article_id')

#df_merged.head(10)

In [55]:
df.head(30)

Unnamed: 0,article_id,paragraph_id,p_id,intro,intro_id,point,point_id,xml_id,text,insertions,references,ins_id,ref_id
0,art_1,art_1__para_1,art_1__para_1__p_1,False,,False,,art_1__para_1__p_1,Sono protette ai sensi di questa legge le oper...,"[((, nonche' le banche di dati che per la scel...","[legge 20 giugno 1978, n. 399]",[ins_1],[/akn/it/act/legge/stato/1978-06-20/399/!main]
1,art_2,art_2__para_1,art_2__para_1__p_1,False,,False,,art_2__para_1__p_1,In particolare sono comprese nella protezione:...,[((26))],[],[ins_2],[]
2,art_2,art_2__para_2,art_2__para_2__p_1,False,,False,,art_2__para_2__p_1,-------------------,[],[],[],[]
3,art_2,art_2__para_2,art_2__para_2__p_2,False,,False,,art_2__para_2__p_2,AGGIORNAMENTO (26),[],[],[],[]
4,art_2,art_2__para_2,art_2__para_2__p_3,False,,False,,art_2__para_2__p_3,"La L. 12 dicembre 2002, n. 273 , ha disposto (...",[],"[L. 12 dicembre 2002, n. 273, articolo 2, nume...",[],"[/akn/it/act/legge/stato/2002-12-12/273/!main,..."
5,art_3,art_3__para_1,art_3__para_1__p_1,False,,False,,art_3__para_1__p_1,"Le opere collettive, costituite dalla riunione...",[],[],[],[]
6,art_4,art_4__para_1,art_4__para_1__p_1,False,,False,,art_4__para_1__p_1,Senza pregiudizio dei diritti esistenti sull'o...,[],[],[],[]
7,art_5,art_5__para_1,art_5__para_1__p_1,False,,False,,art_5__para_1__p_1,Le disposizioni di questa legge non si applica...,[],[],[],[]
8,art_6,art_6__para_1,art_6__para_1__p_1,False,,False,,art_6__para_1__p_1,Il titolo originario dell'acquisto del diritto...,[],[],[],[]
9,art_7,art_7__para_1,art_7__para_1__p_1,False,,False,,art_7__para_1__p_1,E' considerato autore dell'opera collettiva ch...,[],[],[],[]
