In [14]:
# Import the XML and pandas libraries plus the akn_to_owl parser helpers.
# The input file, 19410716_041U0633_VIGENZA_20220922.xml, is in the Akoma Ntoso format and is parsed in the next cell.
import xml.etree.ElementTree as ET
import pandas as pd
from json import dumps
from akn_to_owl.parser import extract_articles, extract_chapters, transform_intro_points
from akn_to_owl.parser import process_articles, fill_column


In [15]:
# Parse the Akoma Ntoso source document and keep only its root element
root = ET.parse(
    source='data/akn/19410716_041U0633_VIGENZA_20220922.xml'
).getroot()

In [16]:
# Build the chapters frame from the document root and persist it as CSV (no index column)
chapters = extract_chapters(root)
chapters.to_csv(path_or_buf='data/csv/chapters.csv', index=False)

In [17]:
# Extract the articles from the parsed document root; the result feeds process_articles in the next cell.
# NOTE(review): the shape of the returned value is defined in akn_to_owl.parser and is not visible here.
articles = extract_articles(root)

In [18]:
# Process the extracted articles into paragraph_list, then build the working DataFrame from it
paragraph_list = process_articles(articles)
df = pd.DataFrame(data=paragraph_list)


In [19]:
# Keep the paragraph_id / p_id pairs as they are at this stage in their own DataFrame,
# then write that key table to CSV (no index column)
akoma_ntoso_key = df.loc[:, ['paragraph_id', 'p_id']]
akoma_ntoso_key.to_csv('data/csv/akoma_ntoso_key.csv', index=False)


In [20]:
# Three steps, written as a single pipeline:
#   1. fill_column - fixes the misalignment between the paragraph_id and p_id columns,
#      writing to "test_id"  ## TODO: fix the function name and content
#   2. rename      - the original p_id becomes xml_id, and test_id takes over the name p_id
#   3. selection   - put the columns of the dataframe in their final order
df = (
    fill_column(df, 'paragraph_id', 'p_id', "test_id")
    .rename(columns={'p_id': 'xml_id', 'test_id': 'p_id'})
    .loc[:, [
        'article_id', 'paragraph_id', 'p_id', 'intro_id', 'point_id', 'xml_id',
        'text', 'insertions', 'references', 'ins_id', 'ref_id',
    ]]
)

In [21]:
# Transforms the bullet lists into a machine-readable format, coherent with the rest of the database schema.
# NOTE(review): later cells read df['intro'] and df['point'], which are not in the column order
# selected before this call - presumably transform_intro_points adds them; confirm.
df = transform_intro_points(df)

In [22]:
# Rows where either intro or point is True; these are the rows that can be mapped to lists
pointed_list = df[df['intro'].eq(True) | df['point'].eq(True)]

# Persist the list rows as CSV (no index column)
pointed_list.to_csv('data/csv/pointed_list.csv', index=False)

In [23]:
# Flag as 'update' the rows whose paragraph_id is repeated (keep=False marks every
# occurrence, not only the later ones) and that are neither a list intro nor a list point.
#
# The mask is built once and assigned directly, so 'update' is a real boolean column:
# True where the condition holds and False everywhere else. The previous version
# evaluated the same filter as a bare expression in the middle of the cell (its result
# was discarded, since only the last expression of a cell is displayed), then set True
# through .loc and patched the remaining NaN values to False in a second pass.
#
# .ne(True) is used instead of a plain negation so that missing values in 'intro' or
# 'point' still count as "not True", exactly as `!= True` did.
#
# NOTE(review): rows whose text is only '((' or '))' are not excluded by this mask -
# confirm whether they should be.
df['update'] = (
    df['paragraph_id'].duplicated(keep=False)
    & df['intro'].ne(True)
    & df['point'].ne(True)
)

In [24]:
# Display the rows whose text column is exactly '((' or exactly '))'
df.loc[df['text'].eq('((') | df['text'].eq('))')]

Unnamed: 0,article_id,paragraph_id,p_id,intro,intro_id,point,point_id,xml_id,text,insertions,references,ins_id,ref_id,update
17,art_13,art_13__para_1,art_13__para_1__p_1,False,,False,,art_13__para_1__p_1,((,[],[],[],[],True
19,art_13,art_13__para_3,art_13__para_3__p_1,False,,False,,art_13__para_3__p_1,)),[],[],[],[],False
39,art_16-ter,art_16-ter__para_1,art_16-ter__para_1__p_1,False,,False,,art_16-ter__para_1__p_1,((,[],[],[],[],True
50,art_16-ter,art_16-ter__para_10,art_16-ter__para_10__p_1,False,,False,,art_16-ter__para_10__p_1,)),[],[],[],[],False
51,art_16-quater,art_16-quater__para_1,art_16-quater__para_1__p_1,False,,False,,art_16-quater__para_1__p_1,((,[],[],[],[],True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,art_182-ter,art_182-ter__para_3,art_182-ter__para_3__p_1,False,,False,,art_182-ter__para_3__p_1,)),[],[],[],[],False
1172,art_194-bis,art_194-bis__para_1,art_194-bis__para_1__p_1,False,,False,,art_194-bis__para_1__p_1,((,[],[],[],[],True
1183,art_194-bis,art_194-bis__para_10,art_194-bis__para_10__p_1,False,,False,,art_194-bis__para_10__p_1,)),[],[],[],[],False
1198,art_199-bis,art_199-bis__para_1,art_199-bis__para_1__p_1,False,,False,,art_199-bis__para_1__p_1,((,[],[],[],[],True


In [25]:
# "Aggiornamenti" are the rows flagged update == True: pieces of text whose paragraph_id
# is repeated and that are neither a list intro nor a list point. They can be mapped to
# AGGIORNAMENTO.
# NOTE(review): rows whose p element consists only of '((' or '))' are meant to be left out,
# but the rendered output of this cell still lists '((' rows - confirm and filter if needed.
aggiornamenti = df.loc[df['update'].eq(True)]

# Persist the selection as CSV (no index column), then preview the first 30 rows
aggiornamenti.to_csv('data/csv/aggiornamenti.csv', index=False)
aggiornamenti.head(30)





Unnamed: 0,article_id,paragraph_id,p_id,intro,intro_id,point,point_id,xml_id,text,insertions,references,ins_id,ref_id,update
2,art_2,art_2__para_2,art_2__para_2__p_1,False,,False,,art_2__para_2__p_1,-------------------,[],[],[],[],True
3,art_2,art_2__para_2,art_2__para_2__p_2,False,,False,,art_2__para_2__p_2,AGGIORNAMENTO (26),[],[],[],[],True
4,art_2,art_2__para_2,art_2__para_2__p_3,False,,False,,art_2__para_2__p_3,"La L. 12 dicembre 2002, n. 273 , ha disposto (...",[],"[L. 12 dicembre 2002, n. 273, articolo 2, nume...",[],"[/akn/it/act/legge/stato/2002-12-12/273/!main,...",True
17,art_13,art_13__para_1,art_13__para_1__p_1,False,,False,,art_13__para_1__p_1,((,[],[],[],[],True
18,art_13,art_13__para_1,art_13__para_1__p_1,False,,False,,art_13__para_2__p_1,Il diritto esclusivo di riprodurre ha per ogge...,[],[],[],[],True
30,art_15-bis,art_15-bis__para_5,art_15-bis__para_5__p_1,False,,False,,art_15-bis__para_5__p_1,---------------,[],[],[],[],True
31,art_15-bis,art_15-bis__para_5,art_15-bis__para_5__p_2,False,,False,,art_15-bis__para_5__p_2,AGGIORNAMENTO (49),[],[],[],[],True
32,art_15-bis,art_15-bis__para_5,art_15-bis__para_5__p_3,False,,False,,art_15-bis__para_5__p_3,"Il D.L. 16 ottobre 2017, n. 148 , convertito c...",[],"[D.L. 16 ottobre 2017, n. 148, L. 4 dicembre 2...",[],[/akn/it/act/decretoLegge/stato/2017-10-16/148...,True
39,art_16-ter,art_16-ter__para_1,art_16-ter__para_1__p_1,False,,False,,art_16-ter__para_1__p_1,((,[],[],[],[],True
51,art_16-quater,art_16-quater__para_1,art_16-quater__para_1__p_1,False,,False,,art_16-quater__para_1__p_1,((,[],[],[],[],True


In [53]:
# Can I achieve a granularity at sentence level?

# Then assign the ontology at sentence level, with specific expressions at word level as instances of that specific ontology.

In [54]:
# Join the chapters frame with the paragraph-level frame df on their shared 'article_id'
# column (pandas' default inner join) and keep the result in df_merged
df_merged = chapters.merge(df, on='article_id')

In [55]:
# Preview the first 30 rows of df.
# NOTE(review): the output stored with this cell has no 'update' column although an earlier
# cell adds one, and the execution counter jumps from 25 to 53 - the notebook was probably
# run out of order; re-run it top to bottom on a fresh kernel to confirm.
df.head(30)

Unnamed: 0,article_id,paragraph_id,p_id,intro,intro_id,point,point_id,xml_id,text,insertions,references,ins_id,ref_id
0,art_1,art_1__para_1,art_1__para_1__p_1,False,,False,,art_1__para_1__p_1,Sono protette ai sensi di questa legge le oper...,"[((, nonche' le banche di dati che per la scel...","[legge 20 giugno 1978, n. 399]",[ins_1],[/akn/it/act/legge/stato/1978-06-20/399/!main]
1,art_2,art_2__para_1,art_2__para_1__p_1,False,,False,,art_2__para_1__p_1,In particolare sono comprese nella protezione:...,[((26))],[],[ins_2],[]
2,art_2,art_2__para_2,art_2__para_2__p_1,False,,False,,art_2__para_2__p_1,-------------------,[],[],[],[]
3,art_2,art_2__para_2,art_2__para_2__p_2,False,,False,,art_2__para_2__p_2,AGGIORNAMENTO (26),[],[],[],[]
4,art_2,art_2__para_2,art_2__para_2__p_3,False,,False,,art_2__para_2__p_3,"La L. 12 dicembre 2002, n. 273 , ha disposto (...",[],"[L. 12 dicembre 2002, n. 273, articolo 2, nume...",[],"[/akn/it/act/legge/stato/2002-12-12/273/!main,..."
5,art_3,art_3__para_1,art_3__para_1__p_1,False,,False,,art_3__para_1__p_1,"Le opere collettive, costituite dalla riunione...",[],[],[],[]
6,art_4,art_4__para_1,art_4__para_1__p_1,False,,False,,art_4__para_1__p_1,Senza pregiudizio dei diritti esistenti sull'o...,[],[],[],[]
7,art_5,art_5__para_1,art_5__para_1__p_1,False,,False,,art_5__para_1__p_1,Le disposizioni di questa legge non si applica...,[],[],[],[]
8,art_6,art_6__para_1,art_6__para_1__p_1,False,,False,,art_6__para_1__p_1,Il titolo originario dell'acquisto del diritto...,[],[],[],[]
9,art_7,art_7__para_1,art_7__para_1__p_1,False,,False,,art_7__para_1__p_1,E' considerato autore dell'opera collettiva ch...,[],[],[],[]
