In [1]:
from bs4 import BeautifulSoup

with open("gdpr/gdpr.html", "r", encoding="utf-8") as f:
    gdpr_soup = BeautifulSoup(f, "html.parser")


with open("dga/dga.html", "r", encoding="utf-8") as f:
    dga_soup = BeautifulSoup(f, "html.parser")

## Parse the GDPR into a dictionary

In [2]:
import re

def parse_legal_texts():
    legal_texts = {}
    
    pattern = re.compile(r'art_\d+(?!\.tit_1)$')
    gdpr_articles = gdpr_soup.find_all("div", id=pattern)
        
    # GDPR
    gdpr_dict = {}
    for article in gdpr_articles:
        article_dict = {}
        points = article.find_all("div", class_="norm")
        article_num = article['id'].split('_')[-1]

        for point in points:    # "norm" in the document
            has_sub_points = point.find("div") and point.find("div").find("p")
            if point.find("div"):
                point_num = point.find('span').text.split('.')[0]

            if has_sub_points:
                sub_point_dict = {}
                for sub_point in point.find("div").find_all("div", class_=['grid-container', 'grid-list']):
                    suffix = None
                    if sub_point.find("div", class_="grid-list-column-2").find("p"):
                        suffix = sub_point.find("div", class_="grid-list-column-2").find("p").text
                    elif sub_point.find("div", class_="grid-list-column-2").find("div"):
                        suffix = sub_point.find("div", class_="grid-list-column-2").find("div").text
                    sentence = (point.find("div").find("p").text + " " +
                                suffix)

                    sub_point_num = sub_point.find('div').find('span').text.strip()
                    if sentence:
                        sub_point_dict[f"Subpoint {sub_point_num}"] = sentence
                article_dict[f"Point {point_num}"] = sub_point_dict

            elif point.find("div"):
                sentence = point.find("div").text

                if sentence:
                    article_dict[f"Point {point_num}"] = sentence
        gdpr_dict[f"Article {article_num}"] = article_dict

    legal_texts["GDPR"] = gdpr_dict

    # Data Governance Act (DGA)
    dga_articles = dga_soup.find_all("div", id=pattern)

    dga_dict = {}
    
    dga_articles = dga_soup.find_all("div", id=pattern)

    for article in dga_articles:
        article_dict = {}
        points = article.find_all("div", id=re.compile(r'\d+\.\d+'))
        article_num = article['id'].split('_')[-1]

        for point in points:
            point_num = int(point['id'].split('.')[-1])
            point_content = point.find("p", class_="oj-normal")
            if point_content:
                point_text = re.sub(r'^\d+\.\s*', '', point_content.text).strip()
                article_dict[f"Point {point_num}"] = point_text

            sub_points = point.find_all("table")
            sub_point_dict = {}
            for sub_point in sub_points:
                sub_point_num = sub_point.find("p", class_="oj-normal").text.strip('() ')
                sub_point_content = sub_point.find("td", valign="top").find_next_sibling("td").find("p", class_="oj-normal")
                if sub_point_content:
                    sub_point_text = f"{point_text} {sub_point_content.text.strip()}"
                    sub_point_dict[f"Subpoint ({sub_point_num})"] = sub_point_text
            if sub_point_dict:
                article_dict[f"Point {point_num}"] = sub_point_dict

        dga_dict[f"Article {article_num}"] = article_dict

    legal_texts["DGA"] = dga_dict

    return legal_texts
legal_texts = parse_legal_texts()
legal_texts

{'GDPR': {'Article 1': {'Point 1': 'This Regulation lays down rules relating to the protection of natural persons with regard to the processing of personal data and rules relating to the free movement of personal data.',
   'Point 2': 'This Regulation protects fundamental rights and freedoms of natural persons and in particular their right to the protection of personal data.',
   'Point 3': 'The free movement of personal data within the Union shall be neither restricted nor prohibited for reasons connected with the protection of natural persons with regard to the processing of personal data.'},
  'Article 2': {'Point 1': 'This Regulation applies to the processing of personal data wholly or partly by automated means and to the processing other than by automated means of personal data which form part of a filing system or are intended to form part of a filing system.',
   'Point 2': {'Subpoint (a)': 'This Regulation does not apply to the processing of personal data: in the course of an a

In [3]:
legal_texts["GDPR"]["Article 70"]["Point 1"]["Subpoint (b)"]

'The Board shall ensure the consistent application of this Regulation. To that end, the Board shall, on its own initiative or, where relevant, at the request of the Commission, in particular: advise the Commission on any issue related to the protection of personal data in the Union, including on any proposed amendment of this Regulation;'

In [4]:
legal_texts["DGA"]["Article 2"]


{}

In [5]:
legal_texts["GDPR"]["Article 5"]

{'Point 1': {'Subpoint (a)': 'Personal data shall be: processed lawfully, fairly and in a transparent manner in relation to the data subject (‘lawfulness, fairness and transparency’);',
  'Subpoint (b)': 'Personal data shall be: collected for specified, explicit and legitimate purposes and not further processed in a manner that is incompatible with those purposes; further processing for archiving purposes in the public interest, scientific or historical research purposes or statistical purposes shall, in accordance with Article 89(1), not be considered to be incompatible with the initial purposes (‘purpose limitation’);',
  'Subpoint (c)': 'Personal data shall be: adequate, relevant and limited to what is necessary in relation to the purposes for which they are processed (‘data minimisation’);',
  'Subpoint (d)': 'Personal data shall be: accurate and, where necessary, kept up to date; every reasonable step must be taken to ensure that personal data that are inaccurate, having regard to

## How to link different Articles

My idea for a quick prototype is to go through each sentence and find all the references to articles in this format: "Article X", "Article X(Y)" and "Article X(Y)(z)", probably using regular expressions. From this, I will make a simple link between the source and destination sentences / sections. 

This will allow me to refer to different layers of the act such as individual sentences, all the way up to entire Articles.

In [6]:
def find_internal_links_in_legal_text():
    """
    Look for all mentions of "Article X" within the GDPR.
    """
    references = []
    
    # pattern = re.compile(r"Article \d+(\(\d+\))?([a-z])?")
    pattern = re.compile(r"Article")

    for article_name, article_dict in legal_texts["GDPR"].items():
        for point_name, point_val in article_dict.items():
            if type(point_val) is dict:
                # It has subpoints
                for subpoint_name, sentence in point_val.items():
                    match = re.search(pattern, sentence)
                    if match:  # This only matches the first one, look into matching multiple Articles
                        references.append({"head": article_name, "tail": match.group()})
                        print(match.group())
                        print(sentence)
            else:
                # It doesn't have subpoints
                match = re.search(pattern, point_val)
                if match:  # This only matches the first one, look into matching multiple Articles
                    references.append({"head": article_name, "tail": match.group()})
                    print(match.group())
                    print(sentence)
    
    return references
references = find_internal_links_in_legal_text()
references

Article
This Regulation does not apply to the processing of personal data: by competent authorities for the purposes of the prevention, investigation, detection or prosecution of criminal offences or the execution of criminal penalties, including the safeguarding against and the prevention of threats to public security.
Article
This Regulation does not apply to the processing of personal data: by competent authorities for the purposes of the prevention, investigation, detection or prosecution of criminal offences or the execution of criminal penalties, including the safeguarding against and the prevention of threats to public security.
Article
Personal data shall be: collected for specified, explicit and legitimate purposes and not further processed in a manner that is incompatible with those purposes; further processing for archiving purposes in the public interest, scientific or historical research purposes or statistical purposes shall, in accordance with Article 89(1), not be consi

[{'head': 'Article 2', 'tail': 'Article'},
 {'head': 'Article 2', 'tail': 'Article'},
 {'head': 'Article 5', 'tail': 'Article'},
 {'head': 'Article 5', 'tail': 'Article'},
 {'head': 'Article 6', 'tail': 'Article'},
 {'head': 'Article 6', 'tail': 'Article'},
 {'head': 'Article 6', 'tail': 'Article'},
 {'head': 'Article 6', 'tail': 'Article'},
 {'head': 'Article 6', 'tail': 'Article'},
 {'head': 'Article 8', 'tail': 'Article'},
 {'head': 'Article 9', 'tail': 'Article'},
 {'head': 'Article 11', 'tail': 'Article'},
 {'head': 'Article 12', 'tail': 'Article'},
 {'head': 'Article 12', 'tail': 'Article'},
 {'head': 'Article 12', 'tail': 'Article'},
 {'head': 'Article 12', 'tail': 'Article'},
 {'head': 'Article 12', 'tail': 'Article'},
 {'head': 'Article 12', 'tail': 'Article'},
 {'head': 'Article 12', 'tail': 'Article'},
 {'head': 'Article 12', 'tail': 'Article'},
 {'head': 'Article 13', 'tail': 'Article'},
 {'head': 'Article 13', 'tail': 'Article'},
 {'head': 'Article 13', 'tail': 'Article'},

# Set up LLM

In [8]:
import os
from huggingface_hub import login 
import json
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts.few_shot import FewShotPromptTemplate
from langchain_community.llms import HuggingFaceEndpoint

token = os.getenv('HF_TOKEN')
login(token=token)
model_name = "mistralai/mixtral-8x7b-instruct-v0.1"

with open("article_linking_few_shot_examples.json", "r") as file:
    few_shot_examples = json.load(file)


example_prompt = PromptTemplate(input_variables=["input", "output"], template="input: {input}\noutput:{output}")

example_prompt.format(**few_shot_examples[0])

prompt = FewShotPromptTemplate(
    examples=few_shot_examples,
    example_prompt=example_prompt,
    prefix= "Return a list consisting of all the articles mentioned in the input.",
    suffix="input: {input}\noutput",
    input_variables=["input"],
)

llm= HuggingFaceEndpoint(repo_id=model_name, max_new_tokens=512, temperature=0.05, repetition_penalty=1.25, huggingfacehub_api_token=token)
chain = prompt|llm

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/dylan/.cache/huggingface/token
Login successful
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/dylan/.cache/huggingface/token
Login successful


In [13]:
chain.invoke("The controller shall facilitate the exercise of data subject rights under Articles 15 to 22. 2In the cases referred to in Article 11(2), the controller shall not refuse to act on the request of the data subject for exercising his or her rights under Articles 15 to 22, unless the controller demonstrates that it is not in a position to identify the data subject.")

" = ['Article(15)', 'Article(16)', 'Article(17)', 'Article(18)', 'Article(19)', 'Article(20)', 'Article(21)' ,'Article(11)(2)']"

In [9]:
sample_sentence = """As  regards  the  subject  matter,  the  compromise  text  of  Article  1(1)  includes  a  high-level 
statement that one of the purposes of the AI Act  is  to  ensure a high level of protection of 
health, safety  and fundamental  rights  enshrined in the Charter, which includes democracy, 
rule of law and environmental protection. However, all subsequent references in the text to 
the risks  addressed by the Regulation  only include risks to  health, safety and fundamental 
rights, in line with the Council’s mandate. 
Concerning the scope, the compromise text makes it clear that national security is excluded. 
The wording concerning this exclusion in Article 2(3) has been fine tuned to align it more 
closely  with  the  respective  language  used  in  recently  agreed  legal  acts,  such  as  the  Cyber 
Resilience  Act  and  the  Data  Act,  while  the  text  of  the  corresponding  Recital  12a  has 
remained the same as in the Council’s mandate.  """
chain.invoke(sample_sentence)

': []'

In [8]:
def find_external_links_in_legal_text():
    """
    Look for all mentions of GDPR (Regulation (EU) 2016/679) in the DGA
    """
    references = []
    for article_name, article_dict in legal_texts["DGA"].items():
        for point_name, point_val in article_dict.items():
            if type(point_val) is dict:
                # It has subpoints
                for subpoint_name, sentence in point_val.items():
                    if "Regulation (EU) 2016/679" in sentence:
                        references.append({"head": "DGA: " + article_name + ", " + point_name, "tail": "GDPR"})
            else:
                # It doesn't have subpoints
                if "Regulation (EU) 2016/679" in point_val:
                    references.append({"head": "DGA: " + article_name + ", " + point_name, "tail": "GDPR"})
    return references

references = find_external_links_in_legal_text()
references

[{'head': 'DGA: Article 5, Point 6', 'tail': 'GDPR'},
 {'head': 'DGA: Article 9, Point 2', 'tail': 'GDPR'},
 {'head': 'DGA: Article 25, Point 3', 'tail': 'GDPR'}]

## Display the Knowledge Graph

In [9]:
import networkx as nx
from pyvis.network import Network

G = nx.DiGraph()

# Iterate through data to add edges to the graph
for item in references:
    G.add_edge(item["head"], item["tail"])

# Initialize PyVis network
net = Network(notebook=False, height="750px", width="100%")
net.from_nx(G)

for edge in net.edges:
    edge["arrows"] = "to"

# Customize the visualization
net.show_buttons(filter_=['physics'])
net.toggle_physics(True)

# Save the visualization to a html file
net.show("gdpr/gdpr_article_relations.html")

ModuleNotFoundError: No module named 'networkx'

In [None]:
# Make a KG of external references



## Create a Knowledge Graph for each article

I want it in the form
```python
relations = 
[
    {"Article 1": 
        [
            {"head": "This regulation", "relation": "protects", "tail": "natural persons"},
            ...
        ]
    "Article 2":
    ...
]
    
            

In [None]:
def find_relations():
    # TODO: Complete
    relations = []

    # pattern = re.compile(r"Article \d+(\(\d+\))?([a-z])?")
    pattern = re.compile(r"Article \d+")

    for article_name, article_dict in legal_texts["GDPR"].items():
        for point_name, point_val in article_dict.items():
            if type(point_val) is dict:
                # It has subpoints
                for subpoint_name, sentence in point_val.items():
                    match = re.search(pattern, sentence)
                    if match:  # This only matches the first one, look into matching multiple Articles
                        references.append({"head": article_name, "tail": match.group()})
                        print(match.group())
                        print(sentence)
            else:
                # It doesn't have subpoints
                match = re.search(pattern, point_val)
                if match:  # This only matches the first one, look into matching multiple Articles
                    references.append({"head": article_name, "tail": match.group()})
                    print(match.group())
                    print(sentence)

    return references
