In [None]:
from bs4 import BeautifulSoup

with open("gdpr/gdpr.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

## Parse the GDPR into a dictionary

In [9]:
import re

def parse_legal_texts():
    legal_texts = {}
    
    pattern = re.compile(r'art_\d+(?!\.tit_1)$')
    articles = soup.find_all("div", id=pattern)
        
    # GDPR
    gdpr_dict = {}
    for article in articles:
        article_dict = {}
        points = article.find_all("div", class_="norm")
        article_num = article['id'].split('_')[-1]
        
        for point in points:    # "norm" in the document
            has_sub_points = point.find("div") and point.find("div").find("p")
            if point.find("div"):
                point_num = point.find('span').text.split('.')[0]
            
            if has_sub_points:
                sub_point_dict = {}
                for sub_point in point.find("div").find_all("div", class_=['grid-container', 'grid-list']):
                    suffix = None
                    if sub_point.find("div", class_="grid-list-column-2").find("p"):
                        suffix = sub_point.find("div", class_="grid-list-column-2").find("p").text
                    elif sub_point.find("div", class_="grid-list-column-2").find("div"):
                        suffix = sub_point.find("div", class_="grid-list-column-2").find("div").text
                    sentence = (point.find("div").find("p").text + " " +
                                suffix)

                    sub_point_num = sub_point.find('div').find('span').text.strip()
                    if sentence:
                        sub_point_dict[f"Subpoint {sub_point_num}"] = sentence
                article_dict[f"Point {point_num}"] = sub_point_dict
                
            elif point.find("div"):
                sentence = point.find("div").text

                if sentence:
                    article_dict[f"Point {point_num}"] = sentence
        gdpr_dict[f"Article {article_num}"] = article_dict
    
    legal_texts["GDPR"] = gdpr_dict
    return legal_texts
legal_texts = parse_legal_texts()
legal_texts

{'GDPR': {'Article 1': {'Point 1': 'This Regulation lays down rules relating to the protection of natural persons with regard to the processing of personal data and rules relating to the free movement of personal data.',
   'Point 2': 'This Regulation protects fundamental rights and freedoms of natural persons and in particular their right to the protection of personal data.',
   'Point 3': 'The free movement of personal data within the Union shall be neither restricted nor prohibited for reasons connected with the protection of natural persons with regard to the processing of personal data.'},
  'Article 2': {'Point 1': 'This Regulation applies to the processing of personal data wholly or partly by automated means and to the processing other than by automated means of personal data which form part of a filing system or are intended to form part of a filing system.',
   'Point 2': {'Subpoint (a)': 'This Regulation does not apply to the processing of personal data: in the course of an a

In [10]:
legal_texts["GDPR"]["Article 70"]["Point 1"]["Subpoint (b)"]

'The Board shall ensure the consistent application of this Regulation. To that end, the Board shall, on its own initiative or, where relevant, at the request of the Commission, in particular: advise the Commission on any issue related to the protection of personal data in the Union, including on any proposed amendment of this Regulation;'

## How to link different Articles

My idea for a quick prototype is to go through each sentence and find all the references to articles in this format: "Article X", "Article X(Y)" and "Article X(Y)(z)", probably using regular expressions. From this, I will make a simple link between the source and destination sentences / sections. 

This will allow me to refer to different layers of the act such as individual sentences, all the way up to entire Articles.

In [53]:
# Loop through the sentences

def find_links_in_legal_text():
    references = []
    
    # pattern = re.compile(r"Article \d+(\(\d+\))?([a-z])?")
    pattern = re.compile(r"Article \d+")

    for article_name, article_dict in legal_texts["GDPR"].items():
        for point_name, point_val in article_dict.items():
            if type(point_val) is dict:
                # It has subpoints
                for subpoint_name, sentence in point_val.items():
                    match = re.search(pattern, sentence)
                    if match:  # This only matches the first one, look into matching multiple Articles
                        references.append({"head": article_name, "tail": match.group()})
                        print(match.group())
                        print(sentence)
            else:
                # It doesn't have subpoints
                match = re.search(pattern, point_val)
                if match:  # This only matches the first one, look into matching multiple Articles
                    references.append({"head": article_name, "tail": match.group()})
                    print(match.group())
                    print(sentence)
    
    return references
references = find_links_in_legal_text()
references

Article 98
This Regulation does not apply to the processing of personal data: by competent authorities for the purposes of the prevention, investigation, detection or prosecution of criminal offences or the execution of criminal penalties, including the safeguarding against and the prevention of threats to public security.
Article 89
Personal data shall be: collected for specified, explicit and legitimate purposes and not further processed in a manner that is incompatible with those purposes; further processing for archiving purposes in the public interest, scientific or historical research purposes or statistical purposes shall, in accordance with Article 89(1), not be considered to be incompatible with the initial purposes (‘purpose limitation’);
Article 89
Personal data shall be: kept in a form which permits identification of data subjects for no longer than is necessary for the purposes for which the personal data are processed; personal data may be stored for longer periods insofa

[{'head': 'Article 2', 'tail': 'Article 98'},
 {'head': 'Article 5', 'tail': 'Article 89'},
 {'head': 'Article 5', 'tail': 'Article 89'},
 {'head': 'Article 6', 'tail': 'Article 23'},
 {'head': 'Article 6', 'tail': 'Article 23'},
 {'head': 'Article 6', 'tail': 'Article 23'},
 {'head': 'Article 6', 'tail': 'Article 23'},
 {'head': 'Article 6', 'tail': 'Article 23'},
 {'head': 'Article 8', 'tail': 'Article 6'},
 {'head': 'Article 9', 'tail': 'Article 89'},
 {'head': 'Article 12', 'tail': 'Article 11'},
 {'head': 'Article 12', 'tail': 'Article 11'},
 {'head': 'Article 12', 'tail': 'Article 92'},
 {'head': 'Article 13', 'tail': 'Article 6'},
 {'head': 'Article 13', 'tail': 'Article 46'},
 {'head': 'Article 13', 'tail': 'Article 6'},
 {'head': 'Article 13', 'tail': 'Article 22'},
 {'head': 'Article 14', 'tail': 'Article 46'},
 {'head': 'Article 14', 'tail': 'Article 6'},
 {'head': 'Article 14', 'tail': 'Article 6'},
 {'head': 'Article 14', 'tail': 'Article 22'},
 {'head': 'Article 14', 'tai

In [61]:
import networkx as nx
from pyvis.network import Network

G = nx.DiGraph()

# Iterate through data to add edges to the graph
for item in references:
    G.add_edge(item["head"], item["tail"], )

# Initialize PyVis network
net = Network(notebook=False, height="750px", width="100%")
net.from_nx(G)

for edge in net.edges:
    edge["arrows"] = "to"

# Customize the visualization
net.show_buttons(filter_=['physics'])
net.toggle_physics(True)

# Save the visualization to a html file
net.show("gdpr/gdpr_article_relations.html")