In [2]:
import copy
import json
import os
import re
import time
import urllib

import requests
from bs4 import BeautifulSoup

In [3]:
# Page on ontario.ca whose paragraphs are scraped by the cells below.
url = 'https://www.ontario.ca/laws/regulation/120332'
# Without a timeout requests waits indefinitely if the server never answers.
response = requests.get(url, timeout=30)
print(response)
# Stop here on a 4xx/5xx status instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

<Response [200]>


In [4]:
# Every <p> whose CSS class is one of the four structural levels, in the
# order find_all returns them.
items = soup.find_all(
    'p',
    class_=['section-e', 'subsection-e', 'clause-e', 'subclause-e'],
)

In [5]:
def toJSON(title, obj):
    """Write ``obj`` as indented UTF-8 JSON to ``Data/<title>.json``.

    Args:
        title: File name without extension; it is appended to the ``Data/``
            prefix, relative to the current working directory.
        obj: Any object ``json.dump`` can serialise.

    Returns:
        None.

    The target directory is created when it does not exist yet, so the call
    no longer fails with ``FileNotFoundError`` on a fresh checkout.
    """
    path = 'Data/' + title + '.json'
    # open(..., 'w') creates the file but never its parent directory.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(obj, f, ensure_ascii=False, indent=4)

In [6]:
def formatText(text):
    """Return the first run of plain text found in a piece of markup.

    ``text`` is converted with ``str()``, so any object with a string form is
    accepted. Bold and italic tags are removed first. Then everything up to
    and including the first ``>`` is dropped, and everything from the next
    ``<`` onwards is dropped as well.

    Consequently any text that follows the first remaining nested tag is
    discarded, and a string containing neither ``<`` nor ``>`` is returned
    unchanged.
    """
    markup = str(text)

    # Two separate passes, in this order, exactly like a pair of re.sub calls.
    for pattern in (r"<b>|</b>", r"<i>|</i>"):
        markup = re.sub(pattern, "", markup)

    # Drop the opening tag: keep only what follows the first ">", if any.
    _, closing_bracket, remainder = markup.partition(">")
    if closing_bracket:
        markup = remainder

    # Keep only what precedes the next "<"; partition returns the whole
    # string as the head when no "<" is present.
    return markup.partition("<")[0]

In [7]:
def label(list):
    """Tag each element with its structural level and cleaned text.

    Every element must expose ``attrs['class']``. When that value equals
    exactly ``['section-e']``, ``['subsection-e']``, ``['clause-e']`` or
    ``['subclause-e']`` a dict ``{"type": <level>, "text": formatText(element)}``
    is appended to the result; elements with any other class list are skipped.

    Returns:
        A new list of the labelled dicts, in input order.
    """
    # (CSS class, human-readable level) pairs, checked in this order.
    levels = (
        ("section-e", "Section"),
        ("subsection-e", "Sub Section"),
        ("clause-e", "Clause"),
        ("subclause-e", "Sub Clause"),
    )

    labeled = []
    for element in list:
        classes = element.attrs['class']
        for css_class, level in levels:
            # Exact list comparison: an element carrying extra classes
            # does not match any level.
            if classes == [css_class]:
                labeled.append({"type": level, "text": formatText(element)})
                break

    return labeled

In [8]:
# Flat list of {"type": ..., "text": ...} dicts, one per recognised paragraph.
labeled = label(items)

In [9]:
def partitionPoints(list, type):
    """Locate every item of a given type in a labelled list.

    Args:
        list: Sequence of dicts, each with a ``"type"`` key.
        type: The ``"type"`` value to look for.

    Returns:
        A two-element list ``[points, titles]`` where ``points`` holds the
        positions of the matching items and ``titles`` holds the matching
        items themselves, both in input order.
    """
    points = []
    titles = []

    # enumerate yields the position of *this* occurrence. Looking the item up
    # with list.index() would return the first entry that compares equal, so
    # two dicts with the same type and text would both get the earlier
    # position (and the lookup would rescan the list on every match).
    for position, item in enumerate(list):
        if item["type"] == type:
            points.append(position)
            titles.append(item)

    return [points, titles]

In [10]:
def partition(list, type):
    """Split a labelled list into the groups that follow each marker item.

    An item whose ``"type"`` equals ``type`` is a marker. For every marker the
    result contains one group: the items strictly after that marker and
    strictly before the next marker (or up to the end of the list for the
    last marker). Markers themselves are not part of any group, and items
    that precede the first marker are not returned.

    Args:
        list: Sequence of dicts, each with a ``"type"`` key.
        type: The ``"type"`` value that marks the start of a group.

    Returns:
        A list of groups (slices of ``list``), one per marker, in order.
    """
    # Positions are taken from enumerate rather than from value look-ups
    # (list.index / points.index): look-ups return the first *equal* entry, so
    # two markers with identical text used to collapse onto the same position
    # and produce empty or wrong groups.
    points = [position for position, item in enumerate(list) if item["type"] == type]

    # Each group ends where the next marker sits; None lets the final group
    # run to the end of the list.
    ends = points[1:] + [None]

    return [list[start + 1:end] for start, end in zip(points, ends)]

In [11]:
def correlate(partitions, titles, partition_type):
    """Pair each partition with the title at the same position.

    Args:
        partitions: Sequence of groups; its length drives the iteration.
        titles: Sequence indexed in step with ``partitions``; it must be at
            least as long (a shorter one raises ``IndexError``), and surplus
            titles are ignored.
        partition_type: Key under which the title is stored in each entry.

    Returns:
        A list of ``{partition_type: title, "Items": partition}`` dicts.
    """
    return [
        {partition_type: titles[position], "Items": group}
        for position, group in enumerate(partitions)
    ]

In [12]:
def filterByType(l, types):
    """Return a new list with every entry whose ``"type"`` is in ``types`` removed.

    Args:
        l: Iterable of dicts, each with a ``"type"`` key.
        types: Container of type names to exclude.
    """
    return [entry for entry in l if entry["type"] not in types]

In [13]:
def cleanByType(l, t):
    """Apply ``filterByType`` to every group in ``l``.

    Args:
        l: Iterable of groups, each an iterable of dicts with a ``"type"`` key.
        t: Container of type names to drop from every group.

    Returns:
        A new list holding one filtered list per input group, in order.
    """
    return [filterByType(group, t) for group in l]

In [14]:
# Partition & Extract Sections
# `sections` holds the Section entries themselves; `section_partitions` holds,
# for each of them, the items that follow it up to the next Section.
# The empty exclusion list means nothing is filtered out of the groups.

sections = partitionPoints(labeled, "Section")[1]
section_partitions = cleanByType(partition(labeled, "Section"), [])

In [15]:
# Partition & Extract Sub Sections
# Same split as for Sections, keyed on "Sub Section" entries; any "Section"
# entry that falls inside a group is removed from it.

sub_sections = partitionPoints(labeled, "Sub Section")[1]
sub_section_partitions = cleanByType(partition(labeled, "Sub Section"), ["Section"])

In [16]:
# Partition & Extract Clauses
# Groups are keyed on "Clause" entries; "Section" and "Sub Section" entries
# that fall inside a group are removed from it.

clauses = partitionPoints(labeled, "Clause")[1]
clause_partitions = cleanByType(
    partition(labeled, "Clause"),
    ["Section", "Sub Section"],
)

In [17]:
# Organize Partitions -- Define Functions

In [18]:
def countItems(data):
    """Return the length of every element of ``data``, in order.

    Args:
        data: Iterable of sized objects (anything ``len()`` accepts).
    """
    return [len(group) for group in data]

In [19]:
def place(count, items):
    """Deal ``items`` out into consecutive chunks of the requested sizes.

    Args:
        count: Iterable of chunk sizes.
        items: Sliceable sequence to consume from the front.

    Returns:
        One slice of ``items`` per entry in ``count``. When ``items`` runs
        out, the remaining chunks are shorter or empty; items left over after
        the last chunk are not returned.
    """
    placements = []
    remaining = items

    for amount in count:
        # Take the next `amount` entries and keep the rest for later chunks.
        chunk, remaining = remaining[:amount], remaining[amount:]
        placements.append(chunk)

    return placements

In [20]:
# Organize Partitions -- Build Up From Deepest Elements : Clauses

In [21]:
# Define Sub Sections Placements
# The number of entries left in each sub-section group, once "Sub Clause"
# entries are dropped, decides how many clause records that sub-section gets.

cleaned_sub_section_partitions = cleanByType(sub_section_partitions, ["Sub Clause"])

# Hand the clause records out to the sub-sections, in order, by those counts.
sub_section_placements = place(
    countItems(cleaned_sub_section_partitions),
    correlate(clause_partitions, clauses, "Clause"),
)

# Attach every batch of clause records to its sub-section entry.
sub_sections_placed = correlate(
    sub_section_placements,
    sub_sections,
    "Sub Section",
)

In [22]:
# Define Section Placements
# The number of entries left in each section group, once "Clause" and
# "Sub Clause" entries are dropped, decides how many sub-section records
# that section gets.

cleaned_section_partitions = cleanByType(section_partitions, ["Clause", "Sub Clause"])

# NOTE(review): sub_sections_placed already holds {"Sub Section": ..., "Items": ...}
# records, so this correlate call wraps each one a second time under the same
# key. Confirm the double nesting is intended before changing it, because it
# defines the shape of the exported JSON.
section_placements = place(
    countItems(cleaned_section_partitions),
    correlate(sub_sections_placed, sub_sections, "Sub Section"),
)

# Attach every batch of sub-section records to its section entry.
sections_placed = correlate(section_placements, sections, "Section")

In [23]:
# Persist the nested structure; toJSON writes it to Data/sections.json.
toJSON("sections", sections_placed)