In [3]:
import requests
import urllib
import time
import copy
import re
import json
from bs4 import BeautifulSoup

In [4]:
url = 'https://www.ontario.ca/laws/regulation/120332'
response = requests.get(url)
print(response)
soup = BeautifulSoup(response.text, "html.parser")

<Response [200]>


In [10]:
items = soup.find_all('p', class_ = ['section-e', 'subsection-e', 'clause-e', 'subclause-e'], limit = 30)

In [67]:
def toJSON(title, obj):
    with open(title + '.json', 'w', encoding='utf-8') as f:
        json.dump(obj, f, ensure_ascii=False, indent=4)
    return

In [6]:
def formatText(text): 
    
        text = str(text)
    
        cleaned = re.sub(r"<b>|</b>", "", text)
        cleaned = re.sub(r"<i>|</i>", "", cleaned)
        
        if (">" in cleaned):
              
            start = cleaned.index(">") + 1
            cleaned = cleaned[start:]
            
        if ("<" in cleaned):
        
            end = cleaned.index("<")
            cleaned = cleaned[:end]
        

        return cleaned

In [7]:
def label(list):
    
    labeled = []
    
    
    for item in list:
        
        if item.attrs['class'] == ['section-e']:
            
            cleaned = formatText(item)
            labeled.append({"type": "Section", "text": cleaned})
            
            
        elif item.attrs['class'] == ['subsection-e']:
            cleaned = formatText(item)
            labeled.append({"type": "Sub Section", "text": cleaned})
            
        elif item.attrs['class'] == ['clause-e']:
            cleaned = formatText(item)
            labeled.append({"type": "Clause", "text": cleaned})
            
        elif item.attrs['class'] == ['subclause-e']:
            cleaned = formatText(item)
            labeled.append({"type": "Sub Clause", "text": cleaned})
            
            
    return labeled

In [11]:
labeled = label(items)

In [13]:
def partitionPoints(list, type):
    
    points = []
    titles = []
    
    for item in list:
        
        if (item["type"] == type):
            points.append(list.index(item))
            titles.append(item)
        
    return [ points, titles ]

In [14]:
def partition(list, type):
    
    
    points = partitionPoints(list, type)[0]
    partitions = []
    
    for point in points:
        
        last_index = len(points) - 1
        current_index = points.index(point)
        
        if (last_index != current_index):
        
            start = point + 1
            end = points[current_index + 1]
            partition = list[start:end]
            
    
            
        else:
            
            start = point + 1
            partition = list[start:]
            
            
        partitions.append(partition)
    
    
    return partitions

In [57]:
def correlate(partitions, titles, partition_type):
    
    list = []

    
    length = len(partitions)
    
    for i in range(length):
              
            
        title = titles[i]
        partition = partitions[i]
    
        
        list.append({partition_type : title, "Items" : partition})
              
    return list

In [36]:
def filterByType(l, types):

    arr = list(filter(lambda i: i["type"] not in types, l))
    return arr

In [37]:
def cleanByType(l, t):
    
    clean = []
    
    for i in l:
        
        cleaned = filterByType(i, t)
        clean.append(cleaned)
        
    return clean

In [90]:
sub_sections = partitionPoints(labeled, "Sub Section")[1]
sub_section_partitions = partition(labeled, "Sub Section")
sub_section_partitions = cleanByType(sub_section_partitions, ["Section"])

In [88]:
sub_sections

[{'type': 'Sub Section',
  'text': '  (1) Division A contains compliance and application provisions and the objectives and functional statements\r\nof this Code.'},
 {'type': 'Sub Section',
  'text': '  (1) Division B contains the acceptable solutions\r\nof this Code.'},
 {'type': 'Sub Section',
  'text': '  (1) Division C contains the administrative provisions of this Code.'},
 {'type': 'Sub Section',
  'text': '  (1) If a provision of this Code contains a reference to another provision of this Code but no Division is specified, both provisions are in the same Division of this Code.'},
 {'type': 'Sub Section',
  'text': ' (1) Part 1 of Division B applies to all buildings.'},
 {'type': 'Sub Section',
  'text': ' (2) Subject to Article 1.1.2.6., Parts 7 and 12 of Division B apply to all buildings.'},
 {'type': 'Sub Section',
  'text': ' (1) Subject to Articles 1.1.2.6. and 1.3.1.2., Parts 3, 5 and 6 of Division B apply to all buildings,'},
 {'type': 'Sub Section',
  'text': '  (2) Subje

In [82]:
def countItems(data):
    
    count = []
    
    for array in data:
        
        count.append(len(array))
            
    return count

In [84]:
def filterItems(data, types):
    
    cleaned = []
    
    for array in data:
        
        clean = cleanByType(array, types)
        cleaned.append(clean)
        
    return cleaned    

In [94]:
k = cleanByType(sub_section_partitions, ["Sub Clause"])

In [95]:
k

[[],
 [],
 [],
 [],
 [],
 [],
 [{'type': 'Clause',
   'text': '  (a) used for major occupancies classified as,'},
  {'type': 'Clause', 'text': '  (b) exceeding 600 m'},
  {'type': 'Clause', 'text': '  (c) used for retirement homes.'}],
 [{'type': 'Clause', 'text': '  (a)  post-disaster buildings,'},
  {'type': 'Clause', 'text': '  (b)  buildings\r\ndescribed in Sentence (1),'},
  {'type': 'Clause',
   'text': '  (c) a retaining wall exceeding 1 000 mm in exposed height adjacent to,'}]]

In [96]:
countItems(k)

[0, 0, 0, 0, 0, 0, 3, 3]

In [103]:
def place(count, items):
    
    placements = []
    
    for amount in count:
                
            placement = items[0:amount]
            items = items[amount:]
            
            placements.append(placement)
            
            
    return placements        

In [118]:
x = place(countItems(k), correlate(clause_partitions, clauses, "Clause"))
correlate(x, sub_sections, "Sub Section")

[{'Sub Section': {'type': 'Sub Section',
   'text': '  (1) Division A contains compliance and application provisions and the objectives and functional statements\r\nof this Code.'},
  'Items': []},
 {'Sub Section': {'type': 'Sub Section',
   'text': '  (1) Division B contains the acceptable solutions\r\nof this Code.'},
  'Items': []},
 {'Sub Section': {'type': 'Sub Section',
   'text': '  (1) Division C contains the administrative provisions of this Code.'},
  'Items': []},
 {'Sub Section': {'type': 'Sub Section',
   'text': '  (1) If a provision of this Code contains a reference to another provision of this Code but no Division is specified, both provisions are in the same Division of this Code.'},
  'Items': []},
 {'Sub Section': {'type': 'Sub Section',
   'text': ' (1) Part 1 of Division B applies to all buildings.'},
  'Items': []},
 {'Sub Section': {'type': 'Sub Section',
   'text': ' (2) Subject to Article 1.1.2.6., Parts 7 and 12 of Division B apply to all buildings.'},
  'Item

In [104]:
place([0,1,2],["one", "two"])

[[], ['one'], ['two', 'three']]

In [91]:
sub_section_partitions

[[],
 [],
 [],
 [],
 [],
 [],
 [{'type': 'Clause',
   'text': '  (a) used for major occupancies classified as,'},
  {'type': 'Sub Clause', 'text': '  (i) Group A, assembly occupancies,'},
  {'type': 'Sub Clause',
   'text': '  (ii) Group B, care, care and treatment or detention occupancies, or'},
  {'type': 'Sub Clause',
   'text': '  (iii) Group F, Division 1, high hazard industrial occupancies,'},
  {'type': 'Clause', 'text': '  (b) exceeding 600 m'},
  {'type': 'Sub Clause', 'text': '  (i) Group C, residential occupancies,'},
  {'type': 'Sub Clause',
   'text': '  (ii) Group D, business and personal services occupancies,'},
  {'type': 'Sub Clause', 'text': '  (iii) Group E, mercantile occupancies,'},
  {'type': 'Sub Clause',
   'text': '  (iv) Group F, Divisions 2 and 3, medium hazard industrial occupancies and low hazard industrial occupancies, or'},
  {'type': 'Clause', 'text': '  (c) used for retirement homes.'}],
 [{'type': 'Clause', 'text': '  (a)  post-disaster buildings,'},
 

In [58]:
correlate(sub_section_partitions, sub_sections, "Sub Section")

[{'Sub Section': {'type': 'Sub Section',
   'text': '  (1) Division A contains compliance and application provisions and the objectives and functional statements\r\nof this Code.'},
  'Items': []},
 {'Sub Section': {'type': 'Sub Section',
   'text': '  (1) Division B contains the acceptable solutions\r\nof this Code.'},
  'Items': []},
 {'Sub Section': {'type': 'Sub Section',
   'text': '  (1) Division C contains the administrative provisions of this Code.'},
  'Items': []},
 {'Sub Section': {'type': 'Sub Section',
   'text': '  (1) If a provision of this Code contains a reference to another provision of this Code but no Division is specified, both provisions are in the same Division of this Code.'},
  'Items': []},
 {'Sub Section': {'type': 'Sub Section',
   'text': ' (1) Part 1 of Division B applies to all buildings.'},
  'Items': []},
 {'Sub Section': {'type': 'Sub Section',
   'text': ' (2) Subject to Article 1.1.2.6., Parts 7 and 12 of Division B apply to all buildings.'},
  'Item

In [61]:
clauses = partitionPoints(labeled, "Clause")[1]
clause_partitions = partition(labeled, "Clause")
clause_partitions = cleanByType(clause_partitions, ["Section", "Sub Section"])

In [42]:
clause_partitions

[[{'type': 'Sub Clause', 'text': '  (i) Group A, assembly occupancies,'},
  {'type': 'Sub Clause',
   'text': '  (ii) Group B, care, care and treatment or detention occupancies, or'},
  {'type': 'Sub Clause',
   'text': '  (iii) Group F, Division 1, high hazard industrial occupancies,'}],
 [{'type': 'Sub Clause', 'text': '  (i) Group C, residential occupancies,'},
  {'type': 'Sub Clause',
   'text': '  (ii) Group D, business and personal services occupancies,'},
  {'type': 'Sub Clause', 'text': '  (iii) Group E, mercantile occupancies,'},
  {'type': 'Sub Clause',
   'text': '  (iv) Group F, Divisions 2 and 3, medium hazard industrial occupancies and low hazard industrial occupancies, or'}],
 [],
 [],
 [],
 [{'type': 'Sub Clause', 'text': '  (i) public property,'},
  {'type': 'Sub Clause', 'text': '  (ii) access to a building, or'},
  {'type': 'Sub Clause',
   'text': '  (iii) private property to which the public is admitted,'}]]

In [62]:
clauses

[{'type': 'Clause', 'text': '  (a) used for major occupancies classified as,'},
 {'type': 'Clause', 'text': '  (b) exceeding 600 m'},
 {'type': 'Clause', 'text': '  (c) used for retirement homes.'},
 {'type': 'Clause', 'text': '  (a)  post-disaster buildings,'},
 {'type': 'Clause', 'text': '  (b)  buildings\r\ndescribed in Sentence (1),'},
 {'type': 'Clause',
  'text': '  (c) a retaining wall exceeding 1 000 mm in exposed height adjacent to,'}]

In [119]:
toJSON("clauses", correlate(clause_partitions, clauses, "Clause"))
toJSON("sub_sections", correlate(x, sub_sections, "Sub Section"))

In [73]:
correlate(clause_partitions, clauses, "Clause")

[{'Clause': {'type': 'Clause',
   'text': '  (a) used for major occupancies classified as,'},
  'Items': [{'type': 'Sub Clause',
    'text': '  (i) Group A, assembly occupancies,'},
   {'type': 'Sub Clause',
    'text': '  (ii) Group B, care, care and treatment or detention occupancies, or'},
   {'type': 'Sub Clause',
    'text': '  (iii) Group F, Division 1, high hazard industrial occupancies,'}]},
 {'Clause': {'type': 'Clause', 'text': '  (b) exceeding 600 m'},
  'Items': [{'type': 'Sub Clause',
    'text': '  (i) Group C, residential occupancies,'},
   {'type': 'Sub Clause',
    'text': '  (ii) Group D, business and personal services occupancies,'},
   {'type': 'Sub Clause', 'text': '  (iii) Group E, mercantile occupancies,'},
   {'type': 'Sub Clause',
    'text': '  (iv) Group F, Divisions 2 and 3, medium hazard industrial occupancies and low hazard industrial occupancies, or'}]},
 {'Clause': {'type': 'Clause', 'text': '  (c) used for retirement homes.'},
  'Items': []},
 {'Clause'

In [111]:
toJSON("sub_section_items", place(countItems(k), correlate(clause_partitions, clauses, "Clause")))