In [4]:
import requests
import re
import urllib.request
import time
from bs4 import BeautifulSoup

In [5]:
# Step 1 : Get Request to url of data source 

url = 'https://www.ontario.ca/laws/regulation/120332'
response = requests.get(url)

In [6]:
# Step 2 : If Response 200, then Get Request was succesful

response

<Response [200]>

In [7]:
# Step 3 : Parse the html

soup = BeautifulSoup(response.text,'html.parser')

In [8]:
# Step 4 : Create List of all `Rule` <p> tags

rules = soup.findAll("p", {"class":"ruleb-e"})

In [9]:
# Step 5 : Create List of all `Section` <p> tags

sections = soup.findAll("p", {"class":"section-e"})

In [10]:
# Step 6 : Create List of all `Sub-Section` <p> tags

sub_sections = soup.findAll("p", {"class":"subsection-e"})

In [11]:
# Step 7 : Create List of all `Clause` <p> tags

clauses = soup.findAll("p", {"class":"clause-e"})

In [12]:
# Step 9 : Create List of all `Sub-Clause` <p> tags

sub_clauses = soup.findAll("p", {"class":"subclause-e"})

In [13]:
def defineSampleSpace(soup, first_rule):
    
    # Note : Need to Accomodate end point
    
    # ---
    
    # Extract all <p> tags
    
    original_sample_space = soup.findAll("p")
    
    # Find index of first `Rule` - 1.1.1
    
    start = original_sample_space.index(first_rule) - 1
    
    # Define end as original end of list
    
    end = len(original_sample_space) - 1
    
    # Define new Sample Space
    
    new_sample_space = original_sample_space[start:end]
    
    # Return new Sample Space
    
    return new_sample_space
    

In [14]:
# Step 10 : Create Sample Space

sample_space = defineSampleSpace(soup, rules[1])

# Example of first 5 items in Sample Space

sample_space[0:5]

[<p class="ruleb-e">Section 1.1.<b> </b>Organization and Application</p>,
 <p class="ruleb-e">1.1.1.<b> </b>Organization of this Code</p>,
 <p class="section-e"><b>1.1.1.1. Scope of Division A</b></p>,
 <p class="subsection-e"><b>  (1) </b>Division A contains compliance and application provisions and the <i>objectives </i>and <i>functional statements</i>
 of this Code.</p>,
 <p class="section-e"><b>1.1.1.2. Scope of Division B</b></p>]

In [19]:
def formatText(text): 
    
        text = str(text)
    
        cleaned = re.sub(r"<b>|</b>", "", text)
              
        start = cleaned.index(">") + 1
        cleaned = cleaned[start:]
        
        end = cleaned.index("<")
        cleaned = cleaned[:end]
        
        
        return cleaned

In [20]:
def retrieveIndicies(sub_set, sample_space):
    indicies = []
    
    for item in sub_set:
        
        index = sample_space.index(item)
        text = formatText(item)
        indicies.append({"item": text, "index": index})
        
        
    return indicies

In [21]:
rules_indicies = retrieveIndicies(rules, sample_space)

In [18]:
rules_indicies[1]

{'item': '1.1.1.', 'index': 1}

In [18]:
def partitionSampleSpaceByIndicies(indicies, sample_space):
    
    partitionIndicies = []
    
    for item in indicies: 
        
        if indicies.index(item) == len(indicies) - 1:
            break
            
        else:
            
            # NOTE : OFF BY 1 ERROR ?
            
            start = item["index"]

            next_item = indicies[indicies.index(item) + 1]

            end = next_item["index"]

            partitionIndicies.append({"item": item["item"], "starting_index" : start, "ending_index": end})

        
    return partitionIndicies

In [19]:
rules_partition = partitionSampleSpaceByIndicies(rules_indicies, sample_space)

In [22]:
rules_partition[11]

{'item': '1.4.2. Symbols and Other Abbreviations',
 'starting_index': 716,
 'ending_index': 942}

In [25]:
rules_partition[1]

{'item': '1.1.1. Organization of this Code',
 'starting_index': 1,
 'ending_index': 10}

In [26]:
sample_space[2]

<p class="section-e"><b>1.1.1.1. Scope of Division A</b></p>

In [27]:
sample_space[3]

<p class="subsection-e"><b>  (1) </b>Division A contains compliance and application provisions and the <i>objectives </i>and <i>functional statements</i>
of this Code.</p>

In [29]:
def partitionRules(rules_partition, sample_space):
    
    partition = []
    
    for item in rules_partition:
        
        start = item["starting_index"]
        end = item["ending_index"]
        elements = sample_space[start:end]
        
        rule = { "rule" : item["item"], "elements": elements }
        
        partition.append(rule)
        
        
    return partition

In [31]:
partition_by_rules = partitionRules(rules_partition, sample_space)

In [44]:
partition_by_rules[1]

{'rule': '1.1.1. Organization of this Code',
 'elements': [<p class="ruleb-e">1.1.1.<b> </b>Organization of this Code</p>,
  <p class="section-e"><b>1.1.1.1. Scope of Division A</b></p>,
  <p class="subsection-e"><b>  (1) </b>Division A contains compliance and application provisions and the <i>objectives </i>and <i>functional statements</i>
  of this Code.</p>,
  <p class="section-e"><b>1.1.1.2. Scope of Division B</b></p>,
  <p class="subsection-e"><b>  (1) </b>Division B contains the <i>acceptable solutions</i>
  of this Code.</p>,
  <p class="section-e"><b>1.1.1.3. Scope of Division C</b></p>,
  <p class="subsection-e"><b>  (1) </b>Division C contains the administrative provisions of this Code.</p>,
  <p class="section-e"><b>1.1.1.4. Internal Cross-references</b></p>,
  <p class="subsection-e"><b>  (1) </b>If a provision of this Code contains a reference to another provision of this Code but no Division is specified, both provisions are in the same Division of this Code.</p>]}

In [93]:
def match(expression, list, type): 
    
    
        matches = []
    
        for item in list:
            
            text = str(item)
            match = re.search(expression, text)
            
            if match:
                
                index = list.index(item)
                cleaned = formatText(text)
                
                matches.append({"type": type, "text": cleaned, "index": index})
                
                
        return matches

In [96]:
partition_by_rules[2]

{'rule': '1.1.2. Application of Division B',
 'elements': [<p class="ruleb-e">1.1.2.<b> </b>Application of Division B</p>,
  <p class="section-e"><b>1.1.2.1. Application of Parts 1, 7 and 12</b></p>,
  <p class="subsection-e"> <b>(1)</b> Part 1 of Division B applies to all <i>buildings</i>.</p>,
  <p class="subsection-e"> <b>(2)</b> Subject to Article 1.1.2.6., Parts 7 and 12 of Division B apply to all buildings.</p>,
  <p class="section-e"><b>1.1.2.2. Application of Parts 3, 4, 5 and 6</b></p>,
  <p class="subsection-e"> <b>(1) </b>Subject to Articles 1.1.2.6. and 1.3.1.2., Parts 3, 5 and 6 of Division B apply to all <i>buildings</i>,</p>,
  <p class="clause-e">  (a)  used for <i>major occupancies</i> classified as,</p>,
  <p class="subclause-e">  (i)  Group A, <i>assembly occupancies</i>,</p>,
  <p class="subclause-e">  (ii)  Group B, <i>care</i>,<i> care and treatment</i> or<i> detention occupancies</i>, or</p>,
  <p class="subclause-e">  (iii)  Group F, Division 1, <i>high hazard i

In [99]:
match('"section-e"', partition_by_rules[2]["elements"], "Section")

[{'type': 'Section',
  'text': '1.1.2.1. Application of Parts 1, 7 and 12',
  'index': 1},
 {'type': 'Section',
  'text': '1.1.2.2. Application of Parts 3, 4, 5 and 6',
  'index': 4},
 {'type': 'Section', 'text': '1.1.2.3. Application of Part 8', 'index': 33},
 {'type': 'Section', 'text': '1.1.2.4. Application of Part 9', 'index': 35},
 {'type': 'Section', 'text': '1.1.2.5. Application of Part 10', 'index': 44},
 {'type': 'Section', 'text': '1.1.2.6. Application of Part 11', 'index': 46},
 {'type': 'Section', 'text': '1.1.2.7. Existing Buildings', 'index': 49}]

In [100]:
match('"subsection-e"', partition_by_rules[2]["elements"], "Sub Section")

[{'type': 'Sub Section',
  'text': ' (1) Part 1 of Division B applies to all ',
  'index': 2},
 {'type': 'Sub Section',
  'text': ' (2) Subject to Article 1.1.2.6., Parts 7 and 12 of Division B apply to all buildings.',
  'index': 3},
 {'type': 'Sub Section',
  'text': ' (1) Subject to Articles 1.1.2.6. and 1.3.1.2., Parts 3, 5 and 6 of Division B apply to all ',
  'index': 5},
 {'type': 'Sub Section',
  'text': '  (2) Subject to Articles 1.1.2.6. and 1.3.1.2., Part 4 of Division B applies to,',
  'index': 16},
 {'type': 'Sub Section',
  'text': '  (3) Section 3.11. of Division B applies to ',
  'index': 30},
 {'type': 'Sub Section',
  'text': '  (4) Section 3.12. of Division B applies to ',
  'index': 31},
 {'type': 'Sub Section',
  'text': '  (5) Section 3.15. of Division B applies to signs.',
  'index': 32},
 {'type': 'Sub Section',
  'text': ' (1) Subject to Article 1.1.2.6., Part 8 of Division B applies to the design, ',
  'index': 34},
 {'type': 'Sub Section',
  'text': ' (1) Sub