In [None]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from bs4.element import Comment
from progressbar import progressbar
import requests
from pickle import dump

# Ontario Court of Appeal Decisions

In [None]:
# Collect the per-year index pages linked from the Ontario Court of Appeal
# decisions landing page.
onca_main = "https://www.ontariocourts.ca/coa/decisions_main/"

# Fetch the raw HTML of the landing page. The timeout keeps the cell from
# hanging indefinitely on a stalled connection.
html_content = requests.get(onca_main, timeout=30).text
soup = BeautifulSoup(html_content, "html.parser")

# Keep every href containing "decisions-" followed by four digits. The pattern
# is a raw string so that "\d" reaches the regex engine instead of being
# parsed as an (invalid) string escape sequence.
onca_year_pages = []
for link in soup.find_all('a'):
    result = link.get('href')
    # link.get returns None for anchors without an href, so filter on str.
    if isinstance(result, str) and re.search(r"decisions-\d{4}", result):
        onca_year_pages.append(result)

onca_year_pages

['https://www.ontariocourts.ca/coa/decisions_main/decisions-2022/',
 'https://www.ontariocourts.ca/coa/decisions_main/decisions-2021/',
 'https://www.ontariocourts.ca/coa/decisions_main/decisions-2020/',
 'https://www.ontariocourts.ca/coa/decisions_main/decisions-2019/',
 'https://www.ontariocourts.ca/coa/decisions_main/decisions-2018/',
 'https://www.ontariocourts.ca/coa/decisions_main/decisions-2017/',
 'https://www.ontariocourts.ca/coa/decisions_main/decisions-2016/',
 'https://www.ontariocourts.ca/coa/decisions_main/decisions-2015/',
 'https://www.ontariocourts.ca/coa/decisions_main/decisions-2014/',
 'https://www.ontariocourts.ca/coa/decisions_main/decisions-2013/',
 'https://www.ontariocourts.ca/coa/decisions_main/decisions-2012/',
 'https://www.ontariocourts.ca/coa/decisions_main/decisions-2011/',
 'https://www.ontariocourts.ca/coa/decisions_main/decisions-2010/',
 'https://www.ontariocourts.ca/coa/decisions_main/decisions-2009/',
 'https://www.ontariocourts.ca/coa/decisions_mai

In [None]:
# Visit each yearly index page and collect the links to individual decisions.
case_pages = []
for year_page in onca_year_pages:
    # The timeout keeps one stalled request from blocking the whole crawl.
    par = requests.get(year_page, timeout=30).text
    soup_s = BeautifulSoup(par, "html.parser")
    for link in soup_s.find_all('a'):
        result = link.get('href')
        # Keep hrefs ending in "decisions/<4 digits>/<A-Z0-9 id>.htm". The raw
        # string stops "\d" and "\." from being read as string escapes.
        if isinstance(result, str) and re.search(r"decisions/\d{4}/[A-Z0-9]+\.htm$", result):
            case_pages.append(result)

In [None]:
def tag_visible(element):
    """Tell whether a parsed text node should count as visible page text.

    A node is rejected when its parent tag is one of the non-rendered
    containers listed below, or when the node itself is an HTML comment.
    Returns True for every other node.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent check runs first and short-circuits, so the comment test is
    # only evaluated for nodes that sit inside a rendered tag.
    return element.parent.name not in hidden_parents and not isinstance(element, Comment)


def text_from_html(body):
    """Return the visible text of an HTML document as a single string.

    Parameters
    ----------
    body
        HTML markup, handed directly to ``BeautifulSoup`` with the
        ``html.parser`` backend.

    Returns
    -------
    str
        Every text node accepted by ``tag_visible``, stripped of surrounding
        whitespace and joined with newline characters. Nodes that are empty
        after stripping still contribute a (blank) line.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); it yields every text node in the document.
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return "\n".join(t.strip() for t in visible_texts)


In [None]:
# Download every Ontario decision and store its visible text keyed by case id.
case_dict = {}
for c in progressbar(case_pages):
    # Year: the four digits right after "decisions/" in the URL.
    yr = re.findall(r"(?<=decisions/)\d{4}", c)[0]
    # Case id: everything between the "<4 digits>/" segment and the ".htm"
    # suffix. The dot is escaped so the lookahead matches a literal ".htm".
    caseid = re.findall(r"(?<=\d{4}/).*(?=\.htm)", c)[0]
    # The timeout keeps one stalled request from freezing a crawl of
    # thousands of pages.
    caset = requests.get(c, timeout=30).text
    casetext = text_from_html(caset)
    case_dict[caseid] = {'year': yr, 'text': casetext, 'jdx': 'on'}

# Spot-check one scraped decision (the tenth key in insertion order).
keys = list(case_dict)
case_dict[keys[9]]['text']

100% (10443 of 10443) |##################| Elapsed Time: 0:45:07 Time:  0:45:07


'\n\n\n\nCOURT OF APPEAL FOR ONTARIO\n\nCITATION: Capital Sewer Servicing Inc. v.\r\n    Crosslinx Transit Solutions Constructors, 2022 ONCA 10\n\nDATE: 20220112\n\nDOCKET: C69165\n\nDoherty, Benotto and Huscroft\r\n    JJ.A.\n\nBETWEEN\n\nCapital Sewer Servicing Inc.\n\nRespondent/Counter-applicant (Appellant in\r\n    Appeal)\n\nand\n\nCrosslinx Transit Solutions\r\n    Constructors\n\nApplicant (Respondent in Appeal)\n\nIan M. Mair and Ramon V. Andal, for the\r\n    appellant\n\nDaniel A. Schwartz, Adrian Visheau, and\r\n    Alexander Soutter, for the respondent\n\nHeard: November 30, 2021\n\nOn appeal from the order of Justice Markus\r\n    Koehnen of the Superior Court of Justice, dated February 1, 2021, with reasons\r\n    reported at 2021 ONSC 1091.\n\nDoherty J.A.:\n\nOVERVIEW\n\n[1]\n\nThe appellant (Capital) and the respondent\r\n    (Crosslinx) were involved in the construction of a light rail transit line in Toronto.\r\n    They were both sued by property owners who alleged

# British Columbia Court of Appeal (BCCA) Decisions

In [None]:
# Build one search-results URL per year from 2003 through 2021 (the upper
# bound of range is exclusive), then collect the decision links on each page.
bc_case_pages = []
bcca_pages = [
    "https://www.bccourts.ca/search_judgments.aspx?obd={}&court=1#SearchTitle".format(i)
    for i in range(2003, 2022)
]
for year_page in bcca_pages:
    # The timeout keeps one stalled request from blocking the whole crawl.
    par = requests.get(year_page, timeout=30).text
    soup_s = BeautifulSoup(par, "html.parser")
    for link in soup_s.find_all('a'):
        result = link.get('href')
        # Keep hrefs of the form "<4 digits>bcca....htm" in either all-lower or
        # all-upper case. The raw string stops "\d" and "\." from being read
        # as string escapes.
        if isinstance(result, str) and re.search(r"\d{4}bcca.+\.htm$|\d{4}BCCA.+\.htm$", result):
            # The matched hrefs are prefixed with the site origin to form the
            # URL that is fetched later.
            bc_case_pages.append("https://www.bccourts.ca" + result)

In [None]:
# Download every BC decision and add it to case_dict alongside the Ontario ones.
for c in progressbar(bc_case_pages):
    # Year: the four digits immediately before "bcca"/"BCCA" in the URL.
    yr = re.findall(r"\d{4}(?=bcca)|\d{4}(?=BCCA)", c)[0]
    # Case id: everything from the first "<2 digits>/" segment up to ".htm".
    # NOTE(review): because the lookbehind anchors on the first such segment,
    # the id keeps any later path segments (slashes included) -- confirm this
    # is the intended key format.
    caseid = re.findall(r"(?<=\d{2}/).*(?=\.htm)", c)[0]
    # The timeout keeps one stalled request from freezing the crawl.
    caset = requests.get(c, timeout=30).text
    casetext = text_from_html(caset)
    case_dict[caseid] = {'year': yr, 'text': casetext, 'jdx': 'bc'}

100% (927 of 927) |######################| Elapsed Time: 0:08:23 Time:  0:08:23


# Supreme Court of Canada (SCC) Decisions

In [None]:
# NOTE: this SCC scraping loop is disabled and kept for reference only. The
# Decisia-powered site answers bulk automated requests with a "Dear user,"
# captcha page, which the loop below detects and aborts on.
# Known issues to fix before re-enabling:
#   * the bare `raise` has no active exception, so it surfaces as a RuntimeError;
#   * `time.sleep(5)` needs `import time` -- only `sleep` is imported here.
#from time import sleep
#for c in progressbar(np.arange(2789,19299)):
#  curl = "https://decisions.scc-csc.ca/scc-csc/scc-csc/en/item/{}/index.do?iframe=true".format(c)
#  caset = requests.get(curl).text
#  casetext = text_from_html(caset)
#  if re.search("Dear user,",casetext):
#    raise
#  case_dict[c] = {'text':casetext,'jdx':'scc'}
#  if c % 20 == 0:
#    time.sleep(5)
  

                                                                               N/A% (0 of 16510) |                      | Elapsed Time: 0:00:00 ETA:  --:--:--

RuntimeError: ignored

In [None]:
# Persist the scraped cases as a pickle file (presumably on a mounted Google
# Drive, given the /content/drive path -- adjust when running elsewhere).
# The with-block closes the file on exit, so no explicit f.close() is needed.
with open("/content/drive/MyDrive/" + "canada_cases.pickle", "wb") as f:
    dump(case_dict, f)



In [None]:
# Display the text of the most recently fetched page; in the saved output this
# is the Decisia captcha notice rather than a decision.
casetext

"\n\n\n\n\nDear user,\n\nYou are accessing a web site that utilizes Lexum's Decisia technology. Occasionally, the Decisia software calls upon users accessing a large number of files to ensure that they are people and not programs acting on command.\n\nIf you are not a computer, please proceed with the captcha test below by typing the letters and numbers displayed in the captcha image into the text box provided. Note that reading the audible captcha, which allows you to hear the numbers to be typed into the text box, requires Firefox, Chrome or Microsoft Edge. You can then resume your work where you left off.\n\nTo report any issues, please\ncontact us\n.\n\nHappy searching!\n\n\n\n\n\n\n\n\n\n\nResponse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"