# NLP workflow (from Natural Language Processing Fundamentals)
* Data collection
* Data preprocessing
* Feature extraction
* Model development
* Model assessment
* Model deployment

## Data collection
Because CanLII blocks web scraping with captchas and because high-volume web scraping violates CanLII's ToS, this program will have to rely on manually downloaded HTML pages for now. The ToS suggest that individuals may be able to secure mass-downloading rights, so I will look into this as the program develops.

The HTML files listed are copies of all reported criminal (and some quasi-criminal) decisions on CanLII from 2023 as of 2023-01-31. I selected cases based solely on the style of cause, including all cases that followed the style *R v Defendant* or *Defendant v R*. As this is a NLP project, I only selected English decisions, thereby limiting the number of reported (quasi-)criminal cases from Quebec.

In [5]:
# British Columbia

bcca_list = ["./canlii_crim_corpus/html/2023/bc/ca/2023bcca2.html",
             "./canlii_crim_corpus/html/2023/bc/ca/2023bcca3.html",
             "./canlii_crim_corpus/html/2023/bc/ca/2023bcca4.html",
             "./canlii_crim_corpus/html/2023/bc/ca/2023bcca6.html",
             "./canlii_crim_corpus/html/2023/bc/ca/2023bcca8.html",
             "./canlii_crim_corpus/html/2023/bc/ca/2023bcca13.html",
             "./canlii_crim_corpus/html/2023/bc/ca/2023bcca16.html",
             "./canlii_crim_corpus/html/2023/bc/ca/2023bcca19.html",
             "./canlii_crim_corpus/html/2023/bc/ca/2023bcca29.html",
             "./canlii_crim_corpus/html/2023/bc/ca/2023bcca33.html",
             "./canlii_crim_corpus/html/2023/bc/ca/2023bcca37.html",
             "./canlii_crim_corpus/html/2023/bc/ca/2023bcca38.html",
             "./canlii_crim_corpus/html/2023/bc/ca/2023bcca50.html",]

bcsc_list = ["./canlii_crim_corpus/html/2023/bc/sc/2023bcsc50.html",
             "./canlii_crim_corpus/html/2023/bc/sc/2023bcsc72.html",
             "./canlii_crim_corpus/html/2023/bc/sc/2023bcsc85.html",
             "./canlii_crim_corpus/html/2023/bc/sc/2023bcsc92.html",
             "./canlii_crim_corpus/html/2023/bc/sc/2023bcsc96.html",
             "./canlii_crim_corpus/html/2023/bc/sc/2023bcsc106.html",
             "./canlii_crim_corpus/html/2023/bc/sc/2023bcsc134.html",
             "./canlii_crim_corpus/html/2023/bc/sc/2023bcsc141.html",]

bcpc_list = ["./canlii_crim_corpus/html/2023/bc/pc/2023bcpc3.html",
             "./canlii_crim_corpus/html/2023/bc/pc/2023bcpc4.html",
             "./canlii_crim_corpus/html/2023/bc/pc/2023bcpc5.html",
             "./canlii_crim_corpus/html/2023/bc/pc/2023bcpc6.html",
             "./canlii_crim_corpus/html/2023/bc/pc/2023bcpc7.html",
             "./canlii_crim_corpus/html/2023/bc/pc/2023bcpc11.html",
             "./canlii_crim_corpus/html/2023/bc/pc/2023bcpc12.html",
             "./canlii_crim_corpus/html/2023/bc/pc/2023bcpc13.html",]
       
    
# Alberta

abca_list = ["./canlii_crim_corpus/html/2023/ab/ca/2023abca2.html",
             "./canlii_crim_corpus/html/2023/ab/ca/2023abca3.html",
             "./canlii_crim_corpus/html/2023/ab/ca/2023abca5.html",
             "./canlii_crim_corpus/html/2023/ab/ca/2023abca7.html",
             "./canlii_crim_corpus/html/2023/ab/ca/2023abca10.html",
             "./canlii_crim_corpus/html/2023/ab/ca/2023abca11.html",
             "./canlii_crim_corpus/html/2023/ab/ca/2023abca18.html",
             "./canlii_crim_corpus/html/2023/ab/ca/2023abca20.html",
             "./canlii_crim_corpus/html/2023/ab/ca/2023abca23.html",
             "./canlii_crim_corpus/html/2023/ab/ca/2023abca29.html",]

abkb_list = ["./canlii_crim_corpus/html/2023/ab/kb/2023abkb45.html",
             "./canlii_crim_corpus/html/2023/ab/kb/2023abkb13.html",
             "./canlii_crim_corpus/html/2023/ab/kb/2023abkb26.html",
             "./canlii_crim_corpus/html/2023/ab/kb/2023abkb9.html",
             "./canlii_crim_corpus/html/2023/ab/kb/2023abkb39.html",]

abpc_list = ["./canlii_crim_corpus/html/2023/ab/pc/2023abpc17.html",
             "./canlii_crim_corpus/html/2023/ab/pc/2023abpc3.html",
             "./canlii_crim_corpus/html/2023/ab/pc/2023abpc9.html",
             "./canlii_crim_corpus/html/2023/ab/pc/2023abpc6.html",
             "./canlii_crim_corpus/html/2023/ab/pc/2023abpc1.html",
             "./canlii_crim_corpus/html/2023/ab/pc/2023abpc16.html",
             "./canlii_crim_corpus/html/2023/ab/pc/2023abpc8.html",
             "./canlii_crim_corpus/html/2023/ab/pc/2023abpc7.html",]


# Saskatchewan

skca_list = ["./canlii_crim_corpus/html/2023/sk/ca/2023skca1.html",
             "./canlii_crim_corpus/html/2023/sk/ca/2023skca2.html",
             "./canlii_crim_corpus/html/2023/sk/ca/2023skca12.html",
             "./canlii_crim_corpus/html/2023/sk/ca/2023skca7.html",
             "./canlii_crim_corpus/html/2023/sk/ca/2023skca6.html",]

skkb_list = ["./canlii_crim_corpus/html/2023/sk/kb/2023skkb1.html",
             "./canlii_crim_corpus/html/2023/sk/kb/2023skkb8.html",]

skpc_list = ["./canlii_crim_corpus/html/2023/sk/pc/2023skpc6.html",
             "./canlii_crim_corpus/html/2023/sk/pc/2023skpc9.html",
             "./canlii_crim_corpus/html/2023/sk/pc/2023skpc5.html",
             "./canlii_crim_corpus/html/2023/sk/pc/2023skpc14.html",
             "./canlii_crim_corpus/html/2023/sk/pc/2023skpc1.html",
             "./canlii_crim_corpus/html/2023/sk/pc/2023skpc3.html",
             "./canlii_crim_corpus/html/2023/sk/pc/2023skpc4.html",
             "./canlii_crim_corpus/html/2023/sk/pc/2023skpc12.html",
             "./canlii_crim_corpus/html/2023/sk/pc/2023skpc8.html",
             "./canlii_crim_corpus/html/2023/sk/pc/2023skpc7.html",]


# Manitoba

mbca_list = ["./canlii_crim_corpus/html/2023/mb/ca/2023mbca5.html",
             "./canlii_crim_corpus/html/2023/mb/ca/2023mbca2.html",
             "./canlii_crim_corpus/html/2023/mb/ca/2023mbca4.html",
             "./canlii_crim_corpus/html/2023/mb/ca/2023mbca6.html",
             "./canlii_crim_corpus/html/2023/mb/ca/2023mbca1.html",]

mbkb_list = ["./canlii_crim_corpus/html/2023/mb/kb/2023mbkb7.html",
             "./canlii_crim_corpus/html/2023/mb/kb/2023mbkb2.html",
             "./canlii_crim_corpus/html/2023/mb/kb/2023mbkb10.html",
             "./canlii_crim_corpus/html/2023/mb/kb/2023mbkb12.html",
             "./canlii_crim_corpus/html/2023/mb/kb/2023mbkb6.html",
             "./canlii_crim_corpus/html/2023/mb/kb/2023mbkb1.html",]

mbpc_list = []


# Ontario

onca_list = ["./canlii_crim_corpus/html/2023/on/ca/2023onca19.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca33.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca40.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca23.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca2.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca24.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca45.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca6.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca10.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca38.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca13.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca48.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca5.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca35.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca8.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca31.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca32.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca47.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca53.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca7.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca20.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca12.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca3.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca36.html",
             "./canlii_crim_corpus/html/2023/on/ca/2023onca4.html",]

onsc_list = ["./canlii_crim_corpus/html/2023/on/sc/2023onsc538.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc414.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc124.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc496.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc103.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc286.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc547.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc254.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc347.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc549.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc62.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc396.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc64.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc283.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc452.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc640.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc220.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc14.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc97.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc268.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc662.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc568.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc621.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc462.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc146.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc296.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc555.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc190.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc200.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc416.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc166.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc567.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc400.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc300.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc519.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc406.html",
             "./canlii_crim_corpus/html/2023/on/sc/2023onsc516.html",]

oncj_list = ["./canlii_crim_corpus/html/2023/on/cj/2023oncj18.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj24.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj10.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj9.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj20.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj25.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj16.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj12.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj45.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj43.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj17.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj28.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj40.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj6.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj14.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj36.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj15.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj11.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj29.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj5.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj31.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj27.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj21.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj4.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj41.html",
             "./canlii_crim_corpus/html/2023/on/cj/2023oncj22.html",]

# Quebec

qcca_list = ["./canlii_crim_corpus/html/2023/qc/ca/2023qcca34.html",
             "./canlii_crim_corpus/html/2023/qc/ca/2023qcca13.html",
             "./canlii_crim_corpus/html/2023/qc/ca/2023qcca89.html",
             "./canlii_crim_corpus/html/2023/qc/ca/2023qcca57.html",]

qccq_list = ["./canlii_crim_corpus/html/2023/qc/cq/2023qccq86.html",
             "./canlii_crim_corpus/html/2023/qc/cq/2023qccq15.html",]

qccs_list = []


# New Brunswick

nbca_list = ["./canlii_crim_corpus/html/2023/nb/ca/2023nbca6.html",]

nbkb_list = []


nbpc_list = ["./canlii_crim_corpus/html/2023/nb/pc/2023nbpc1.html",]

# Newfoundland & Labrador

nlsc_list = ["./canlii_crim_corpus/html/2023/nl/sc/2023nlsc6.html",]

nlpc_list = ["./canlii_crim_corpus/html/2023/nl/pc/2023canlii605.html", 
             "./canlii_crim_corpus/html/2023/nl/pc/2023canlii460.html",
             "./canlii_crim_corpus/html/2023/nl/pc/2023canlii466.html",
             "./canlii_crim_corpus/html/2023/nl/pc/2023canlii2060.html",
             "./canlii_crim_corpus/html/2023/nl/pc/2023canlii2521.html",
             "./canlii_crim_corpus/html/2023/nl/pc/2023canlii3051.html",]

nlca_list = []

# Prince Edward Island

peca_list = ["./canlii_crim_corpus/html/2023/pe/ca/2023peca1.html",
             "./canlii_crim_corpus/html/2023/pe/ca/2023peca2.html",]

pesc_list = ["./canlii_crim_corpus/html/2023/pe/sc/2023peca4.html",]

pepc_list = []


# Nova Scotia

nsca_list = ["./canlii_crim_corpus/html/2023/ns/ca/2023nsca3.html",
             "./canlii_crim_corpus/html/2023/ns/ca/2023nsca2.html",
             "./canlii_crim_corpus/html/2023/ns/ca/2023nsca1.html",]

nssc_list = ["./canlii_crim_corpus/html/2023/ns/sc/2023nssc25.html",
             "./canlii_crim_corpus/html/2023/ns/sc/2023nssc9.html",
             "./canlii_crim_corpus/html/2023/ns/sc/2023nssc3.html",
             "./canlii_crim_corpus/html/2023/ns/sc/2023nssc4.html",
             "./canlii_crim_corpus/html/2023/ns/sc/2023nssc2.html",
             "./canlii_crim_corpus/html/2023/ns/sc/2023nssc28.html",]

nspc_list = []


# Yukon

ykca_list = []

yksc_list = []

yktc_list = ["./canlii_crim_corpus/html/2023/yk/pc/2023yktc1.html",]


# Northwest Territories

nwca_list ["./canlii_crim_corpus/html/2023/nw/ca/2023nwca1.html",]

nwsc_list = []

nwtc_list = []


# Nunavut

nuca_list = []

nucj_list = []


NameError: name 'nwca_list' is not defined

The following code snippet shows the HTML files that will be used to build the first test mini-corpus in a tree format.

In [10]:
import os

def list_directory_tree(directory):
    print(directory)
    for path, dirs, files in os.walk(directory):
        level = path.replace(directory, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print('{}{}/'.format(indent, os.path.basename(path)))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))

list_directory_tree("./canlii_crim_corpus/html/2023/")

./canlii_crim_corpus/html/2023/
/
qc/
    ca/
        2023qcca34.html
        2023qcca13.html
        2023qcca89.html
        2023qcca57.html
    cq/
        2023qccq86.html
        2023qccq15.html
    cs/
yk/
    sc/
    ca/
    tc/
        2023yktc1.html
bc/
    pc/
        2023bcpc4.html
        2023bcpc13.html
        2023bcpc12.html
        2023bcpc11.html
        2023bcpc3.html
        2023bcpc7.html
        2023bcpc6.html
        2023bcpc5.html
    sc/
        2023bcsc134.html
        2023bcsc96.html
        2023bcsc85.html
        2023bcsc106.html
        2023bcsc92.html
        2023bcsc50.html
        2023bcsc141.html
        2023bcsc72.html
    ca/
        2023bcca8.html
        2023bcca16.html
        2023bcca29.html
        2023bcca6.html
        2023bcca19.html
        2023bcca2.html
        2023bcca33.html
        2023bcca38.html
        2023bcca13.html
        2023bcca4.html
        2023bcca50.html
        2023bcca37.html
        2023bcca3.html
ab/
    pc/
        2023ab

## Data preprocessing
These functions remove extraneous HTML and save the clean text to file. Where available, the preprocessing functions split the decision into the decision's numbered paragraphs. Where the decision doesn't come with pre-formatted paragraph numbers, the functions should infer them from the document's structure. For some older decisions, it may be possible to infer pagination, though this functionality may not be necessary or useful.

### HTML to TXT
The HTML to TXT functions take a raw HTML file and convert it into an NLTK Text object.The HTML to TXT functions take a raw HTML file and convert it into an NLTK Text object.

### read_html_file
Reads an HTML file and returns it as a BeautifulSoup object. Doing so makes the file much easier to work with in subsequent functions.

### create_title
Creates a title for each decision. Where available, the function uses the neutral citation as the title. Where the neutral citation isn't available, the function uses the CanLII citation as the title.

### decision_paragraphs
Runs through the BS4 object and extracts all of the paragraphs between (and inclusive of) "paragWrapper" divs.

### decision_footnotes
Runs through the BS4 object and extracts all of hte paragraphs containing the "MsoFootnoteReference"MsoFootnoteReference" span.

### clean_text
Tokenizes the text and removes paragraph numbers by default.

### compile_decision_text
Returns a clean and tokenized copy of the decision from the os path provided. The first list item identifies the decision by its neutral or CanLII citation.

In [119]:
import nltk
import re
from bs4 import BeautifulSoup
from nltk.text import Text


# Reads an HTML file and returns a BeautifulSoup object
def read_html_file(filename: str)->BeautifulSoup:
    '''
    Reads an HTML file and returns a BeautifulSoup object.
    '''
    with open(filename, 'r', encoding="utf-8") as file:
        soup: BeautifulSoup = BeautifulSoup(file, 'html.parser')
    return soup

def create_title(filepath: str)-> str:
    """Create a title for the text file from the html file name"""
    path_list = filepath.split("/")
    title_list = path_list[-1].split(".")
    title = title_list[0]
    
    # The first group of numbers is the year
    year = re.findall(r"\d+", title)[0]
    # The second group of numbers is the file number
    file_number = re.findall(r"\d+", title)[1]
    # The group of letters is the jurisdiction and court
    jurisdiction = re.findall(r"[a-z]+", title)[0]
    
    if jurisdiction == "canlii":
        jurisdiction = "CanLII"
        title = f"{year} {jurisdiction} {file_number}"
    else:
        title = f"{year} {jurisdiction.upper()} {file_number}"
 
    return title

# Extracts the decision text
def decision_paragraphs(filename: str)->tuple:
    '''
    Extracts the decision paragraphs. The decision text
    is contained in the <div class="paragWrapper"> tags. This function extracts
    the text from these tags and appends it to a list.
    '''
    
    decision = read_html_file(filename)
    
    # Find the first and last instances of the "paragWrapper" div
    first_div = decision.find("div", class_="paragWrapper")
    last_div = decision.find_all("div", class_="paragWrapper")[-1]

    paragraphs = []
    footnotes = []

    # Iterate over all siblings between the first and last instances of the "paragWrapper" div
    sibling = first_div
    paragraphs.append(first_div)
    while sibling != last_div:
        sibling = sibling.find_next_sibling()
        paragraphs.append(sibling)
        
    # Finds and appends footnotes where applicable
    if decision.find("SPAN", class_="MsoFootnoteReference"):
        decision_footnotes(decision)
        
    return paragraphs, footnotes


def decision_footnotes(decision: str)->list:
    '''
    Generates a list of footnotes in decisions containing them.
    '''
    footnote = decision.find("SPAN", class_="MsoFootnoteReference")
    footnotes.append(footnote)
    while footnote.find_next_sibling("SPAN", class_="MsoFootnoteReference"):
        footnote = footnote.find_next_sibling("SPAN", class_="MsoFootnoteReference")
        footnotes.append(footnote)
    
    return footnotes
    

def clean_text(paragraph: str, remove_para_nums: bool=True)->list:
    '''
    Returns tokenized text. The function can be set to include paragraph 
    numbers for instances where they may provide some semantic value, but 
    defaults to removing them as this generally isn't expected to be the case.
    '''
    words = nltk.word_tokenize(paragraph)
    if remove_para_nums:
        if paragraph[0] == "[" and paragraph[1].isdigit():
            words = words[2:]
            while words[0] != "]":
                words = words[1:]
            words = words[1:]
        return words
    else:
        return words
    
def save_to_file(clean_decision: list, filename: str):
    '''
    Saves a copy of the cleaned decision text to file.
    '''
    file_path_list = filename.split("/")
    del file_path_list[2]
    file_path_list.insert(2,"txt")
    save_path = "/".join(file_path_list)
    
    save_path_corrected = save_path.split(".")
    del save_path_corrected[-1]
    save_path_corrected.append("txt")
    save_path_corrected = ".".join(save_path_corrected)
    
    print(save_path_corrected)
    
    
    with open(save_path_corrected, "w") as f:
        f.write(clean_decision[0] + "\n")
        for paragraph in clean_decision[1]:
            f.write(paragraph + "\n")
            #for token in paragraph:
             #   f.write(token + "\n")

def compile_decision_text(filename)->list:
    decision = decision_paragraphs(filename)[0]
    footnotes = decision_paragraphs(filename)[1]
    clean_decision = []
    
    for paragraph in decision:
        clean_decision.append(clean_text(paragraph.text))

    for item in clean_decision:
        if len(item) == 0:
            clean_decision.remove(item)
        
    if footnotes:
        for footnote in footnotes:
            decision.append(clean_text(footnote, False))
    
    clean_decision.insert(0,create_title(file))
    
    return clean_decision

In [123]:
for file in file_list:
    clean_decision = compile_decision_text(file)
    save_to_file(clean_decision, file)

./canlii_crim_corpus/txt/2023/nl/pc/2023canlii605.txt
./canlii_crim_corpus/txt/2023/nl/pc/2023canlii460.txt
./canlii_crim_corpus/txt/2023/nl/pc/2023canlii466.txt
./canlii_crim_corpus/txt/2023/nl/pc/2023canlii2060.txt
./canlii_crim_corpus/txt/2023/nl/pc/2023canlii2521.txt
./canlii_crim_corpus/txt/2023/nl/pc/2023canlii3051.txt


## Corpus construction
Once the data is cleaned up, sorted out, and saved to file it is added to the corpus.

In [None]:
import nltk
from nltk.text import Text

tokenized_decision_text = []
tokenized_decision_text.append(clean_decision[0])

for paragraph in clean_decision[1:]:
    for word in paragraph:
        tokenized_decision_text.append(word)

## Feature extraction

### N-gram extraction

In [80]:
from nltk import ngrams
list(ngrams(clean_decision[1], 3))

[('Mr.', 'Dyer', 'is'),
 ('Dyer', 'is', 'charged'),
 ('is', 'charged', 'with'),
 ('charged', 'with', '11'),
 ('with', '11', 'offences'),
 ('11', 'offences', 'on'),
 ('offences', 'on', 'a'),
 ('on', 'a', 'single'),
 ('a', 'single', 'information'),
 ('single', 'information', '.'),
 ('information', '.', 'All'),
 ('.', 'All', 'of'),
 ('All', 'of', 'the'),
 ('of', 'the', 'offences'),
 ('the', 'offences', 'except'),
 ('offences', 'except', 'one'),
 ('except', 'one', 'relate'),
 ('one', 'relate', 'to'),
 ('relate', 'to', 'a'),
 ('to', 'a', 'single'),
 ('a', 'single', 'complainant'),
 ('single', 'complainant', 'NM'),
 ('complainant', 'NM', '.'),
 ('NM', '.', 'Mr.'),
 ('.', 'Mr.', 'Dyer'),
 ('Mr.', 'Dyer', 'has'),
 ('Dyer', 'has', 'asked'),
 ('has', 'asked', 'to'),
 ('asked', 'to', 'sever'),
 ('to', 'sever', 'count'),
 ('sever', 'count', '4'),
 ('count', '4', 'from'),
 ('4', 'from', 'the'),
 ('from', 'the', 'information'),
 ('the', 'information', '.'),
 ('information', '.', 'Count'),
 ('.', 'Co

## Model development

## Model assessment

## Model deployment