In [1]:
#XML parser for Canadian Record of Parliamentary Debates (Senate)
#version: M. Wong, 2018-02-23

######################################################################################################################
# OVERVIEW
######################################################################################################################
#Notes:     (1) Python 3.6 Anaconda
#           (2) Encoding UTF-8
#           (3) Runs from parent directory of data folder.
#               Run this file from the folder that contains the html data folder
#               downloaded from www.sencanada.ca [TO DO: Add and Link]

## Initialization

In [2]:
#Load Modules
import os
import re
import fnmatch
import pandas as pd
import numpy as np
import math
import time
import spacy
from datetime import datetime
from bs4 import BeautifulSoup
from spacy.lang.en import English

# Configure pandas so wide frames and long speech text are displayed without truncation.
# Newer pandas releases spell "no column-width limit" as None and reject the legacy
# sentinel -1, while older releases only accept an integer. Try the modern spelling
# first and fall back to -1 so this cell runs on both.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

#Spacy Parser
# Loaded once at module level; `nlp` is later called on each speech text by
# senate_hansard_parser to produce the token/tag pairs stored in speechFiltered.
# NOTE(review): 'en' looks like a shortcut model name; presumably newer spaCy releases
# need a full package name such as 'en_core_web_sm' — confirm against the installed version.
nlp = spacy.load('en')

In [3]:
# Load the Senate authority file (one row per senator) as a pandas DataFrame.
# The name-matching helpers below read this module-level frame directly.
authorityFile_df = pd.read_json("authorityFile/authorityFile_senate.json",orient="columns")
#authorityFile_df

# Sort the rows by their index labels. Note that no combined first/last-name column
# is created here; name matching uses the existing firstName/lastName columns.
authorityFile_df = authorityFile_df.sort_index()
#authorityFile_df

In [4]:
# Read a 20-row sample of the tab-separated House speeches export, using its
# first column as the index. It serves as the column template for the Senate output.
House_final_csv = pd.read_csv(
    "hansardExtractedSpeechesFull.csv",
    index_col=0,
    nrows=20,
    delimiter="\t",
)

In [5]:
## Empty frame carrying the same columns as the House csv; Senate rows follow this layout
Senate_final_csv = House_final_csv.iloc[:0].copy()

## Function Declarations

In [6]:
## These functions select wanted tags nested within other tags (e.g. <a> within <h1>, or <b> within <p>), at any depth, not only direct children.
## Each returns two lists built from the first inside tag found in every matching outside tag (filtered by the given attributes): one of ids and one of lines.
def tag_specifier(soup, outside_tag, inside_tag, class_attr_outside, id_attr_outside, style_attr_outside, class_attr_inside, id_attr_inside):
    """Collect the first ``inside_tag`` found inside each matching ``outside_tag``.

    Every ``outside_tag`` element returned by ``soup.find_all`` (filtered by
    ``class_attr_outside``, ``id_attr_outside`` and ``style_attr_outside``) is
    searched for ``inside_tag`` elements filtered by ``class_attr_inside`` and
    ``id_attr_inside``. Outside elements without any inside match are skipped.

    Returns
    -------
    (ids, tags) : tuple of two parallel lists
        ``tags`` holds the first inside match of each outside element and
        ``ids`` holds ``tag.get("id")`` for each of those tags.
    """
    # Walk the document once and derive both lists from the same matches,
    # rather than repeating the whole nested search once per returned list.
    first_matches = []
    for outer in soup.find_all(outside_tag, class_=class_attr_outside, id=id_attr_outside, style=style_attr_outside):
        inner = outer.find_all(inside_tag, class_=class_attr_inside, id=id_attr_inside)
        if inner:
            first_matches.append(inner[0])
    return [tag.get("id") for tag in first_matches], first_matches

def tag_specifier_simple(soup, outside_tag, inside_tag):
    """Collect the first ``inside_tag`` found inside each ``outside_tag``, with no attribute filters.

    Outside elements that contain no ``inside_tag`` are skipped.

    Returns
    -------
    (ids, tags) : tuple of two parallel lists
        ``tags`` holds the first inside match of each outside element and
        ``ids`` holds ``tag.get("id")`` for each of those tags.
    """
    # Single pass over the document; both returned lists come from the same matches.
    first_matches = []
    for outer in soup.find_all(outside_tag):
        inner = outer.find_all(inside_tag)
        if inner:
            first_matches.append(inner[0])
    return [tag.get("id") for tag in first_matches], first_matches

def tag_specifier_textonly(soup, outside_tag, inside_tag, class_attr_outside, id_attr_outside, style_attr_outside, class_attr_inside, id_attr_inside):
    """Collect id and text of the first ``inside_tag`` found inside each matching ``outside_tag``.

    Every ``outside_tag`` element returned by ``soup.find_all`` (filtered by
    ``class_attr_outside``, ``id_attr_outside`` and ``style_attr_outside``) is
    searched for ``inside_tag`` elements filtered by ``class_attr_inside`` and
    ``id_attr_inside``. Outside elements without any inside match are skipped.

    Returns
    -------
    (ids, texts) : tuple of two parallel lists
        ``ids`` holds ``tag.get("id")`` and ``texts`` holds ``tag.get_text()``
        for the first inside match of each outside element.
    """
    # Walk the document once and derive both lists from the same matches,
    # rather than repeating the whole nested search once per returned list.
    first_matches = []
    for outer in soup.find_all(outside_tag, class_=class_attr_outside, id=id_attr_outside, style=style_attr_outside):
        inner = outer.find_all(inside_tag, class_=class_attr_inside, id=id_attr_inside)
        if inner:
            first_matches.append(inner[0])
    return [tag.get("id") for tag in first_matches], [tag.get_text() for tag in first_matches]



def authorityFile_get_ID(firstname, lastname, column):
    """Return one field of the first authority-file row matching a first and last name.

    Reads the module-level ``authorityFile_df``.

    Parameters
    ----------
    firstname, lastname
        Compared for exact equality against the ``firstName`` and ``lastName`` columns.
    column
        Label of the column to read from the matching row.

    Returns
    -------
    The cell value, or the string ``"NA"`` when the cell is ``""`` or ``None``.

    Raises
    ------
    IndexError
        If no row matches both names.
    """
    is_match = (authorityFile_df["firstName"] == firstname) & (authorityFile_df["lastName"] == lastname)
    # Index label of the first matching row; raises IndexError when nothing matches.
    first_label = authorityFile_df.loc[is_match].index[0]
    # One label-based lookup instead of chained df[column][label] indexing.
    value = authorityFile_df.loc[first_label, column]
    if value == "" or value is None:
        return "NA"
    return value

def authorityFile_get(speakerName, name_id_dict):
    """Look up a speaker in the authority file by name and return their record.

    The module-level ``authorityFile_df`` is scanned in row order. The first
    ``lastName`` that occurs as a substring of ``speakerName`` and that has a
    ``firstName`` also occurring in ``speakerName`` is taken as the match.
    Before matching, ``"Dan "`` in ``speakerName`` is expanded to ``"Daniel "``.

    Parameters
    ----------
    speakerName : str
        Speaker name to resolve.
    name_id_dict : dict
        Mapping of last name -> parlInfoId. Updated in place when a match is
        found, and returned as the last tuple element.

    Returns
    -------
    tuple
        18 items: ``(speakerName, parlInfoId, fullName, firstName, lastName,
        middleName, sex, dateOfBirth, isEstimateDOB, visibleMinority,
        indigenous, birthProvince, birthCountry, firstDay, provOfRiding,
        parlInfoPage, daysInOffice, name_id_dict)``. ``parlInfoId`` and all
        record fields are ``None`` when no senator matches.
    """
    # Record fields returned after speakerName and parlInfoId, in output order.
    record_columns = ("fullName", "firstName", "lastName", "middleName", "sex", "dateOfBirth",
                      "isEstimateDOB", "visibleMinority", "indigenous", "birthProvince",
                      "birthCountry", "firstDay", "provOfRiding", "parlInfoPage", "daysInOffice")

    if "Dan " in speakerName:
        speakerName = speakerName.replace("Dan ","Daniel ")

    for lastname in authorityFile_df["lastName"]:
        if lastname not in speakerName:
            continue
        for firstname in authorityFile_df.loc[authorityFile_df["lastName"] == lastname]["firstName"]:
            if firstname not in speakerName:
                continue

            parlInfoId = int(authorityFile_get_ID(firstname, lastname, "parlInfoId"))
            # Filter the frame once for this id and read every field from that
            # slice, rather than re-filtering the whole frame for each column.
            matched = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]
            record = {column: matched[column].values[0] for column in record_columns}

            # Remember the senator so later "Senator <lastName>" references resolve.
            name_id_dict[record["lastName"]] = parlInfoId

            return (speakerName, parlInfoId, *(record[column] for column in record_columns), name_id_dict)

    # No last/first name pair of the authority file occurs in speakerName.
    return (speakerName, None, *([None] * len(record_columns)), name_id_dict)
    

def speakerName_check(speakerName, name_id_dict, current_speaker):
    """Resolve a Hansard speaker label to an authority-file record.

    Three label forms are handled:

    * a label containing ``"Hon. the Speaker"`` or ``"Hon. the Acting Speaker"``
      is replaced by ``current_speaker`` and resolved with ``authorityFile_get``;
    * a label containing ``"Senator "`` is resolved through ``name_id_dict``,
      using everything after the first space as the last-name key;
    * any other label is resolved with ``authorityFile_get``.

    Parameters
    ----------
    speakerName : str
        Speaker label taken from the Hansard.
    name_id_dict : dict
        Mapping of last name -> parlInfoId for senators already introduced.
    current_speaker : str
        Name substituted when the label refers to the Speaker of the Senate.

    Returns
    -------
    tuple
        18 items: ``(speakerName, parlInfoId, fullName, firstName, lastName,
        middleName, sex, dateOfBirth, isEstimateDOB, visibleMinority,
        indigenous, birthProvince, birthCountry, firstDay, provOfRiding,
        parlInfoPage, daysInOffice, name_id_dict)``. In the ``"Senator "``
        branch, ``parlInfoId`` and all record fields are ``None`` when the
        lookup fails.
    """
    # Record fields returned after speakerName and parlInfoId, in output order.
    record_columns = ("fullName", "firstName", "lastName", "middleName", "sex", "dateOfBirth",
                      "isEstimateDOB", "visibleMinority", "indigenous", "birthProvince",
                      "birthCountry", "firstDay", "provOfRiding", "parlInfoPage", "daysInOffice")

    ## Speaker (the Senate position) designation converted to current senate Speaker name
    if ("Hon. the Speaker" in speakerName) or ("Hon. the Acting Speaker" in speakerName):
        return authorityFile_get(current_speaker, name_id_dict)

    ## Full names ("Hon. First Last") are matched against the authority file directly
    if "Senator " not in speakerName:
        return authorityFile_get(speakerName, name_id_dict)

    ## Once introduced, a senator is sometimes referred to as "Senator lastname";
    ## resolve those through the last name -> parlInfoId mapping collected so far
    try:
        parlInfoId = int(name_id_dict[speakerName.split(" ",1)[1]])
        # Filter the frame once and read every field from that slice.
        matched = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]
        record = [matched[column].values[0] for column in record_columns]
    except Exception:
        # Best effort: an unknown or unresolvable senator yields an empty record.
        # Unlike a bare except, this lets KeyboardInterrupt/SystemExit propagate.
        parlInfoId = None
        record = [None] * len(record_columns)

    return (speakerName, parlInfoId, *record, name_id_dict)

## BeautifulSoup Parser

In [7]:
# Parliaments and sessions available to the parser.
# Parliaments 35 and 36 are currently not included in the list.
parliamentNumber_list = list(range(37, 43))
parliamentSession_list = list(range(1, 4))

In [8]:
## Seed the on-disk intervention counter with 0; the parser reads this file
## back at the start of every sitting file it processes
previous_interventionId = 0
with open("Data/Hansard/forParser/previous_interventionId.txt", mode="w") as id_file:
    print(previous_interventionId, end="", file=id_file)

In [9]:
## Compiled regex matching any single HTML tag: "<", one or more non-">" characters, then ">".
## Passed to the parser, which uses re.sub(pattern, "", ...) to strip markup from a line's HTML.
pattern = re.compile("<[^>]+>")

In [10]:
#Main parser function definition
def senate_hansard_parser(parliamentNumber,parliamentSession,Senate_final_csv,pattern):
    print("Senate Hansard Parliament "+str(parliamentNumber)+" Session "+str(parliamentSession))
    
    #Use beautifulsoup to parse HTML
    
    try:
        directory = "Data/Hansard/forParser/Senate Hansard Parliament "+str(parliamentNumber)+" Session "+str(parliamentSession)
        
        ## List of lists, which will be appended to final csv
        ## Each sublist is a row in the final csv
        ## Each list encompasses all speeches in a single HTML file (aka in one sitting)
        
        for i,html_file in enumerate(os.listdir(directory)):
            
            ## Use this to skip to specific days for troubleshooting
            if i < 8:
                continue
            else:
                pass
            
            all_rows = []
            filename = os.fsdecode(html_file)
            print("HTML: "+os.path.join(directory, filename))
            print("")
            
            ## Get last known interventionId from file
            with open("Data/Hansard/forParser/previous_interventionId.txt", "r") as id_file:
                previous_interventionId = int(id_file.read())
                #print(previous_interventionId)
            
            #Get date from filename
            dateYMD = (filename.split("_")[2]).split(".")[0]
            #print(dateYMD)
            year, month, day = int(dateYMD.split("-")[0]), int(dateYMD.split("-")[1]), int(dateYMD.split("-")[2])
            #print(year,month,day)
            
            date = datetime(int(year), int(month), int(day))
            date = date.strftime("%B %d %Y")  
            #print(date)
            
            orderOfBusinessRubric_list = []
            subjectOfBusinessTitle_list = []
            with open(os.path.join(directory, filename)) as file:
                soup = BeautifulSoup(file,"html.parser")
                
                
                ## IMPORTANT: Because beautifulsoup does not have line number extraction and Senate hansard HTMLs
                ## do not have a nice format, must do everything manually
                
                ## Get entire HTML as list of lines for parsing
                #print(soup.prettify())
                all_lines = soup.find_all()
                #print(len(all_lines))
                #print(all_lines)
            
                ## This is for parliaments 35-36 as they utilize another different format
                if (parliamentNumber == 35) or (parliamentNumber == 36):
                    orderOfBusinessRubric_list = soup.find_all("h1", class_=None)
                    orderOfBusinessRubric_id_list = [None for a in orderOfBusinessRubric_list]
                    #print(orderOfBusinessRubric_id_list)
                    #print(orderOfBusinessRubric_list)
                    #print("")
                    
                    subjectOfBusinessTitle_list = soup.find_all("h2", class_=None)
                    subjectOfBusinessTitle_id_list = [None for a in subjectOfBusinessTitle_list]
                    #print(subjectOfBusinessTitle_id_list)
                    #print(subjectOfBusinessTitle_list)
                    #print("")
                    
                    subjectOfBusinessQualifier_list = soup.find_all("h3", class_=None)
                    #print(subjectOfBusinessQualifier_list)
                    #print("")

                else:
                    #NOTE: orders of business and subject of business do not have class attributes and are denoted by ids in the <a> tag
                    #(soup, outside_tag, inside_tag, class_attr_outside, id_attr_outside, style_attr_outside, class_attr_inside, id_attr_inside)
                    orderOfBusinessRubric_id_list, orderOfBusinessRubric_list = tag_specifier(soup,"h1","a",None,None,None,None,True)
                    #print(orderOfBusinessRubric_id_list)
                    #print(orderOfBusinessRubric_list)
                    #print("")

                    ## If format is different, this handles them
                    if (orderOfBusinessRubric_id_list == []) and (orderOfBusinessRubric_list == []):
                        orderOfBusinessRubric_id_list = [a.get("id") for a in soup.find_all("h1", id=True, style="text-transform:uppercase;", class_=None)]
                        orderOfBusinessRubric_list = soup.find_all("h1", id=True, style="text-transform:uppercase;", class_=None)
                    #print(orderOfBusinessRubric_id_list)
                    #print(orderOfBusinessRubric_list)
                    #print("")

                    
                    ## Get subject of business
                    subjectOfBusinessTitle_id_list, subjectOfBusinessTitle_list = tag_specifier(soup,"h2","a",None,None,None,None,True) 
                    #print(subjectOfBusinessTitle_id_list)
                    #print(subjectOfBusinessTitle_list)
                    #print("")

                    if (subjectOfBusinessTitle_id_list == []) and (subjectOfBusinessTitle_list == []):
                        subjectOfBusinessTitle_id_list = [a.get("id") for a in soup.find_all("h2", id=True, class_=None)]
                        subjectOfBusinessTitle_list = soup.find_all("h2", id=True, class_=None)
                    #print(subjectOfBusinessTitle_id_list)
                    #print(subjectOfBusinessTitle_list)
                    #print("")
                    
                    
                    ## Get subject of business qualifiers
                    subjectOfBusinessQualifier_list = tag_specifier_simple(soup,"h3","a")[1]
                    #print(subjectOfBusinessQualifier_list)
                    #print("")                

                    if subjectOfBusinessQualifier_list == []:
                        subjectOfBusinessQualifier_list = soup.find_all("h3", class_=None)
                    #print(subjectOfBusinessQualifier_list)
                    #print("")


                ## Get name of current Senate speaker using wildcard string matches
                current_speaker = [a.get_text() for a in soup.find_all("h2", id=None)]
                #print(current_speaker)
                try:
                    current_speaker = (fnmatch.filter(current_speaker,"*Honourable*Speaker*")[0].splitlines())[1]
                except IndexError:
                    current_speaker = (fnmatch.filter(current_speaker,"*Honourable*Speaker")[0].splitlines())[1]
                    
                #print(current_speaker)
                #print("")
                
                ## Get all italicized lines
                #italicized_list = soup.find_all("i", class_=None, align=None)
                #print(italicized_list)
                #print("")
                
                ## Get all speeches (denoted by <p> with no class)
                speech_list = soup.find_all("p", class_=None, align=None)
                #print(speech_list)
                #print("")
                
                ## Get all speaker names (denoted by <p> with <b> inner tag), avoid those within <u> tags
                ## NOTE: the format of the hansard changes to another standard after several sessions, 
                ##       must now account for that too
                
                #speakerName_unfiltered_list = tag_specifier_textonly(soup,"p","b",None,None,True,None,None)[1]
                #speakerName_u_list = tag_specifier_textonly(soup,"p","u",None,None,True,None,None)[1]
                #speakerName_list = [name for name in speakerName_unfiltered_list if name not in speakerName_u_list]
                
                speakerName_list = tag_specifier_textonly(soup,"p","b",None,None,True,None,None)[1]
                p_tag = "<p style=\"margin-top:3mm;margin-right:0mm;margin-bottom:0mm;margin-left:0in;text-indent:0in;\">" 

                #print(speakerName_list)
                #print("")
                
                ## if the speeches now have a style attribute attached to the <p> tag and thus the previous lines 
                ## will not return proper names
                if speakerName_list == []:
                    
                    #speakerName_unfiltered_list = tag_specifier_textonly(soup,"p","b",None,None,None,None,None)[1]
                    #speakerName_u_list = tag_specifier_textonly(soup,"p","u",None,None,None,None,None)[1]
                    #speakerName_list = [name for name in speakerName_unfiltered_list if name not in speakerName_u_list]
                    
                    speakerName_list = tag_specifier_textonly(soup,"p","b",None,None,None,None,None)[1]
                    p_tag = "<p>"
                    
                    #print(speakerName_list)
                    #print("") 
                
                #Set default values
                orderOfBusinessRubric = None
                subjectOfBusinessID = None
                subjectOfBusinessTitle = None
                subjectOfBusinessQualifier = None
                speechId = None
                interventionId = None
                weekday = None
                timeStamp = None
                speakerName = None
                #Party seems difficult to retrieve
                party = None
                speechFiltered = ""
                floorLanguage = "EN"
                
                parlInfoId = None
                fullName = None
                firstName = None
                lastName = None
                middleName = None
                sex = None
                age = None
                dateOfBirth = None
                isEstimateDOB = None
                visibleMinority = None
                indigenous = None
                birthProvince = None
                birthCountry = None
                firstDay = None
                provOfRiding = None
                parlInfoPage = None
                daysInOffice = None
                
                #Not sure how to get mentioned entities, so ignore again for now
                affiliationType = None
                affiliationDbId = None
                mentionedDocumentsTitle = None
                mentionedDocumentsId = None
                mentionedDocumentsType = None
                mentionedEntityName = None
                mentionedEntityId = None
                mentionedEntityType = None
                
                name_id_dict = dict()               
                
                for line in all_lines:
                    #print(line)
                    #print("")
                    ## Order of business, subject of business and speaker names are renewed only when a valid line is encountered
                    ## Otherwise, these values are kept at the previous values
                    
                    if line in orderOfBusinessRubric_list:
                        #print(orderOfBusinessRubric_id_list[orderOfBusinessRubric_list.index(line)])
                        
                        orderOfBusinessRubric = re.sub(pattern, "", str(line))
                        #print(orderOfBusinessRubric) 
                        
                    elif line in subjectOfBusinessTitle_list:
                        subjectOfBusinessID = subjectOfBusinessTitle_id_list[subjectOfBusinessTitle_list.index(line)]
                        #print(subjectOfBusinessID)
                        
                        subjectOfBusinessTitle = re.sub(pattern, "", str(line))
                        #print(subjectOfBusinessTitle)
                        
                        ## Reset qualifier to default
                        subjectOfBusinessQualifier = None
                        
                    elif line in subjectOfBusinessQualifier_list:
                        #subjectOfBusinessQualifier = line.get_text()
                        subjectOfBusinessQualifier = re.sub(pattern, "", str(line))
                        #print(subjectOfBusinessQualifier)
                    
                    #elif line in speakerName_filtered_list:
                    #    print(i, line)
                    #    speakerName = line.get_text()
                        #print(i,speakerName)
    
                    elif line in speech_list:
                        #print(line)
                        #print("")
                        speech = str(line)
                        #speech = line.get_text()
                        #print(speech)
                        
                        if speech == "":
                            pass
                    
                        elif (speakerName == None) and not (speech.startswith(p_tag+"<b>Hon. ")):
                            pass
                        
                        elif speech.startswith(p_tag+"<b>Hon. Senators") or speech.startswith(p_tag+"<b>Some Hon. Senators"):
                            #Do not add to dataframe if there are multiple speakers
                            pass
                        
                        elif "On the Order" in speech:
                            pass

                        ## Detect language changes and other signifiers
                        #elif any(speech in s for s in italicized_list):
                        #    print(speech)
                        
                        ## Attach response identification to qualifiers if applicable
                        elif "(<i>Response to question raised" in speech:
                            try:
                                subjectOfBusinessQualifier = subjectOfBusinessQualifier + " " + re.sub(pattern, "", speech)
                                #print(subjectOfBusinessQualifier)
                                #print("")
                            except Exception:
                                pass
                        elif "[<i>Translation</i>]" in speech:
                            floorLanguage = "FR"
                            #print(floorLanguage)
                            #print("")
                        elif "[<i>English</i>]" in speech:
                            floorLanguage = "EN"
                            #print(floorLanguage)
                            #print("")
                        
                        ## Get speakerNames
                        elif speech.startswith(p_tag+"<b>Hon. ") or speech.startswith(p_tag+"<b>The Hon. ") or speech.startswith(p_tag+"<b>Senator ") or speech.startswith("<b>The Hon. ") or speech.startswith("<b>Hon. ") or speech.startswith("<b>Senator "):
                            #print(speech)
                            ## Collect all info as a series of lists to then append to the final dataframe

                            try:
                                speakerName, speech = speech.split("</b>",1)[0], line.get_text().split(":",1)[1]
                            except IndexError:
                                speakerName = speech.split("</b>",1)[0]
                                speech = ""
                
                            speakerName = speakerName.replace(",","")
                            speakerName = speakerName.replace(":","")
                            #print(speakerName)
                            
                            """
                            try:
                                speech = speech.split(", moved:</p>",1)[1]
                            except Exception:
                                pass
                            
                            try:
                                speech = speech.split("pro tempore:</p>",1)[1]
                            except Exception:
                                pass
                            """
                            
                            ## Remove all strings within <> i.e. HTML tags
                            speakerName = re.sub(pattern, "", speakerName)
                            #speech = re.sub(pattern, "", speech)
                            #print(speech)
                            
                            #if any(speakerName in s for s in speakerName_list):
                                #print(speakerName)
                            speakerName, parlInfoId, fullName, firstName, lastName, middleName, sex, dateOfBirth, isEstimateDOB, visibleMinority, indigenous, birthProvince, birthCountry, firstDay, provOfRiding, parlInfoPage, daysInOffice, name_id_dict = speakerName_check(speakerName, name_id_dict, current_speaker)
                            #else:
                            #    pass
                            
                            # ['parliamentNumber', 'parliamentSession', 'orderOfBusinessRubric', 'subjectOfBusinessTitle', 
                            # 'subjectOfBusinessID', 'subjectOfBusinessQualifier', 'speechId', 'interventionId', 'date', 
                            # 'dateYMD', 'year', 'month', 'day', 'weekday', 'timeStamp', 'speakerName', 'party', 
                            # 'parlInfoId', 'fullName', 'firstName', 'lastName', 'middleName', 'sex', 'age', 'daysInOffice',
                            # 'visibleMinority', 'indigenous', 'dateOfBirth', 'isEstimateDOB', 'birthProvince', 
                            # 'birthCountry', 'firstDay', 'provOfRiding', 'parlInfoPage', 'affiliationType', 
                            # 'affiliationDbId', 'floorLanguage', 'speech', 'speechFiltered', 'mentionedDocumentsTitle', 
                            # 'mentionedDocumentsId', 'mentionedDocumentsType', 'mentionedEntityName', 'mentionedEntityId',
                            # 'mentionedEntityType', 'filename']
                            
                            #print(speech)
                            if speech == "":
                                pass
                            else:
                                speech_nlp = nlp(speech) # converting speech into Spacy doc

                                for token in speech_nlp:
                                    pair = "_".join([token.text, token.tag_])
                                    speechFiltered = " ".join([speechFiltered, pair])

                                row = [parliamentNumber, parliamentSession, orderOfBusinessRubric, subjectOfBusinessTitle,
                                          subjectOfBusinessID, subjectOfBusinessQualifier, speechId, interventionId, date, 
                                          dateYMD, year, month, day, weekday, timeStamp, speakerName, party, parlInfoId,
                                          fullName, firstName, lastName, middleName, sex, age, daysInOffice, visibleMinority,
                                          indigenous, dateOfBirth, isEstimateDOB, birthProvince, birthCountry, firstDay, provOfRiding,
                                          parlInfoPage, affiliationType, affiliationDbId, floorLanguage, speech,
                                          speechFiltered, mentionedDocumentsTitle, mentionedDocumentsId, mentionedDocumentsType, 
                                          mentionedEntityName, mentionedEntityId, mentionedEntityType, filename]

                                ## Reset filtered speech to default
                                speechFiltered = ""

                                #print(df_row)
                                #print(len(df_row))

                                all_rows.append(row)
                                del row

                        else:
                            speech = line.get_text()
                            
                            ## Remove all strings within <> i.e. HTML tags
                            #speech = re.sub(pattern, "", speech)
                            #print(speech)
                            
                            if speech == "":
                                pass
                            else:
                                speech_nlp = nlp(speech) # converting speech into Spacy doc

                                for token in speech_nlp:
                                    pair = "_".join([token.text, token.tag_])
                                    speechFiltered = " ".join([speechFiltered, pair])

                                row = [parliamentNumber, parliamentSession, orderOfBusinessRubric, subjectOfBusinessTitle,
                                          subjectOfBusinessID, subjectOfBusinessQualifier, speechId, interventionId, date, 
                                          dateYMD, year, month, day, weekday, timeStamp, speakerName, party, parlInfoId,
                                          fullName, firstName, lastName, middleName, sex, age, daysInOffice, visibleMinority,
                                          indigenous, dateOfBirth, isEstimateDOB, birthProvince, birthCountry, firstDay, provOfRiding,
                                          parlInfoPage, affiliationType, affiliationDbId, floorLanguage, speech,
                                          speechFiltered, mentionedDocumentsTitle, mentionedDocumentsId, mentionedDocumentsType, 
                                          mentionedEntityName, mentionedEntityId, mentionedEntityType, filename]

                                ## Reset filtered speech to default
                                speechFiltered = ""

                                all_rows.append(row)
                                del row

            
            ## Once the HTML file is parsed, append to csv            
            ## Create new dataframe that aggregates consecutive speeches of the same speaker in the same language 
            ## and in the same subjectOfBusinessTitle/subjectOfBusinessQualifier
            refined_all_rows, row_previous = [], all_rows[0]
            for row in all_rows[1:]:
                
                ## fullName, floorLanguage, subjectOfBusinessTitle and subjectOfBusinessQualifier
                ## are index 18, 36, 3 and 5 respectively
                
                ## If row changes in those areas, append previous row to new frame
                if (row_previous[18] != row[18]) or (row_previous[36] != row[36]) or (row_previous[3] != row[3]) or (row_previous[5] != row[5]):
                    refined_all_rows.append(row_previous)
                    row_previous = row
                    
                ## Else, keep all column values the same as previous row but append speech and speechFiltered
                ## speech and speechFiltered are index 37 and 38 respectively
                else:
                    #print(row_previous)
                    row_previous[37] = row_previous[37] + " " + row[37]
                    row_previous[38] = row_previous[38] + " " + row[38] 
                    #print(row_previous)
            
            ## Add in interventionId and speechId
            ## interventionId is formatted as a 7-digit number
            ## speechId is interventionId appended to date of speech
            ## speechId is index 6, interventionId is index 7, dateYMD is index 9
            
            for row in refined_all_rows:
                previous_interventionId += 1
                #interventionId
                row[7] = str(str(previous_interventionId).zfill(7))
                #speechId
                row[6] = str(row[9])+"-"+row[7]
            
            #print(refined_all_rows)
            
            df_refined_all_rows = pd.DataFrame(refined_all_rows, columns=Senate_final_csv.columns)
            #display(df_refined_all_rows)
            del refined_all_rows
            
            ## Record dataframe to csv
            if os.path.isfile("hansardExtractedSpeechesFull_senate.csv") == True:
                with open("hansardExtractedSpeechesFull_senate.csv","a") as file:
                    df_refined_all_rows.to_csv(file,header=False,index=False)
            else:
                df_refined_all_rows.to_csv("hansardExtractedSpeechesFull_senate.csv",index=False,header=True)
                
            del df_refined_all_rows
            
            ## Record final interventionId to file
            with open("Data/Hansard/forParser/previous_interventionId.txt", "w") as id_file:
                id_file.write(str(previous_interventionId))
            
            break
            
        #display(Senate_final_csv.head(n=20))
    
    #Display error if parliament-session combo does not exist
    except FileNotFoundError:
        print("Error: Parliament and session combination does not exist")
        print("")

## Function Call

In [11]:
# Drive the parser over every parliament/session combination.
# Pairs are enumerated up front in parliament-major order (all sessions of
# the first parliament, then all sessions of the next, and so on).
parliament_session_pairs = [
    (number, session)
    for number in parliamentNumber_list
    for session in parliamentSession_list
]

for parliamentNumber, parliamentSession in parliament_session_pairs:
    senate_hansard_parser(
        parliamentNumber,
        parliamentSession,
        Senate_final_csv,
        pattern,
    )

Senate Hansard Parliament 37 Session 1
HTML: Data/Hansard/forParser/Senate Hansard Parliament 37 Session 1/371_0009_2001-02-22.html

Hon. Fernand Robichaud (Deputy Leader of the Government)








The Honourable Daniel Hays, Speaker

Hays
Daniel Hays
Hon. Gérald-A. Beaudoin
Beaudoin
Gérald Beaudoin
Hon. J. Michael Forrestall
Forrestall
J. Forrestall
Hon. Leonard J. Gustafson
Gustafson
Leonard Gustafson
Hon. Lise Bacon
Bacon
Lise Bacon
Hon. Lowell Murray
Murray
Lowell Murray
Hon. Lorna Milne
Milne
Lorna Milne
Hon. Marjory LeBreton
LeBreton
Marjory LeBreton
Hon. Richard H. Kroft
Kroft
Richard Kroft
Hon. Léonce Mercier
Mercier
Léonce Mercier
The Honourable Daniel Hays, Speaker

Hays
Daniel Hays
The Honourable Daniel Hays, Speaker

Hays
Daniel Hays
Hon. Peter A. Stollery




Hon. Nicholas W. Taylor




Hon. Peter A. Stollery




Hon. Peter A. Stollery




Hon. Peter A. Stollery




Hon. Peter A. Stollery




Hon. Peter A. Stollery




Hon. Lise Bacon
Bacon
Lise Bacon
Hon. Lise Bacon
Bacon

Hon. David Tkachuk

David


Hon. Jack Austin (Leader of the Government)




Austin
Jack Austin
Hon. Laurier L. LaPierre
LaPierre
Laurier LaPierre
The Honourable Daniel Hays, Speaker

Hays
Daniel Hays
Hon. Jack Austin (Leader of the Government)




Austin
Jack Austin
Hon. Lowell Murray
Murray
Lowell Murray
Hon. Jack Austin (Leader of the Government)




Austin
Jack Austin
Hon. John Lynch-Staunton (Leader of the Opposition)




Lynch-Staunton
John Lynch-Staunton
Hon. Jack Austin (Leader of the Government)




Austin
Jack Austin
Hon. Gerald J. Comeau
Comeau
Gerald Comeau
Hon. Jack Austin (Leader of the Government)




Austin
Jack Austin
Hon. Gerald J. Comeau
Comeau
Gerald Comeau
Hon. Jack Austin (Leader of the Government)




Austin
Jack Austin
Hon. Herbert O. Sparrow




Hon. Pat Carney
Carney
Pat Carney
Hon. Jean-Robert Gauthier
Gauthier
Jean-Robert Gauthier
Hon. Jack Austin (Leader of the Government)




Austin
Jack Austin
Hon. Brenda M. Robertson




Hon. Jack Austin (Leader of the Go

KeyboardInterrupt: 