In [4]:
#XML parser for Canadian Record of Parliamentary Debates (Senate)
#version: M. Wong, 2018-01-18

######################################################################################################################
# OVERVIEW
######################################################################################################################
#Notes:     (1) Python 3.6 Anaconda
#           (2) Encoding UTF-8
#           (3) Runs from parent directory of data folder.
#               Run this file from the folder that contains the html data folder
#               downloaded from www.sencanada.ca [TO DO: Add and Link]

## Initialization

In [5]:
#Load Modules
'''import os
import xml.etree.ElementTree as ET
import re
import glob
import pandas as pd
import time
from spacy.lang.en import English
from datetime import datetime
import spacy'''

import os
import pandas as pd
import numpy as np
import math
import time
import spacy
from datetime import datetime
from bs4 import BeautifulSoup
from spacy.lang.en import English

#Set pandas to display full column widths and full column numbers
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

#Spacy Parser
nlp = spacy.load('en')

In [6]:
#Load Authority File as pandas dataframe
authorityFile_df = pd.read_json("authorityFile/authorityFile_senate.json",orient="columns")
#authorityFile_df

#Get new name column in firstnamelastname format for parsing
authorityFile_df = authorityFile_df.sort_index()
#authorityFile_df

In [7]:
#Open example final speech csv from House
House_final_csv = pd.read_csv("hansardExtractedSpeechesFull.csv", delimiter="\t", nrows=20, index_col=0)
#House_final_csv

# Final CSV Load

In [8]:
try:
    Senate_final_csv = pd.read_csv("hansardExtractedSpeechesFull_senate.csv")
    
except:
    #Copy format of House final csv for Senate csv
    Senate_final_csv = House_final_csv.head(0).copy()
    #display(Senate_final_csv)

    #columns_list = list(Senate_final_csv.columns.values)
    #print(columns_list)

## Function Declarations

In [19]:
## These functions specifies wanted tags within other tags (ex. <a> within <h1> or <b> within <p>). Not children
## Returns two lists, one of ids and one of lines, denoted by the inside tag that reside in the outside tag (given attributes)
def tag_specifier(soup, outside_tag, inside_tag, class_attr_outside, id_attr_outside, class_attr_inside, id_attr_inside):
    return [a[0].get("id") for a in (td.find_all(inside_tag, class_=class_attr_outside, id=id_attr_inside) for td in soup.find_all(outside_tag, class_=class_attr_outside, id=id_attr_outside)) if a], [a[0] for a in (td.find_all(inside_tag, class_=class_attr_outside, id=id_attr_inside) for td in soup.find_all(outside_tag, class_=class_attr_outside, id=id_attr_outside)) if a]

def tag_specifier_simple(soup, outside_tag, inside_tag):
    return [a[0].get("id") for a in (td.find_all(inside_tag) for td in soup.find_all(outside_tag)) if a], [a[0] for a in (td.find_all(inside_tag) for td in soup.find_all(outside_tag)) if a]

                

def authorityFile_get_ID(firstname, lastname, column):
    index = authorityFile_df.loc[(authorityFile_df["firstName"] == firstname) & (authorityFile_df["lastName"] == lastname)].index[0]
    value = authorityFile_df[column][index]
    if value == "" or value == None:
        return "NA"
    else:
        return value

def authorityFile_get(speakerName, name_id_dict):
    for lastname in authorityFile_df["lastName"]:
        #print(lastname)
        if lastname in speakerName:
            for firstname in authorityFile_df.loc[authorityFile_df["lastName"] == lastname]["firstName"]:
                #print(firstname)
                if firstname in speakerName:
                    #print(firstname, lastname)
                    ## Get relevant info from authorityFile entry given name
                    
                    parlInfoId = int(authorityFile_get_ID(firstname, lastname, "parlInfoId"))
                    fullName = authorityFile_df[authorityFile_df["parlInfoId"] == parlInfoId]["fullName"].values[0]
                    firstName = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["firstName"].values[0]
                    lastName = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["lastName"].values[0]
                    middleName = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["middleName"].values[0]
                    sex = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["sex"].values[0]
                    dateOfBirth = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["dateOfBirth"].values[0]    
                    isEstimateDOB = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["isEstimateDOB"].values[0]
                    visibleMinority = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["visibleMinority"].values[0]
                    indigenous = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["indigenous"].values[0]
                    birthProvince = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["birthProvince"].values[0]
                    birthCountry = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["birthCountry"].values[0]
                    firstDay = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["firstDay"].values[0]
                    provOfRiding = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["provOfRiding"].values[0]
                    parlInfoPage = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["parlInfoPage"].values[0]
                    daysInOffice = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["daysInOffice"].values[0]

                    name_id_dict[lastName] = parlInfoId
                    
                    return (parlInfoId, fullName, firstName, lastName, middleName, sex, dateOfBirth, isEstimateDOB, 
                            visibleMinority, indigenous, birthProvince, birthCountry, firstDay, provOfRiding, 
                            parlInfoPage, daysInOffice, name_id_dict)
    
    else:
        parlInfoId = None
        fullName = None
        firstName = None
        lastName = None
        middleName = None
        sex = None
        age = None
        dateOfBirth = None
        isEstimateDOB = None
        visibleMinority = None
        indigenous = None
        birthProvince = None
        birthCountry = None
        firstDay = None
        provOfRiding = None
        parlInfoPage = None
        daysInOffice = None
        
        return (parlInfoId, fullName, firstName, lastName, middleName, sex, dateOfBirth, isEstimateDOB, visibleMinority,
                indigenous, birthProvince, birthCountry, firstDay, provOfRiding, parlInfoPage, daysInOffice, 
                name_id_dict)
    

def speakerName_check(speakerName):
    speakerName = speakerName.replace(":","")

    #print(speakerName)
    #print(line)                            

    ####### Check last names of authorityFile and see if they match name, and then match first names if 
    ####### multiple last names are found

    ## Speaker (the Senate position) designation converted to current senate Speaker name
    if "Hon. the Speaker" in speakerName:
        speakerName = current_speaker[0].get_text()                        
        parlInfoId, fullName, firstName, lastName, middleName, sex, dateOfBirth, isEstimateDOB, visibleMinority, indigenous, birthProvince, birthCountry, firstDay, provOfRiding, parlInfoPage, daysInOffice, name_id_dict = authorityFile_get(speakerName,name_id_dict)

    ## Account for some hansard formatting rules
    ## When already introduced, subsequent references sometimes refer to "Senator lastname" 
    ## Therefore, must record introduced senators with dict of parlInfoId and last name
    elif "Senator " in speakerName:
        #print(name_id_dict)
        try:
            parlInfoId = int(name_id_dict[speakerName.split(" ")[1]])
            fullName = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["fullName"].values[0]
            firstName = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["firstName"].values[0]
            lastName = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["lastName"].values[0]
            middleName = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["middleName"].values[0]
            sex = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["sex"].values[0]
            dateOfBirth = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["dateOfBirth"].values[0]
            isEstimateDOB = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["isEstimateDOB"].values[0]
            visibleMinority = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["visibleMinority"].values[0]
            indigenous = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["indigenous"].values[0]
            birthProvince = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["birthProvince"].values[0]
            birthCountry = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["birthCountry"].values[0]
            firstDay = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["firstDay"].values[0]
            provOfRiding = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["provOfRiding"].values[0]
            parlInfoPage = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["parlInfoPage"].values[0]
            daysInOffice = authorityFile_df.loc[authorityFile_df["parlInfoId"] == parlInfoId]["daysInOffice"].values[0]
        except:
            pass

    else:
        parlInfoId, fullName, firstName, lastName, middleName, sex, dateOfBirth, isEstimateDOB, visibleMinority, indigenous, birthProvince, birthCountry, firstDay, provOfRiding, parlInfoPage, daysInOffice, name_id_dict = authorityFile_get(speakerName,name_id_dict)                                            

    return (parlInfoId, fullName, firstName, lastName, middleName, sex, dateOfBirth, isEstimateDOB, visibleMinority,
            indigenous, birthProvince, birthCountry, firstDay, provOfRiding, parlInfoPage, daysInOffice, 
            name_id_dict)

## BeautifulSoup Parser

In [10]:
#Set list of available parliaments and respective sessions
parliamentNumber_list = [42]#[42,41,40,39,38,37,36,35]
parliamentSession_list = [1]#[3,2,1]

In [20]:
#Main parser function definition
def senate_hansard_parser(parliamentNumber,parliamentSession,Senate_final_csv):
    print("Senate Hansard Parliament "+str(parliamentNumber)+" Session "+str(parliamentSession))
    
    #Use beautifulsoup to parse HTML
    
    try:
        directory = "Data/Hansard/forParser/Senate Hansard Parliament "+str(parliamentNumber)+" Session "+str(parliamentSession)
        
        ## List of lists, which will be appended to final csv
        ## Each sublist is a row in the final csv
        ## Each list encompasses all speeches in a single HTML file (aka in one sitting)
        all_rows = []
        
        for html_file in os.listdir(directory):
            filename = os.fsdecode(html_file)
            print("HTML: "+os.path.join(directory, filename))
            print("")
            
            #Get date from filename
            dateYMD = (filename.split("_")[2]).split(".")[0]
            #print(dateYMD)
            year, month, day = int(dateYMD.split("-")[0]), int(dateYMD.split("-")[1]), int(dateYMD.split("-")[2])
            #print(year,month,day)
            
            date = datetime(int(year), int(month), int(day))
            date = date.strftime("%B %d %Y")  
            #print(date)
            
            orderOfBusinessRubric_list = []
            subjectOfBusinessTitle_list = []
            with open(os.path.join(directory, filename)) as file:
                soup = BeautifulSoup(file,"html.parser")
                
                
                ## IMPORTANT: Because beautifulsoup does not have line number extraction and Senate hansard HTMLs
                ## do not have a nice format, must do everything manually
                
                ## Get entire HTML as list of lines for parsing
                #print(soup.prettify())
                all_lines = soup.find_all()
                #print(len(all_lines))
                #print(all_lines)
            
                #NOTE: orders of business and subject of business do not have class attributes and are denoted by ids in the <a> tag
                orderOfBusinessRubric_id_list, orderOfBusinessRubric_list = tag_specifier(soup,"h1","a",None,None,None,True)
                #print(orderOfBusinessRubric_id_list)
                #print(orderOfBusinessRubric_list)
                #print("")
                
                subjectOfBusinessTitle_id_list, subjectOfBusinessTitle_list = tag_specifier(soup,"h2","a",None,None,None,True) 
                #print(subjectOfBusinessTitle_id_list)
                #print(subjectOfBusinessTitle_list)
                #print("")
                
                ## Get name of current Senate speaker 
                current_speaker = tag_specifier_simple(soup,"h2","span")[1]
                #print(current_speaker)
                #print("")
                
                ## Get subject of business qualifiers
                subjectOfBusinessQualifier_list = tag_specifier_simple(soup,"h3","a")[1]
                #print(subjectOfBusinessQualifier_list)
                #print("")                
                
                ## Get all italicized lines
                italicized_list = soup.find_all("i", class_=None, align=None, text=True)
                #print(italicized_list)
                #print("")
                
                ## Get all speeches (denoted by <p> with no class)
                speech_list = soup.find_all("p", class_=None, align=None)
                #print(speech_list)
                #print("")
                
                ## Get all speaker names (denoted by <p> with <b> inner tag)
                #speakerName_list = tag_specifier(soup,"p","b",None,None,None,None)[1]
                #print(speakerName_list)
                #print("")
                
                #Set default values
                orderOfBusinessRubric = None
                subjectOfBusinessID = None
                subjectOfBusinessTitle = None
                subjectOfBusinessQualifier = None
                speechId = None
                interventionId = None
                weekday = None
                timeStamp = None
                speakerName = None
                #Party seems difficult to retrieve
                party = None
                speechFiltered = ""
                floorLanguage = "EN"
                
                parlInfoId = None
                fullName = None
                firstName = None
                lastName = None
                middleName = None
                sex = None
                age = None
                dateOfBirth = None
                isEstimateDOB = None
                visibleMinority = None
                indigenous = None
                birthProvince = None
                birthCountry = None
                firstDay = None
                provOfRiding = None
                parlInfoPage = None
                daysInOffice = None
                
                #Not sure how to get mentioned entities, so ignore again for now
                affiliationType = None
                affiliationDbId = None
                mentionedDocumentsTitle = None
                mentionedDocumentsId = None
                mentionedDocumentsType = None
                mentionedEntityName = None
                mentionedEntityId = None
                mentionedEntityType = None
                
                name_id_dict = dict()               
                
                for i, line in enumerate(all_lines):
                    #print(line)
                    ## Order of business, subject of business and speaker names are renewed only when a valid line is encountered
                    ## Otherwise, these values are kept at the previous values
                    
                    if line in orderOfBusinessRubric_list:
                        #print(orderOfBusinessRubric_id_list[orderOfBusinessRubric_list.index(line)])
                        
                        orderOfBusinessRubric = line.get_text()
                        #print(orderOfBusinessRubric) 
                        
                    elif line in subjectOfBusinessTitle_list:
                        subjectOfBusinessID = subjectOfBusinessTitle_id_list[subjectOfBusinessTitle_list.index(line)]
                        #print(subjectOfBusinessID)
                        
                        subjectOfBusinessTitle = line.get_text()
                        #print(subjectOfBusinessTitle)
                        
                        ## Reset qualifier and language to default
                        subjectOfBusinessQualifier = None
                        floorLanguage = "EN"
                        
                    elif line in subjectOfBusinessQualifier_list:
                        subjectOfBusinessQualifier = line.get_text()
                        #print(subjectOfBusinessQualifier)
                        
                    elif line in italicized_list:
                        italic = line.get_text()
                        #print(italic)
                        ## Attach response identification to qualifiers if applicable
                        if "Response to question raised" in italic:
                            try:
                                subjectOfBusinessQualifier = subjectOfBusinessQualifier + " " + italic
                                #print(subjectOfBusinessQualifier)
                                #print("")
                            except:
                                pass
                        elif "Translation" == italic:
                            floorLanguage = "FR"
                            #print(floorLanguage)
                            #print("")
                        elif "English" == italic:
                            floorLanguage = "EN"
                            #print(floorLanguage)
                            #print("")
    
                    elif line in speech_list:
                        #print(i, line)
                        speech = line.get_text()
                        #print(speakerName)
                    
                        if (speakerName == None) and not (speech.startswith("Hon. ")):
                            pass
                        
                        elif speech.startswith("Hon. Senators"):
                            #Do not add to dataframe if there are multiple speakers
                            pass
                        
                        elif "Response to question raised" in speech:
                            pass
                        
                        elif "On the Order:" in speech:
                            pass
                        
                        ## Collect all info as a series of lists to then append to the final dataframe

                        # ['parliamentNumber', 'parliamentSession', 'orderOfBusinessRubric', 'subjectOfBusinessTitle', 
                        # 'subjectOfBusinessID', 'subjectOfBusinessQualifier', 'speechId', 'interventionId', 'date', 
                        # 'dateYMD', 'year', 'month', 'day', 'weekday', 'timeStamp', 'speakerName', 'party', 
                        # 'parlInfoId', 'fullName', 'firstName', 'lastName', 'middleName', 'sex', 'age', 'daysInOffice',
                        # 'visibleMinority', 'indigenous', 'dateOfBirth', 'isEstimateDOB', 'birthProvince', 
                        # 'birthCountry', 'firstDay', 'provOfRiding', 'parlInfoPage', 'affiliationType', 
                        # 'affiliationDbId', 'floorLanguage', 'speech', 'speechFiltered', 'mentionedDocumentsTitle', 
                        # 'mentionedDocumentsId', 'mentionedDocumentsType', 'mentionedEntityName', 'mentionedEntityId',
                        # 'mentionedEntityType', 'filename']

                        ## Catch initial 
                        elif speech.startswith("Hon. "):
                            speakerName, speech = speech.split(": ",1)[0], speech.split(": ",1)[1]
                            #print(speakerName)
                            
                            print(speech)
                            speech_nlp = nlp(speech) # converting speech into Spacy doc

                            for token in speech_nlp:
                                pair = "_".join([token.text, token.tag_])
                                speechFiltered = " ".join([speechFiltered, pair])

                            row = [parliamentNumber, parliamentSession, orderOfBusinessRubric, subjectOfBusinessTitle,
                                      subjectOfBusinessID, subjectOfBusinessQualifier, speechId, interventionId, date, 
                                      dateYMD, year, month, day, weekday, timeStamp, speakerName, party, parlInfoId,
                                      fullName, firstName, lastName, middleName, sex, age, daysInOffice, visibleMinority,
                                      indigenous, dateOfBirth, isEstimateDOB, birthProvince, birthCountry, firstDay, provOfRiding,
                                      parlInfoPage, affiliationType, affiliationDbId, floorLanguage, speech,
                                      speechFiltered, mentionedDocumentsTitle, mentionedDocumentsId, mentionedDocumentsType, 
                                      mentionedEntityName, mentionedEntityId, mentionedEntityType, filename]

                            ## Reset filtered speech to default
                            speechFiltered = ""

                            #print(df_row)
                            #print(len(df_row))

                            all_rows.append(row)
                            del row
                        
                        else:
                            print(speech)
                            speech_nlp = nlp(speech) # converting speech into Spacy doc

                            for token in speech_nlp:
                                pair = "_".join([token.text, token.tag_])
                                speechFiltered = " ".join([speechFiltered, pair])

                            row = [parliamentNumber, parliamentSession, orderOfBusinessRubric, subjectOfBusinessTitle,
                                      subjectOfBusinessID, subjectOfBusinessQualifier, speechId, interventionId, date, 
                                      dateYMD, year, month, day, weekday, timeStamp, speakerName, party, parlInfoId,
                                      fullName, firstName, lastName, middleName, sex, age, daysInOffice, visibleMinority,
                                      indigenous, dateOfBirth, isEstimateDOB, birthProvince, birthCountry, firstDay, provOfRiding,
                                      parlInfoPage, affiliationType, affiliationDbId, floorLanguage, speech,
                                      speechFiltered, mentionedDocumentsTitle, mentionedDocumentsId, mentionedDocumentsType, 
                                      mentionedEntityName, mentionedEntityId, mentionedEntityType, filename]

                            ## Reset filtered speech to default
                            speechFiltered = ""

                            all_rows.append(row)
                            del row

                            return all_rows

            
            ## Once the HTML file is parsed, append to csv            
            ## Create new dataframe that aggregates consecutive speeches of the same speaker in the same language 
            ## and in the same subjectOfBusinessTitle/subjectOfBusinessQualifier
            refined_all_rows, row_previous = [], all_rows[0]
            for row in all_rows[1:]:
                
                ## fullName, floorLanguage, subjectOfBusinessTitle and subjectOfBusinessQualifier
                ## are index 18, 36, 3 and 5 respectively
                
                ## If row changes in those areas, append previous row to new frame
                if (row_previous[18] != row[18]) or (row_previous[36] != row[36]) or (row_previous[3] != row[3]) or (row_previous[5] != row[5]):
                    refined_all_rows.append(row_previous)
                    row_previous = row
                    
                ## Else, keep all column values the same as previous row but append speech and speechFiltered
                ## speech and speechFiltered are index 37 and 38 respectively
                else:
                    #print(row_previous)
                    row_previous[37] = row_previous[37] + " " + row[37]
                    row_previous[38] = row_previous[38] + " " + row[38] 
                    #print(row_previous)
            
            #print(refined_all_rows)
            
            #df_refined_all_rows = pd.DataFrame(refined_all_rows, columns=Senate_final_csv.columns)
            #del refined_all_rows
            
            #Senate_final_csv = Senate_final_csv.append(df_refined_all_rows)
            
            break
            
        #display(Senate_final_csv)
    
    #Display error if parliament-session combo does not exist
    except FileNotFoundError:
        print("Error: Parliament and session combination does not exist")
        print("")

## Function Call

In [21]:
for parliamentNumber in parliamentNumber_list:
    for parliamentSession in parliamentSession_list:
        senate_hansard_parser(parliamentNumber,parliamentSession,Senate_final_csv)

Senate Hansard Parliament 42 Session 1
HTML: Data/Hansard/forParser/Senate Hansard Parliament 42 Session 1/421_100_2017-03-01.html

Honourable 
senators, I have the distinct honour to rise today as Co-chair of the 
Canada-Bulgaria Inter-Parliamentary Friendship Group to mark the one hundred and 
thirty-ninth anniversary of Bulgaria's Liberation Day on Friday, March 3.


NameError: name 'speech' is not defined