# Text Preparation

## Overview

This code collects and prepares textual data for
analysis. As the data source we will debates in the Dáil Éireann (Irish
Parliament) for the first 2 months of 2025. 

## Part 1: Data Acquisition

In this part a scraper is used that collects the data using the Oireachtas API.
The data is collected for the first 2 months of 2025 (January and
February, but the bulk of the debates are in February).

## Part 2: Text Preprocessing

In this part the code cleans up the collected data. 

The ultimate goal is to have a dataset of the following form:

| dail | vol | no  | date | speaker | text | ntokens | ntypes |
|------|-----|-----|------|---------|------|---------|--------|

where:

`dail` - is the number of the Dáil (e.g. 34th Dáil)

`vol` - is the volume number of the debates (e.g. 1000)

`no` - is the number of the debate in the volume (e.g. 1)

`date` - is the date of the debate (in YYYY-MM-DD form, e.g. 2025-01-01)

`speaker` - is the name of the speaker

`text` - is the text of the speech

`ntokens` - is the number of tokens in the speech

`ntypes` - is the number of types in the speech

In [116]:
# Installing necessary packages 
#install.packages("httr")
#install.packages("httr2")
#install.packages("jsonlite")
#install.packages("stopwords")
library(xml2) 
library(httr)  
library(httr2)
library(jsonlite)
library(tidyverse)
library(dplyr)
library(tidytext)
library(stringr)
library(stopwords) 

In [117]:
## Part 1: Data Acquisition

# Defining the URL for the API
base_url <- "https://api.oireachtas.ie/v1/debates"

# Defining the query parameters
date_start <- "2025-01-01"  
date_end <- "2025-02-28"    

# API request with query parameters
response <- request(base_url) %>%
  req_url_query(date_start = date_start, date_end = date_end) %>%
  req_perform()

# Checking the response status and print error
if (resp_status(response) == 200) {
  parsed_json_data <- response %>% resp_body_json() 
  parsed_json_data%>% glimpse()
} else {
  content <- response %>% resp_body_string()
  cat(content)
}

# printing the result 
parsed_json_data

List of 2
 $ head   :List of 3
  ..$ counts   :List of 2
  .. ..$ debateCount: int 20
  .. ..$ resultCount: int 20
  ..$ dateRange:List of 2
  .. ..$ start: chr "2025-01-01T00:00:00.000Z"
  .. ..$ end  : chr "2025-02-28T00:00:00.000Z"
  ..$ lang     : chr "mul"
 $ results:List of 20
  ..$ :List of 2
  .. ..$ debateRecord:List of 9
  .. ..$ contextDate : chr "2025-02-27"
  ..$ :List of 2
  .. ..$ debateRecord:List of 9
  .. ..$ contextDate : chr "2025-02-27"
  ..$ :List of 2
  .. ..$ debateRecord:List of 9
  .. ..$ contextDate : chr "2025-02-26"
  ..$ :List of 2
  .. ..$ debateRecord:List of 9
  .. ..$ contextDate : chr "2025-02-26"
  ..$ :List of 2
  .. ..$ debateRecord:List of 9
  .. ..$ contextDate : chr "2025-02-25"
  ..$ :List of 2
  .. ..$ debateRecord:List of 9
  .. ..$ contextDate : chr "2025-02-25"
  ..$ :List of 2
  .. ..$ debateRecord:List of 9
  .. ..$ contextDate : chr "2025-02-20"
  ..$ :List of 2
  .. ..$ debateRecord:List of 9
  .. ..$ contextDate : chr "2025-02-19"
  ..

In [118]:
# Extracting debate information from a single debate record
extract_debate_info <- function(debateRecord) {    
  # Extracting metadata from the debate record
  date <- debateRecord$debateRecord$date
  house_code <- debateRecord$debateRecord$house$houseCode
  chamber_showAs <- debateRecord$debateRecord$chamber$showAs
  dail <- debateRecord$debateRecord$house$showAs
  xml_url <- debateRecord$debateRecord$formats$xml$uri
  pdf_url <- debateRecord$debateRecord$formats$pdf$uri
  
  # Processing nested debate sections, if present
  debate_sections <- debateRecord$debateRecord$debateSections
  if (length(debate_sections) > 0) {
    sections_info <- map_df(debate_sections, function(section) {
      debate_type <- section$debateSection$debateType
      debate_text <- section$debateSection$text[[1]]$text
      speech_count <- section$debateSection$counts$speechCount
      speaker_count <- section$debateSection$counts$speakerCount
      debate_section_title <- section$debateSection$showAs
      
      # Creating a tibble with extracted debate section data
      tibble(
        date = date,
        house_code = house_code,
        chamber_showAs = chamber_showAs,
        dail = dail,
        xml_url = xml_url,
        pdf_url = pdf_url,
        debate_type = debate_type,
        debate_text = debate_text,
        speech_count = speech_count,
        speaker_count = speaker_count,
        debate_section_title = debate_section_title
      )
    })
    return(sections_info)
  } else {
    return(tibble(
      date = date,
      house_code = house_code,
      chamber_showAs = chamber_showAs,
      dail = dail,
      xml_url = xml_url,
      pdf_url = pdf_url,
      debate_type = NA,
      debate_text = NA,
      speech_count = NA,
      speaker_count = NA,
      debate_section_title = NA
    ))
  }
}

# Applying the extraction function to each debateRecord in the 'results'
debates_df <- map_df(parsed_json_data$results, extract_debate_info)

# Viewing and exploring the extracted data
print(debates_df)
#table(debates_df$date)
#table(debates_df$house_code)
#table(debates_df$chamber_showAs)
#table(debates_df$dail)
#table(debates_df$debate_type)

[90m# A tibble: 344 × 10[39m
   date       house_code chamber_showAs dail        xml_url  pdf_url debate_type
   [3m[90m<chr>[39m[23m      [3m[90m<chr>[39m[23m      [3m[90m<chr>[39m[23m          [3m[90m<chr>[39m[23m       [3m[90m<chr>[39m[23m    [3m[90m<chr>[39m[23m   [3m[90m<chr>[39m[23m      
[90m 1[39m 2025-02-27 seanad     Seanad Éireann 27th Seanad https:/… https:… debate     
[90m 2[39m 2025-02-27 seanad     Seanad Éireann 27th Seanad https:/… https:… debate     
[90m 3[39m 2025-02-27 seanad     Seanad Éireann 27th Seanad https:/… https:… debate     
[90m 4[39m 2025-02-27 seanad     Seanad Éireann 27th Seanad https:/… https:… debate     
[90m 5[39m 2025-02-27 seanad     Seanad Éireann 27th Seanad https:/… https:… debate     
[90m 6[39m 2025-02-27 seanad     Seanad Éireann 27th Seanad https:/… https:… debate     
[90m 7[39m 2025-02-27 seanad     Seanad Éireann 27th Seanad https:/… https:… debate     
[90m 8[39m 2025-02-27 seanad     Se

In [119]:
# Parsing XML content and combining preface and debate data
parse_debate_xml <- function(xml_url, dail) {
  
  # Downloading XML from URL
  xml_content <- GET(xml_url)
  
  # Parsing the XML content
  xml_data <- read_xml(content(xml_content, as = "text", encoding = "UTF-8"))
  
  # Defining the XML namespace
  ns <- xml_ns_rename(xml_ns(xml_data), d1 = "akn")
  
  # Extracting values from <preface> block elements
  date_en <- xml_text(xml_find_first(xml_data, ".//akn:block[@name='date_en']/akn:docDate", ns))
  volume <- xml_text(xml_find_first(xml_data, ".//akn:block[@name='volume']/akn:docNumber", ns))
  number <- xml_text(xml_find_first(xml_data, ".//akn:block[@name='number']/akn:docNumber", ns))
  
  # Storing preface information in a data frame
  preface_df <- tibble(
    date_en = date_en,
    volume = volume,
    number = number
  )
  
  # Extracting values from <debateBody> 
  debate_sections <- xml_find_all(xml_data, ".//akn:debateSection", ns)
  debate_list <- lapply(debate_sections, function(section) {
    heading <- xml_text(xml_find_first(section, "akn:heading", ns))
    speeches <- xml_find_all(section, "akn:speech", ns)
    
    # Extracting all speeches and their paragraph information
    speech_data <- lapply(speeches, function(speech) {
      speaker <- xml_attr(speech, "by")
      paragraphs <- xml_text(xml_find_all(speech, "akn:p", ns))
      combined_paragraphs <- paste(paragraphs, collapse = " ")
      
      # Returning a tibble for each speech
      tibble(
        speaker = speaker,
        speech = combined_paragraphs
      )
    })
    
    # Combining all speech data into one data frame
    bind_rows(speech_data)
  })
  
  # Combining all debate sections into one data frame
  debate_df <- bind_rows(debate_list)
  
  # Adding preface data (date, volume, and number) and XML URL & Dail info 
  combined_df <- debate_df %>%
    mutate(
      date_en = preface_df$date_en,
      volume = preface_df$volume,
      number = preface_df$number,
      xml_url = xml_url,  # OPTIONAL
      dail = dail        
    )
  
  combined_df
}

# Filtering out ONLY debates related to "dail"
url_data <- debates_df %>% 
  filter(house_code == "dail") %>% 
  select(xml_url,dail) %>% distinct()

# Initializing an empty data frame to store the final combined data
final_combined_df <- tibble()

# Iterating through the XML URLs and parsing each file, passing xml_url and dail
for (i in seq_len(nrow(url_data))) {
  dail_value <- url_data$dail[i]
  
  combined_data <- suppressWarnings(parse_debate_xml(url_data$xml_url[i], dail_value)) %>%
    select(-xml_url) 
  
  final_combined_df <- bind_rows(final_combined_df, combined_data)
}

# Printing the final combined data frame
print(final_combined_df)

[90m# A tibble: 4,571 × 6[39m
   speaker        speech                             date_en volume number dail 
   [3m[90m<chr>[39m[23m          [3m[90m<chr>[39m[23m                              [3m[90m<chr>[39m[23m   [3m[90m<chr>[39m[23m  [3m[90m<chr>[39m[23m  [3m[90m<chr>[39m[23m
[90m 1[39m #DarrenORourke As this is the first ministerial … Thursd… Vol. … No. 6  34th…
[90m 2[39m #HelenMcEntee  I look forward to working with al… Thursd… Vol. … No. 6  34th…
[90m 3[39m #DarrenORourke I welcome the fact that the Minis… Thursd… Vol. … No. 6  34th…
[90m 4[39m #HelenMcEntee  I think our stakeholders have out… Thursd… Vol. … No. 6  34th…
[90m 5[39m #DarrenORourke We have spent more time identifyi… Thursd… Vol. … No. 6  34th…
[90m 6[39m #HelenMcEntee  The most important thing here is … Thursd… Vol. … No. 6  34th…
[90m 7[39m #RuthCoppinger I know the Minister is new to her… Thursd… Vol. … No. 6  34th…
[90m 8[39m #HelenMcEntee  I thank the Deputy for rais

In [120]:
# Viewing and exploring the extracted combined data
#head(final_combined_df)
#table(final_combined_df$speaker)
#table(final_combined_df$date_en)
#table(final_combined_df$volume)
#table(final_combined_df$number)
#table(final_combined_df$dail)

In [121]:
## Part 2: Text Preprocessing

# Computing ntokens and ntypes
calculate_tokens_types <- function(speech_text) {
  if (is.na(speech_text) || speech_text == "") {
    return(tibble(ntokens = 0, ntypes = 0))
  }
  clean_text <- str_to_lower(speech_text) %>%
    str_replace_all("[^a-z\\s]", "")
  words <- str_split(clean_text, "\\s+")[[1]]
  words <- words[words != "" & !(words %in% stopwords("en"))]  # OPTIONAL 
  tibble(ntokens = length(words), ntypes = length(unique(words)))
}

# Viewing the results 
#summary(final_combined_df %>% select(ntokens, ntypes))

# Note: Stopwords are also removed 

# Cleaning speaker names 
split_name <- function(speaker) {
  gsub("([a-z])([A-Z])", "\\1 \\2", gsub("^#", "", speaker))
}

# Cleaning and preprocessing data - volume, number, speaker 
final_combined_df <- final_combined_df %>%
  mutate(
    speaker = split_name(speaker),
    volume = str_replace(volume, "^Vol\\.\\s*", ""),
    number = str_replace(number, "^No\\.\\s*", "")
  ) %>%
  rowwise() %>%
  mutate(tokens_info = calculate_tokens_types(speech)) %>%
  mutate(ntokens = tokens_info$ntokens, ntypes = tokens_info$ntypes) %>%
  select(-tokens_info) %>%
  ungroup()

# Printing first and last 5 rows
print(head(final_combined_df, 5))
print(tail(final_combined_df, 5))

# Printing dataset dimensions
print(dim(final_combined_df))

# Counting unique speakers
unique_speakers <- final_combined_df %>%
  filter(speaker != "") %>%
  distinct(speaker) %>%
  nrow()
print(paste("Total unique speakers:", unique_speakers))

# Note: There are some missing values in volume and number and speakers (38 missing - no name after #). 

[90m# A tibble: 5 × 8[39m
  speaker        speech               date_en volume number dail  ntokens ntypes
  [3m[90m<chr>[39m[23m          [3m[90m<chr>[39m[23m                [3m[90m<chr>[39m[23m   [3m[90m<chr>[39m[23m  [3m[90m<chr>[39m[23m  [3m[90m<chr>[39m[23m   [3m[90m<int>[39m[23m  [3m[90m<int>[39m[23m
[90m1[39m Darren ORourke As this is the firs… Thursd… 1063   6      34th…      40     35
[90m2[39m Helen Mc Entee I look forward to w… Thursd… 1063   6      34th…     185    134
[90m3[39m Darren ORourke I welcome the fact … Thursd… 1063   6      34th…      53     45
[90m4[39m Helen Mc Entee I think our stakeho… Thursd… 1063   6      34th…     116     91
[90m5[39m Darren ORourke We have spent more … Thursd… 1063   6      34th…      58     49
[90m# A tibble: 5 × 8[39m
  speaker                 speech      date_en volume number dail  ntokens ntypes
  [3m[90m<chr>[39m[23m                   [3m[90m<chr>[39m[23m       [3m[90m<chr>[39