## library code

In [28]:
import sys
import csv

# Raise the csv module's per-field size cap as high as the platform accepts.
# sys.maxsize does not fit in a C long on some platforms (e.g. Windows), where
# csv.field_size_limit raises OverflowError; back off until a value is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 10

#Root folder of the unzipped corpus in Colab file storage.
#load_content() appends "<STATE>/<SESSION>/CSV/<file_name>.csv" to this prefix,
#so the trailing slash is required.
CORPUS_FILE_PATH = 'DH2024_Corpus_Release/'

#The 4 state sets in the corpus; load_content() rejects any other abbreviation
VALID_STATES=["CA", "FL", "NY", "TX"]

#Each state set has 9 csv files for each session year.
#These are the file names (without the ".csv" extension) accepted by load_content()
CSV_FILENAMES=['bills','committeeHearings', 'committeeRosters',
               'committees', 'hearings', 'legislature',
               'people','speeches','videos']

#Load data from CSVs into a Python object to reference later
def load_content(file_name, states=None, years=None):
  """Read one corpus table for the requested states/years into one payload.

  Input:
    Required: file_name (type: String) (Ex: "speeches", "bills", etc).
       Must be one of CSV_FILENAMES (no ".csv" extension).
    Optional: states (type: List of Strings or None) (Ex: ["CA"], ["FL", "TX"])
       -If not specified (states=None), data for all VALID_STATES is returned.
    Optional: years (type: List of Ints or None) (Ex: [2018], [2015, 2016])
       -If not specified (years=None), every session available for the
        selected states is returned.
       -Years only select session folders: 2017 or 2018 selects the whole
        2017-2018 folder, 2015 or 2016 selects CA's 2015-2016 folder. Rows are
        not filtered by year inside a file.

  Output:
    Payload (type: Dict) with two keys that are always present:
      'column_headers': header row of the first CSV read ([] if none was read)
      'rows': list of data rows (each a list of strings) from every CSV read,
              in the order states were given; header rows are never included.
    (Ex: {'column_headers': ['pid','cid','date'], 'rows': [['0','2','2018']]})

  Raises:
    Exception: for an unknown state abbreviation, an unknown file name, or a
      year outside the sessions available for the selected states.
    OSError: from open() if an expected CSV file is missing.
  """
  #Only accept valid states, Corpus only contains data on CA, FL, NY, and TX legislations
  if states is not None and not all(item in VALID_STATES for item in states):
    raise Exception("Invalid State Abbv(s), corpus only contains data on CA, FL, NY, and TX")
  #Only accept valid file names from corpus, like speeches, bills, etc.
  if file_name not in CSV_FILENAMES:
    raise Exception("Invalid filename, must be one of the 9 files provide")

  #If no states specified, retrieve relevant files for all valid states.
  #Resolved before the year check so that check never sees states=None.
  if states is None:
    states = VALID_STATES

  if years is None:
    #If no years/session specified, use every session the selected states have
    years = [2015, 2016, 2017, 2018] if "CA" in states else [2017, 2018]
  else:
    #Only accept years belonging to a valid legislative session.
    #(2017-2018 for all states, 2015-2016 additionally when CA is requested)
    earliest_year = 2015 if "CA" in states else 2017
    if not all(earliest_year <= item <= 2018 for item in years):
      raise Exception("""Data for requested year not included in corpus.
     Valid session_years are 2017 and 2018 for all states provided. 2015 and 2016 are valid years for CA.""")

  #Both keys exist from the start so callers can always index payload['rows']
  payload = {'column_headers': [], 'rows': []}
  headers_recorded = False

  #For every state requested, read the relevant CSV file(s) and append their
  #data rows to the payload.
  for state in states:
    session_dirs = []

    #Years 2017 and 2018 are valid inputs that belong to the same 2017-2018 session
    if 2017 in years or 2018 in years:
      session_dirs.append("/2017-2018/CSV/")

    #CA has 2 valid legislative sessions (2015-2016 and 2017-2018), so its data
    #lives in more than one folder, unlike the other states.
    if state == "CA" and (2015 in years or 2016 in years):
      session_dirs.append("/2015-2016/CSV/")

    for session_dir in session_dirs:
      file_path = CORPUS_FILE_PATH + state + session_dir + file_name + ".csv"
      with open(file_path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        #The first row of EVERY file is its header row; consume it per file so
        #headers of the 2nd, 3rd, ... file are not stored as data rows.
        header = next(reader, None)
        if header is None:
          #Completely empty file: nothing to load
          continue
        #Headers are recorded only once; the code assumes every file of the
        #same table shares the same columns.
        if not headers_recorded:
          payload['column_headers'] = header
          headers_recorded = True
        payload['rows'].extend(reader)

  return payload

In [54]:
from datetime import datetime, timedelta
#Helper for transcript timestamps: shift a clock time by a number of seconds
def add_seconds(start_time, seconds_to_add):
    """Return `start_time` moved forward by `seconds_to_add`, as 'HH:MM:SS'.

    `start_time` must be a string in '%H:%M:%S' form. Only the time of day is
    formatted back out, so results past midnight wrap around to 00:00:00.
    """
    shifted = datetime.strptime(start_time, '%H:%M:%S') + timedelta(seconds=seconds_to_add)
    return format(shifted, '%H:%M:%S')

def get_hearing_ids():
  """Return the first column of every row in the "hearings" table.

  The table is loaded for all states and years via load_content(); this code
  treats column 0 of each row as the hearing id.
  """
  hearing_rows = load_content("hearings")['rows']
  return [record[0] for record in hearing_rows]

def get_hearing_transcript(hid, bid=None, speeches=None):
  """Collect the speech rows of one hearing as a list of dicts.

  Input:
    hid: hearing id; converted to str before comparing with column 3.
    bid: optional bill id; when given, only rows whose column 4 equals it
         are kept. It is compared as-is (not converted to str).
    speeches: optional payload as returned by load_content("speeches").
         When None, the speeches table is loaded for all states and years.
  Output:
    List of dicts, in row order, with the keys 'video start', 'video end',
    'offset', 'bid', 'first name', 'last name', 'pid' and 'text'.
    'offset' is column 11 read as whole seconds and formatted as HH:MM:SS.
  """
  #Positions of the columns this function reads from each speeches row
  COL_PID, COL_HID, COL_BID = 1, 3, 4
  COL_VIDEO_START, COL_VIDEO_END, COL_OFFSET_SECONDS = 9, 10, 11
  COL_LAST_NAME, COL_FIRST_NAME, COL_TEXT = 14, 15, 16

  if speeches is None:
    speeches = load_content("speeches")

  wanted_hid = str(hid)
  transcript = []
  for record in speeches['rows']:
    #Skip rows that belong to another hearing
    if wanted_hid != record[COL_HID]:
      continue
    #Skip rows for other bills, but only when a bill filter was requested
    if bid is not None and bid != record[COL_BID]:
      continue
    elapsed = add_seconds("00:00:00", int(record[COL_OFFSET_SECONDS]))
    transcript.append({
      'video start': record[COL_VIDEO_START],
      'video end': record[COL_VIDEO_END],
      'offset': elapsed,
      'bid': record[COL_BID],
      'first name': record[COL_FIRST_NAME],
      'last name': record[COL_LAST_NAME],
      'pid': record[COL_PID],
      'text': record[COL_TEXT],
    })
  return transcript

# Accepts the output of get_hearing_transcript() and prints a transcript.
def pprint_discussion(transcript_info):
  """Print each transcript entry, with a header whenever the video changes.

  `transcript_info` is an iterable of dicts with the keys 'video start',
  'bid', 'offset', 'first name', 'last name' and 'text'. A header naming the
  bill and the video is printed each time 'video start' differs from the
  previous entry's value. Nothing is returned.
  """
  print("] printing transcript: ")

  last_video = -1  # sentinel: no video seen yet
  for entry in transcript_info:
    current_video = entry['video start']
    if current_video != last_video:
      print()
      print("] Discussing {}".format(entry['bid']))
      print("] Video: {}".format(current_video))
      print()
      last_video = current_video
    print("[{}] {} {}: ".format(entry['offset'], entry['first name'], entry['last name']))
    print("\t{}".format(entry['text']))
  print()

## actual code

In [None]:
# Load the speeches table once and reuse it for every hearing; calling
# get_hearing_transcript(hid) without `speeches` re-reads every speeches CSV
# on each iteration.
speeches = load_content("speeches")
for hid in get_hearing_ids()[:2]:
    pprint_discussion(get_hearing_transcript(hid, speeches=speeches))

In [6]:
# Example usage of the loader:
#   load_content("legislature", states= ["FL", "TX"], years=[2017])
# Print only the rows of hearing 10003 whose bill id is CA_201720180SR7.
pprint_discussion(get_hearing_transcript(10003, "CA_201720180SR7"))

] printing transcript: 

] Discussing CA_201720180SR7
] Video: 16523

[00:00:08] Kevin De Leon: 
	If I can have the members please take their seats to our invited guests if they could be so kind enough up in the gallery, as well as behind the columns if you could please take your seats. Sergeants, can we get an extra chair, please, for Senator Lara? Can we have some chair swapping, perhaps? Good morning to each and every one of you, and welcome to the start of the 2017-18 Senate State of the California session. It is indeed an honor and pleasure to receive each and every one of you here today. I want to welcome all of you. This is indeed a glorious, beautiful day. In the greatest state in the greatest country in the world. It's an honor to have each and every one of here today, these Senators. These Senators-elect. The incredible Senator President Pro-Tem emerituses who are here today, too, as well as the family members and loved ones who have graced us with their presence here today. 

In [56]:
# Print every speech row of hearing 10004 (no bill filter is passed).
pprint_discussion(get_hearing_transcript(10004))

] printing transcript: 

] Discussing CA_201720180HR4
] Video: 16521

[00:35:42] Ian Calderon: 
	Thank you, Mr. Speaker. HR4 Rendon is at the desk, I request unanimous consent to suspend Assembly rule 63 and allow HR4 to be taken up today without reference to file for the purposes of third reading.
[00:35:53] Kevin Mullin: 
	So, second by Mr. Cooper. Ms. Walburn, for what purposes do you rise?
[00:36:19] Marie Waldron: 
	We oppose the warf motion.
[00:36:22] Kevin Mullin: 
	Ms. Waldron is withholding unanimous consent. This is a motion to suspend the rules before the body. Motion by Mr. Calderon, second by Mr. Cooper. Members, this is a procedural vote on the suspension of the rules, takes 41. Clerk will open the roll. Mr. Calderon is asking for an aye vote. Mr. Calderon is asking for an aye vote. Ms. Waldron is asking for a no vote. Ms. Waldron is asking for a no. 41 votes on the rule suspension. Clerk will close roll. Ayes 55, noes 25. The rules are suspended. Mr. Speaker, are you re

In [30]:
# Keep the transcript rows of hearing 10003 / bill CA_201720180SR7 for the cells below.
ts = get_hearing_transcript(10003, "CA_201720180SR7")

In [31]:
len(ts)

149

In [40]:
import spacy
from spacy import displacy

# Load the "en_core_web_sm" spaCy pipeline once at module level; find_self() reuses it for every speech.
nlp = spacy.load("en_core_web_sm")
def find_self(first, last, speech, idx=None):
    """Report PERSON entities in `speech` that contain the speaker's own name.

    Runs the module-level spaCy pipeline `nlp` over `speech`. For every entity
    labelled PERSON whose text contains `first` or `last` (case-insensitive
    substring match), prints "<first> <last> : <entity>". If at least one
    entity matched, renders the document's entities with displacy and prints
    `idx` (or an empty string when idx is None) followed by the speech text.

    Args:
        first: speaker's first name.
        last: speaker's last name.
        speech: text of the speech to scan.
        idx: optional label printed in front of the speech (e.g. its position
            in the transcript).

    Returns:
        None; all results are printed/rendered.
    """
    # Drop blank name parts: "" is a substring of every string, so an empty
    # first or last name would otherwise match every PERSON entity.
    name_parts = [part.lower() for part in (first, last) if part and part.strip()]
    if not name_parts:
        # Nothing to search for, so skip the (comparatively costly) NLP pass.
        return

    doc = nlp(speech)
    found = False
    for ent in doc.ents:
        if ent.label_ != "PERSON":
            continue
        ent_text = ent.text.lower()
        if any(part in ent_text for part in name_parts):
            print(first, last, ":", ent)
            found = True

    if found:
        displacy.render(doc, style='ent')
        print(idx if idx is not None else "", speech)
        
# Check every line of the saved transcript `ts` for speakers who name themselves;
# idx is the line's position in `ts` and is printed next to any match.
for idx, line in enumerate(ts):
    find_self(line['first name'], line['last name'], line['text'], idx)

Alex Padilla : Alex Padilla


11 I, Alex Padilla, Secretary of State of the State of California hereby certify that according to the semi-official results provided to my office by county elections officials as of December 2nd, 2016, the persons whose names are here and after set forth appear to have received the plurality of votes in their respective State Senate districts at the November 8th, 2016 general election. That opposite their respective name are the State Senate districts from which they have, as of this date, appeared to have received a plurality of the votes and the names of the county or counties comprising or forming a portion of said districts. With the name of any county entirely within or comprising the District shown in capital letters.
Tammy Manning : Tammy Manning


26 I, Tammy Manning.
Kevin De Leon : Kevin De Leon


30 I, Kevin De Leon.
Mike Morrell : Morrell


100 Senator Morrell, thank you. It's good to be back, and it's good to see all of you guys today. A lot of good things I read in that resolution from our Senator, thank you very much, but a couple of the comments on the floor I just don't want to get past us. They talked about, one of my friends from San Jose talked about constitutionalism and what all that means. Going back to the actual immigration policies, America's the first nation not founded by a race, nor was it founded by territories. It was founded by immigrants, but there was only one type of person that was supposed to be here, and that was an American person and when Lincoln said it's Americanists, the glue that holds us together, are people having a thorough knowledge of the United States Constitution, the Bill of Rights, and the Declaration of Independence. He said that's the electric cord which binds us together and so some of the things that are said here today are not constitutional, we should stick to that, and that'