In [1]:
!git clone https://huggingface.co/datasets/iatpp/digitaldemocracy-2015-2018

Cloning into 'digitaldemocracy-2015-2018'...
Updating files: 100% (3/3)
Updating files: 100% (3/3), done.
'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [4]:
import sys
import csv

csv.field_size_limit(sys.maxsize)

#This is where the unzipped corpus file is stored in Colab file storage
CORPUS_FILE_PATH = '/content/DH2024_Corpus_Release/'

#We have 4 state sets
VALID_STATES=["CA", "FL", "NY", "TX"]

#Each state set has 9 csv files for each session year
CSV_FILENAMES=['bills','committeeHearings', 'committeeRosters',
               'committees', 'hearings', 'legislature',
               'people','speeches','videos']

#Load data from CSVs into a Python object to reference later
#Input:
#  Required: file_name (type:String) (Ex: speeches, bills, etc)
#  Optional: states (type:List of Strings or None) (Ex: ["CA"], ["FL,TX"])
#     -If not specified (states=None), function returns data for all states
#  Optional: years (type:List of Ints or None) (Ex:[2018], [2015,2016])
#     -If not specified (years=None), function returns data for all valid years
#Output:
#  Payload (Type: Dict) (Ex: {column_headers:['pid','cid','date'], rows:[[0,2,2018],[2,1,2018]]})
def load_content(file_name, states=None, years=None):
  #Only accept valid states, Corpus only contains data on CA, FL, NY, and TX legislations
  if states is not None and not all(item in VALID_STATES for item in states):
    raise Exception("Invalid State Abbv(s), corpus only contains data on CA, FL, NY, and TX")
  #Only accept valid file names from corpus, like speeches, bills, etc.
  if file_name not in CSV_FILENAMES:
    raise Exception("Invalid filename, must be one of the 9 files provide")
  #Only accept years belonging to a valid legislative session. (2017-2018 for all states, 2015-2016 for CA)
  if years is not None and ((not all(item > 2015 for item in years) and "CA" not in states) or (not all(item <= 2018 for item in years))):
    raise Exception("""Data for requested year not included in corpus.
     Valid session_years are 2017 and 2018 for all states provided. 2015 and 2016 are valid years for CA.""")

  payload = {}
  header_row = True

  #If no states specified, retrieve relevant files for all valid states
  if states is None:
    states = VALID_STATES

  #If no years/session specified, retrieve data for all valid state legislative session years
  if years is None:
    if "CA" in states:
      years= [2015,2016,2017,2018]
    else:
      years = [2017,2018]

  #The following code block operates as follows:
  # For every state and year requested, read the relevant CSV file(s), then
  # load it into a python object (payload) which is returned to user
  for state in states:
    FILE_PATHS = []

    #Build the filepaths to the correct data location given the states and years provided
    #Years 2017 and 2018 are valid inputs that belong to the same 2017-2018 session
    if 2017 in years or 2018 in years:
      FILE_PATHS.append(CORPUS_FILE_PATH + state + "/2017-2018/CSV/" + file_name + ".csv")

    #CA has 2 valid legislative sessions (2015-2016 and 2017-2018)
    #This means the entirety of CA data is located in more than one folder, unlike other states.
    #Looping through a list of filepaths allows us to handle this corner case
    if state == "CA" and (2015 in years or 2016 in years):
      FILE_PATHS.append(CORPUS_FILE_PATH + state + "/2015-2016/CSV/" + file_name + ".csv")

    for FILE_PATH in FILE_PATHS:
      #Open the file to read
      with open(FILE_PATH, newline='') as csvfile:
        rows = csv.reader(csvfile, delimiter=',')
        #Read CSV row by row
        for row in rows:
          #The first row of every CSV we visit is the header row, containing the names for each column
          # We will add this to the payload only once, as every CSV we read after this will be the same headers
          if header_row:
            payload['column_headers'] = row
            #Sets up 'rows' in payload where we will store future records
            payload['rows'] = []
            header_row = False
            continue
          #Load CSV Into payload row by row
          payload['rows'].append(row)

  return payload

#Example Usage:
load_content("legislature", states= ["FL", "TX"], years=[2017])

OverflowError: Python int too large to convert to C long

In [3]:
#Retrieve the Hearing ID, legislative session, and hearing date where a provided Bill ID is discussed
#Input:
#  Required: BID (type:String) (Ex: Full 'FL_20170HB883' or partial '883' bill ID)
#  Optional: partial_match (Type:Boolean) (Default is False)
#  Optional: speeches
#e      -If searching for a bid from a specific state or session, pass in
#         speeches=load_content("speeches", specific states, specific years)
#Output:
#   (Type: List[String, Int, Int, Date]) (Ex: BID, HID, Session, Hearing Date)
#   If no matches exist does not exist in database, returns None
def find_bill_from_bid(bid, partial_match=False, speeches=None):
  #These are the indexes/locations of information within each row
  HID_IDX = 3
  BID_IDX = 4
  SESSION_IDX = 7
  DATE_IDX = 6
  #If data not provided, search through data from all states and sessions
  if speeches is None:
    speeches = load_content("speeches")
  #List of unique HIDS
  HIDS = []
  matches = []
  #For every speech, check if the BID being discussed matches the provided BID
  #If matching, add relevant BID, HID, Session, and Date to matches
  #Append HID to HIDS to prevent duplicates
  for row in speeches['rows']:
    if not partial_match and row[BID_IDX] == bid and row[HID_IDX] not in HIDS:
      matches.append([bid, row[HID_IDX], row[SESSION_IDX], row[DATE_IDX]])
      HIDS.append(row[HID_IDX])
    if partial_match and bid in row[BID_IDX] and row[HID_IDX] not in HIDS:
      matches.append([row[BID_IDX], row[HID_IDX], row[SESSION_IDX], row[DATE_IDX]])
      HIDS.append(row[HID_IDX])

  return matches
#Example usage
find_bill_from_bid('883', partial_match=True, speeches=load_content("speeches", states=["FL"]))

NameError: name 'load_content' is not defined