In [1]:
# %%writefile bill_text_scraper.py
import codecs
import json
import os
import pandas as pd
import numpy as np


In [2]:
def read_jsonl_file(path):
    '''Read a JSON Lines file and return its records as a list.

    Each non-blank line of the UTF-8 encoded file at ``path`` is decoded with
    ``json.loads`` and appended to the result in file order.

    Parameters
    ----------
    path : str
        Location of the ``.jsonl`` file to read.

    Returns
    -------
    list
        One decoded JSON value per non-blank line.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    '''
    records = []
    # The context manager closes the handle even when a line fails to parse.
    with codecs.open(path, 'r', 'utf-8') as f:
        for line in f:
            # rstrip treats its argument as a set of characters, so this removes
            # any trailing '\n', '\r' and '|' characters before decoding.
            payload = line.rstrip('\n|\r')
            # Blank lines (e.g. a trailing newline at end of file) carry no record.
            if not payload.strip():
                continue
            records.append(json.loads(payload))

    return records

In [3]:
# Collect [year, issue] pairs from every vote_results* file in ../data.
# Records with an empty issue, or whose issue text contains QUORUM, JOURNAL,
# MOTION or ADJOURN, are left out.
# NOTE: os.listdir returns entries in arbitrary order, so the order of
# bill_list follows whatever order the filesystem reports.
bill_list = []
for filename in os.listdir('../data'):
    if not filename.startswith('vote_results'):
        continue
    print(filename)
    file = read_jsonl_file(os.path.join('../data', filename))

    for line in file:
        issue = line['issue']
        # Logical and/any instead of bitwise & chained over parenthesised booleans.
        if issue != '' and not any(
            keyword in issue
            for keyword in ('QUORUM', 'JOURNAL', 'MOTION', 'ADJOURN')
        ):
            bill_list.append([line['year'], issue])
        

vote_results_1995.jsonl
vote_results_2004.jsonl
vote_results_1997.jsonl
vote_results_2006.jsonl
vote_results_2002.jsonl
vote_results_1993.jsonl
vote_results_2000.jsonl
vote_results_1991.jsonl
vote_results_2007.jsonl
vote_results_1996.jsonl
vote_results_2005.jsonl
vote_results_1994.jsonl
vote_results_2018.jsonl
vote_results_1990.jsonl
vote_results_2001.jsonl
vote_results_1992.jsonl
vote_results_2003.jsonl
vote_results_2016.jsonl
vote_results_2014.jsonl
vote_results_2010.jsonl
vote_results_2009.jsonl
vote_results_1998.jsonl
vote_results_2012.jsonl
vote_results_2015.jsonl
vote_results_2017.jsonl
vote_results_2013.jsonl
vote_results_2011.jsonl
vote_results_1999.jsonl
vote_results_2008.jsonl


In [4]:
# Label the two fields of each collected [year, issue] pair and build the frame.
cols = ['year', 'issue']
bills = pd.DataFrame(data=bill_list, columns=cols)

In [5]:
# drop_duplicates() and dropna() each return a new frame, so rebind the chained
# result; otherwise the rows containing nulls would remain in `bills`.
bills = bills.drop_duplicates().dropna()
bills

Unnamed: 0,year,issue
0,1995,H R 666
2,1995,H R 665
3,1995,H RES 57
4,1995,H R 2
15,1995,H R 400
16,1995,H R 5
45,1995,H J RES 1
53,1995,H CON RES 17
54,1995,H RES 44
68,1995,H RES 38


In [6]:
# Preview the first three rows.
bills.head(3)

Unnamed: 0,year,issue
0,1995,H R 666
2,1995,H R 665
3,1995,H RES 57


In [7]:
def _ordinal_suffix(n):
    '''Return the English ordinal suffix ('st', 'nd', 'rd' or 'th') for integer n.'''
    # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3
    # (111th, not 111st).
    if 11 <= n % 100 <= 13:
        return 'th'
    return {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')


# create congress ids for url crawl, e.g. '101st-congress' ... '116th-congress'
cong_id_list = [
    '{}{}-congress'.format(n, _ordinal_suffix(n))
    for n in range(101, 117)
]

# First (odd) and second (even) calendar year paired with each congress id.
years_odd = list(range(1989, 2019, 2))
years_even = list(range(1990, 2020, 2))

# create dictionary of years and congress_ids
# zip stops at the shorter input, so the 16th id ('116th-congress') gets no
# year: there are only 15 odd and 15 even years.
congress_ids = {}
congress_ids.update(zip(years_odd, cong_id_list))
congress_ids.update(zip(years_even, cong_id_list))

In [8]:
# Spot-check the year -> congress id mapping for a single year.
congress_ids[1999]

'106th-congress'

In [9]:
# Add a 'congress_id' column to bills with every row set to None.
bills['congress_id'] = None

In [10]:
# Look up every row's congress id from its year in one vectorised pass,
# addressing columns by name rather than by hard-coded position.
congress_id_by_row = bills['year'].map(congress_ids)

# Series.map leaves NaN where a year has no entry; fail loudly instead, as a
# direct dictionary lookup would.
unmapped_years = bills.loc[congress_id_by_row.isna(), 'year'].unique()
if len(unmapped_years):
    raise KeyError('no congress id for year(s): {}'.format(list(unmapped_years)))

bills['congress_id'] = congress_id_by_row

In [11]:
# Preview the first twenty rows.
bills.head(20)

Unnamed: 0,year,issue,congress_id
0,1995,H R 666,104th-congress
2,1995,H R 665,104th-congress
3,1995,H RES 57,104th-congress
4,1995,H R 2,104th-congress
15,1995,H R 400,104th-congress
16,1995,H R 5,104th-congress
45,1995,H J RES 1,104th-congress
53,1995,H CON RES 17,104th-congress
54,1995,H RES 44,104th-congress
68,1995,H RES 38,104th-congress


In [12]:
# Maps the leading part of an issue string (everything before the final
# space-separated token, e.g. 'H R' from 'H R 666') to a hyphenated
# bill-type label. Keys starting with 'H' map to house-* labels and keys
# starting with 'S' map to senate-* labels.
bill_types = {
    'H R': 'house-bill',
    'H RES': 'house-resolution', 
    'H J RES': 'house-joint-resolution',
    'H CON RES': 'house-concurrent-resolution',
    'S': 'senate-bill', 
    'S RES': 'senate-resolution', 
    'S J RES': 'senate-joint-resolution',
    'S CON RES': 'senate-concurrent-resolution'    
}

In [13]:
# Everything before the last space of the issue is the type prefix
# (e.g. 'H R 666' -> 'H R'); translate it through bill_types in one
# vectorised pass, addressing columns by name rather than by position.
issue_prefix = bills['issue'].str.rsplit(pat=' ', n=1).str[0]
bill_type_by_row = issue_prefix.map(bill_types)

# Series.map leaves NaN for prefixes missing from bill_types; fail loudly
# instead, as a direct dictionary lookup would.
unknown_prefixes = issue_prefix[bill_type_by_row.isna()].unique()
if len(unknown_prefixes):
    raise KeyError('no bill type for prefix(es): {}'.format(list(unknown_prefixes)))

bills['bill_type'] = bill_type_by_row

In [14]:
# Demonstrate the prefix extraction on one sample issue string:
# split once from the right and keep the part before the final token.
t = 'H R 666'
t.rsplit(sep=' ', maxsplit=1)[0]


'H R'