In [1]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pymongo import MongoClient

from my_tools import get_bill_data, process_corpus

In [5]:
# Open a MongoClient with default connection settings and bind the
# `bill_info` collection of the `bills` database.
client = MongoClient()
db = client.bills
bill_info = db.bill_info

# Filter fragment reused below: the field must match the regex '(.+)',
# i.e. hold at least one character of text.
has_text = {'$regex': '(.+)'}

# Monitoring progress of data into Mongo: total size first, then how many
# documents have / lack the bill text and the amendment count.
total_documents = bill_info.count_documents({})
print('Number of documents in database:\t{}'.format(total_documents))

records_with_text = bill_info.count_documents({'body': has_text})
print('Documents with bill text:\t\t{}'.format(records_with_text))

records_wo_text = bill_info.count_documents({'body': None})
print('Documents without bill text:\t\t{}'.format(records_wo_text))

records_with_amend_count = bill_info.count_documents({'num_of_amendments': has_text})
print('Documents with amend count:\t\t{}'.format(records_with_amend_count))

records_wo_amend_count = bill_info.count_documents({'num_of_amendments': None})
print('Documents without amend count:\t\t{}'.format(records_wo_amend_count))


Number of documents in database:	63348
Documents with bill text:		63338
Documents without bill text:		10
Documents with amend count:		49123
Documents without amend count:		14225


In [3]:
# Load the bill data set through the project helper and bind it to `data`.
# NOTE: `data` is rebound further down from a direct Mongo query, so the
# value seen by later cells depends on which of the two cells ran last.
data = get_bill_data()

# Display summary statistics for the loaded data.
data.describe()

------------------
------------------
Data includes bills, joints resolutions, and laws with text from the 110th Congress (2007) to present
Make changes in my_tools.get_bill_data to modify the data set.
------------------


Unnamed: 0,num_of_cosponsors,bill_char_counts,intro_month,session
count,52366.0,52366.0,52366.0,52366.0
mean,12.265057,15981.59,5.520643,1.612134
std,29.22665,72200.81,3.265627,0.487268
min,0.0,432.0,1.0,1.0
25%,0.0,2181.0,3.0,1.0
50%,2.0,4495.5,5.0,2.0
75%,11.0,10997.5,8.0,2.0
max,432.0,2758251.0,12.0,2.0


In [None]:
# Get the Mongo data and convert the Mongo query results to a DataFrame.
# Open question from the author: does the query (.find) have to be executed
# every time the result is referred to?
# NOTE(review): presumably yes for the find() result itself, which looks like a
# one-pass cursor (confirm against the pymongo docs); the DataFrame built from
# it below is an ordinary in-memory object and can be reused freely.
# NOTE: `records_with_text` held an integer count in the progress-check cell;
# here the same name is rebound to the find() result.
records_with_text = bill_info.find({'body': {'$regex': '(.+)'}})
data = pd.DataFrame(list(records_with_text))


In [None]:
# (rows, columns) of the frame built from the Mongo query.
data.shape

In [None]:
# Peek at the first rows of the frame.
data.head()

In [None]:
# Summary statistics of the frame before any cleanup.
data.describe()

In [None]:
# DATA CLEANUP
# For the preliminary model, drop simple resolutions, concurrent resolutions
# and amendments; rows with any other leg_type are kept.
excluded_leg_types = ['RESOLUTION', 'CONCURRENT RESOLUTION', 'AMENDMENT']
keep_row = ~data['leg_type'].isin(excluded_leg_types)
data = data[keep_row].copy()


In [None]:
# Frame size after dropping resolutions and amendments.
data.shape

In [None]:
# Add a column holding the number of characters in each bill's text.
bill_lengths = [len(bill_text) for bill_text in data['body']]
data['bill_char_counts'] = bill_lengths


In [None]:
# Convert the intro_date column from text to datetime in one vectorised call.
# Only the first 10 characters (MM/DD/YYYY) are parsed; anything after them
# is ignored. Missing values come out as NaT; a string that does not match
# the format still raises.
data['intro_date'] = pd.to_datetime(data['intro_date'].str[:10], format='%m/%d/%Y')

In [None]:

# Month of introduction, taken from the parsed intro_date.
data['intro_month'] = data['intro_date'].dt.month

# Session from the introduction year: odd years are Session 1, even years are
# Session 2. This is derived from the calendar year and not from congress_id:
# the parity of the congress number is the same for every bill of a Congress,
# so it cannot tell the two sessions apart.
# NOTE(review): a bill introduced in the first days of January of an odd year,
# before the new Congress convenes, would be tagged Session 1 -- confirm
# whether such rows exist.
data['session'] = 2 - (data['intro_date'].dt.year % 2)

# Drop rows whose num_of_cosponsors holds one of the known non-numeric values
# instead of a count: S. Rept. 110-184, TXT, All Actions.
non_numeric_cosponsors = ['S. Rept. 110-184', 'TXT', 'All Actions']
data = data[~data['num_of_cosponsors'].isin(non_numeric_cosponsors)].copy()

# Convert num_of_cosponsors to a numeric dtype in a single vectorised call;
# any remaining non-numeric value raises, as before.
data['num_of_cosponsors'] = pd.to_numeric(data['num_of_cosponsors'])

In [None]:
# Frame size after removing rows with non-numeric cosponsor counts.
data.shape

In [None]:
# Bucket bills by the character count of their text: 1000-character bands up
# to 10000 plus one open-ended band above that. Upper bounds are inclusive, so
# a bill of exactly 1000 characters lands in 'less than 1000'.
char_count_edges = [float('-inf'), 1000, 2000, 3000, 4000, 5000,
                    6000, 7000, 8000, 9000, 10000, float('inf')]
char_count_labels = [
    'less than 1000',
    '1001 - 2000',
    '2001 - 3000',
    '3001 - 4000',
    '4001 - 5000',
    '5001 - 6000',
    '6001 - 7000',
    '7001 - 8000',
    '8001 - 9000',
    '9001 - 10000',
    'greater than 10000',
]

# pd.cut uses right-closed intervals by default, i.e. (low, high]. The result
# is cast back to plain strings (object dtype) rather than left categorical.
# Rows keep their existing order, and a row with a missing count keeps a
# missing bucket instead of being dropped.
# NOTE(review): per-bucket frames are not materialised here; use
# data.groupby('char_count_bucket') if a single bucket is needed.
data['char_count_bucket'] = pd.cut(
    data['bill_char_counts'],
    bins=char_count_edges,
    labels=char_count_labels,
).astype(object)


In [None]:
# Frame size after assigning character-count buckets.
data.shape

In [None]:

# LABELING
# Split the frame on bill_status: bills that became (private) law versus all
# others (did not pass, or still pending).
enacted_statuses = ['Became Law', 'Became Private Law']
is_enacted = data['bill_status'].isin(enacted_statuses)

became_law = data[is_enacted].copy()
others = data[~is_enacted].copy()

# Enacted bills get label 1.
became_law.loc[:, 'labels'] = 1


In [None]:
# Sizes of the enacted and not-enacted subsets, each as (rows, columns).
print(became_law.shape, others.shape)

In [None]:
# Split the not-enacted bills into the current Congress ('115') and all
# previous ones. Anything that has not been signed into law before the current
# session is dead, so bills from previous Congresses get label 0. Currently,
# all bills vetoed by the president come from previous Congresses.
in_current_congress = others['congress_id'] == '115'

current_cong = others[in_current_congress].copy()
prev_cong = others[~in_current_congress].copy()

prev_cong.loc[:, 'labels'] = 0

print(current_cong.shape, prev_cong.shape)

In [None]:
# Within the current Congress, bills with status 'To President' or
# 'Resolving Differences' get label 1. Everything else is still on the floor.
near_passage_statuses = ['To President', 'Resolving Differences']
is_near_passage = current_cong['bill_status'].isin(near_passage_statuses)

to_pres = current_cong[is_near_passage].copy()
on_floor = current_cong[~is_near_passage].copy()

to_pres.loc[:, 'labels'] = 1

print(to_pres.shape, on_floor.shape)
