In [1]:
import numpy as np
import pandas as pd
from pymongo import MongoClient
import pprint

In [2]:
client = MongoClient() # defaults to localhost
db = client.bills
bill_details = db.bill_details


In [3]:
bill_details.find().count()

  """Entry point for launching an IPython kernel.


253000

In [4]:
# print out record counts with text
print('--> Number of records in database: {}'.format(bill_details.find().count()))

records_with_text = bill_details.find({'body': {'$regex': 'e'}})
record_count = records_with_text.count()
print('--> Current number of records with text: {}'.format(record_count))


  
  """


--> Number of records in database: 253000
--> Current number of records with text: 11738


In [5]:
# need to execute query everytime i refer to it?
records_with_text = bill_details.find({'body': {'$regex': 'e'}})
data = pd.DataFrame(list(records_with_text))

In [6]:
data.leg_type.unique()

array(['BILL', 'LAW', 'JOINT RESOLUTION', 'CONCURRENT RESOLUTION',
       'RESOLUTION', 'AMENDMENT'], dtype=object)

In [7]:
# filter out simple resolutions, concurrent resolutions, and amendments (for prelim model)
data = data[(data['leg_type'] != 'RESOLUTION') & (data['leg_type'] != 'CONCURRENT RESOLUTION') & (data['leg_type'] != 'AMENDMENT')]

In [8]:
data.bill_status.unique()

array(['Introduced', 'Became Law', 'Passed House', 'To President',
       'Resolving Differences', 'Failed House', 'Became Private Law',
       'Passed Senate', 'Failed to pass over veto', 'Vetoed by President',
       'Passed over veto', 'Pocket vetoed by President', 'Failed Senate'],
      dtype=object)

In [9]:

# LABELS

# Every record that doesn't have status Became Law will have label 0 if before current (115th) congress.
# Try this out with 3 labels.

#  

#                             Whole     House     Senate
# Introduced:                 None      None      None
# Became Law:                 1         1         1
# Passed House:               None      1         None
# To President:               1         1         1
# Resolving Differences:      1         1         1
# Failed House:               0         0         1 if S
# Became Private Law:         1         1         1
# Passed Senate:              None      None      1
# Failed to pass over veto:   1         1         1
# Vetoed by President:        1         1         1
# Passed over veto:           1         1         1     #stronger support for this one???
# Pocket vetoed by President: 1         1         1
# Failed Senate:              0         1 if H    0


In [10]:
# check numbers for each status
for i in data.bill_status.unique():
    num = len(data[data['bill_status'] == i])
    print('{}: \t\t{}'.format(i, num))

Introduced: 		6370
Became Law: 		1439
Passed House: 		2140
To President: 		21
Resolving Differences: 		72
Failed House: 		114
Became Private Law: 		2
Passed Senate: 		42
Failed to pass over veto: 		34
Vetoed by President: 		27
Passed over veto: 		1
Pocket vetoed by President: 		5
Failed Senate: 		1


In [11]:
# create columns for labels
# data['house_label'] = None
# data['senate_label'] = None
# data['president_label'] = None
data['passed'] = None

In [12]:
# # start breaking up the labels after first iteration
# introduced = data[data['bill_status'] == 'Introduced']
# became_law = data[data['bill_status'] == 'Became Law']
# passed_house = data[data['bill_status'] == 'Passed House']
# to_pres = data[data['bill_status'] == 'To President']
# res_diff = data[data['bill_status'] == 'Resolving Differences']
# failed_house = data[data['bill_status'] == 'Failed House']
# became_priv = data[data['bill_status'] == 'Became Private Law']
# passed_senate = data[data['bill_status'] == 'Passed Senate']
# failed_over_veto = data[data['bill_status'] == 'Failed to pass over veto']
# veto_pres = data[data['bill_status'] == 'Vetoed by President']
# passed_over_veto = data[data['bill_status'] == 'Passed over veto']   #more weight to this one?
# pocket_veto = data[data['bill_status'] == 'Pocket vetoed by President']  #this is a pass
# failed_senate = data[data['bill_status'] == 'Failed Senate']


# # all labels on became_law will be 1
# became_law.loc[:, 'house_label'] = 1
# became_law.loc[:, 'senate_label'] = 1
# became_law.loc[:, 'president_label'] = 1
# became_law.loc[:, 'passed'] = 1

# # labels on veto_pres will be 1, 1, 0
# veto_pres.loc[:, 'house_label'] = 1
# veto_pres.loc[:, 'senate_label'] = 1
# veto_pres.loc[:, 'president_label'] = 0



In [103]:
orig_shape = data.shape
print('Shape of entire data before labeling: {}'.format(orig_shape))

Shape of entire data before labeling: (10268, 18)


In [18]:
# break up dataframe into those that became law and others (did not or still pending)
became_law = data[(data['bill_status'] == 'Became Law') | (data['bill_status'] == 'Became Private Law')]
others = data[(data['bill_status'] != 'Became Law') & (data['bill_status'] != 'Became Private Law')]

became_law.loc[:, 'passed'] = 1


# break up others into current congress and previous ones. Anything that hasn't been signed into law
# before current session is dead.
current_cong = others[others['congress_id'] == '115th']
prev_cong = others[others['congress_id'] != '115th']

prev_cong.loc[:, 'passed'] = 0


# let's label To President and Resolving Differences with 1
to_pres = current_cong[(current_cong['bill_status'] == 'To President') | (current_cong['bill_status'] == 'Resolving Differences')]
on_floor = current_cong[(current_cong['bill_status'] != 'To President') & (current_cong['bill_status'] != 'Resolving Differences')]

to_pres.loc[:, 'passed'] = 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [49]:
failed = on_floor[on_floor['bill_status'].str.startswith('Failed')]
not_failed = on_floor[~on_floor['bill_status'].str.startswith('Failed')]

failed.loc[:, 'passed'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [97]:
introduced = not_failed[not_failed['bill_status'] == 'Introduced']
beyond_intro = not_failed[not_failed['bill_status'] != 'Introduced']

introduced.loc[:, 'passed'] = 'in_progress'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [99]:
passed_opp_chamber = beyond_intro[(beyond_intro['bill_status'] == 'Passed House') & (beyond_intro['leg_id'].str.startswith('S')) | 
                          (beyond_intro['bill_status'] == 'Passed Senate') & (beyond_intro['leg_id'].str.startswith('H'))]

passed_opp_chamber.loc[:, 'passed'] = 1


in_orig_chamber = beyond_intro[(beyond_intro['bill_status'] == 'Passed House') & (beyond_intro['leg_id'].str.startswith('H')) | 
                          (beyond_intro['bill_status'] == 'Passed Senate') & (beyond_intro['leg_id'].str.startswith('S'))]    

in_orig_chamber.loc[:, 'passed'] = 'in_progress'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [101]:
data_l = pd.concat([became_law, prev_cong, to_pres, failed, introduced, in_opp_chamber, in_orig_chamber])

In [105]:
labeled_shape = data_l.shape
print('Shape of entire data after labeling: {}'.format(labeled_shape))
print('----------------')

if orig_shape == labeled_shape:
    print('\tNo data loss upon labeling. Continue on your chosen path. DATA SCIENCE YULES!!!')
else:
    print('\tData loss occurred during labeling. Examine your code')    


Shape of entire data after labeling: (10268, 18)
----------------
	No data loss upon labeling. Continue on your chosen path. DATA SCIENCE YULES!!!


In [106]:
data_l.head()

Unnamed: 0,_id,bill_status,body,committee,congress_id,cosponsors,cosponsors_url,desc,intro_date,leg_id,leg_type,leg_url,num_of_cosponsors,sponsor,sponsor_district,sponsor_party,sponsor_state,passed
24,5c11d642cd68d16918e58ec1,Became Law,[Congressional Bills 115th Congress] [From the...,House - Financial Services,115th,,,To extend the National Flood Insurance Program...,11/29/2018,H R 7187,LAW,https://www.congress.gov/bill/115th-congress/h...,0,"Rep. MacArthur, Thomas",3,NJ,R,1
314,5c11d643cd68d16918e58fe3,Became Law,[Congressional Bills 115th Congress] [From the...,"House - Transportation and Infrastructure, Way...",115th,,,"Airport and Airway Extension Act of 2018, Part II",09/26/2018,H R 6897,LAW,https://www.congress.gov/bill/115th-congress/h...,0,"Rep. Shuster, Bill",9,PA,R,1
315,5c11d643cd68d16918e58fe4,Became Law,[Congressional Bills 115th Congress] [From the...,House - Judiciary | Senate - Judiciary,115th,,https://www.congress.gov/bill/115th-congress/h...,United States Parole Commission Extension Act ...,09/26/2018,H R 6896,LAW,https://www.congress.gov/bill/115th-congress/h...,1,"Rep. Sensenbrenner, F. James, Jr.",5,WI,R,1
341,5c11d643cd68d16918e58ffe,Became Law,[115th Congress Public Law 277] [From the U.S....,"House - Oversight and Government Reform, Finan...",115th,,,To rename the Stop Trading on Congressional Kn...,09/25/2018,H R 6870,LAW,https://www.congress.gov/bill/115th-congress/h...,0,"Rep. Tonko, Paul",20,NY,D,1
453,5c11d643cd68d16918e5906e,Became Law,[Congressional Bills 115th Congress] [From the...,House - Judiciary,115th,,https://www.congress.gov/bill/115th-congress/h...,SUCCESS Act,09/10/2018,H R 6758,LAW,https://www.congress.gov/bill/115th-congress/h...,10,"Rep. Chabot, Steve",1,OH,R,1


In [107]:
data_l.passed.value_counts()

in_progress    7118
0              1676
1              1467
Name: passed, dtype: int64