In [1]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
from my_tools import get_bill_data, process_corpus
import matplotlib.pyplot as plt

In [2]:
data = get_bill_data()

------------------
------------------
Data includes bills, joints resolutions, and laws with text from the 110th Congress (2007) to present
Make changes in my_tools.get_bill_data to modify the data set.
------------------


In [3]:
client = MongoClient()
db = client.bills
bill_info = db.bill_info

In [4]:
print('Amendment Counts')
print('cong_id\tWith\tWithout')

for i in range(110, 116):
    cong_id = str(i)
    with_amend = bill_info.count_documents({'congress_id': cong_id, 'num_of_amendments': {'$regex': '(.+)'}})
    wo_amend = bill_info.count_documents({'congress_id': cong_id, 'num_of_amendments': None})
    print('{}: \t{} \t{}'.format(cong_id, with_amend, wo_amend))
    

Amendment Counts
cong_id	With	Without
110: 	4538 	6686
111: 	7129 	3641
112: 	9437 	1175
113: 	8798 	285
114: 	10223 	0
115: 	11412 	0


In [5]:
data.iloc[:3, :10]

Unnamed: 0,_id,bill_status,body,committee,congress_id,cosponsors,cosponsors_url,desc,intro_date,leg_id
0,5c2d51551417de4b3aaa8d87,Became Law,[Congressional Bills 115th Congress] [From the...,House - Small Business | Senate - Small Busine...,115,,https://www.congress.gov/bill/115th-congress/h...,Small Business Runway Extension Act of 2018,2018-07-11,H R 6330
1,5c2d51bd1417de4b3aaa8dce,Became Law,[Congressional Bills 115th Congress] [From the...,House - Judiciary | Senate - Judiciary,115,,https://www.congress.gov/bill/115th-congress/h...,Protecting Access to the Courts for Taxpayers Act,2017-10-10,H R 3996
2,5c2d522d1417de4b3aaa8e3a,Became Law,[Congressional Bills 115th Congress] [From the...,House - Natural Resources | Senate - Indian Af...,115,,https://www.congress.gov/bill/115th-congress/h...,"To repeal the Act entitled ""An Act to confer j...",2017-02-15,H R 1074


In [6]:
data.iloc[:3, 10:]

Unnamed: 0,leg_type,leg_url,num_of_amendments,num_of_cosponsors,sponsor,sponsor_district,sponsor_party,sponsor_state,bill_char_counts,intro_month,session,char_count_bucket,labels
0,LAW,https://www.congress.gov/bill/115th-congress/h...,0.0,6,"Knight, Stephen",25,R,CA,981,7,1,less than 1000,1
1,LAW,https://www.congress.gov/bill/115th-congress/h...,0.0,2,"Issa, Darrell E.",49,R,CA,990,10,1,less than 1000,1
2,LAW,https://www.congress.gov/bill/115th-congress/h...,0.0,3,"Blum, Rod",1,R,IA,927,2,1,less than 1000,1


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52366 entries, 0 to 52365
Data columns (total 23 columns):
_id                  52366 non-null object
bill_status          52366 non-null object
body                 52366 non-null object
committee            51870 non-null object
congress_id          52366 non-null object
cosponsors           0 non-null object
cosponsors_url       37342 non-null object
desc                 52366 non-null object
intro_date           52366 non-null datetime64[ns]
leg_id               52366 non-null object
leg_type             52366 non-null object
leg_url              52366 non-null object
num_of_amendments    40580 non-null float64
num_of_cosponsors    52366 non-null int64
sponsor              52366 non-null object
sponsor_district     33908 non-null object
sponsor_party        52366 non-null object
sponsor_state        52366 non-null object
bill_char_counts     52366 non-null int64
intro_month          52366 non-null int64
session              52366 no

In [9]:
data.labels.value_counts()

0    50162
1     2204
Name: labels, dtype: int64

In [19]:
data_features = data.loc[:, [
                            'sponsor',
                            'num_of_amendments'
                            'num_of_cosponsors', 
                            'sponsor_party', 
                            'sponsor_state', 
#                             'bill_char_counts', 
                            'char_count_bucket',
                            'intro_month', 
                            'session', 
                            'labels'
                            ]]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [20]:
# get dummies for intro_month, sponsor_party, sponsor_state, session
data_dumm = pd.get_dummies(data_features, columns = [
                                            'sponsor',
                                            'intro_month', 
                                            'sponsor_party', 
                                            'sponsor_state', 
                                            'session', 
                                            'char_count_bucket'
                                            ], 
                           drop_first=True)

In [21]:
data_dumm.shape

(52366, 1033)