In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pymongo import MongoClient

from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

from my_tools import get_bill_data, process_corpus, write_json_file, write_jsonl_file


  from numpy.core.umath_tests import inner1d


In [2]:
# Load the bill data set. `data` is the DataFrame inspected in the cells below;
# `in_progress` is a second frame that also has a `session` column
# (presumably bills without a final outcome yet — confirm in my_tools.get_bill_data).
data, in_progress = get_bill_data()

------------------
------------------
Data includes bills, joints resolutions, and laws with text from the 110th Congress (2007) to present
Make changes in my_tools.get_bill_data to modify the data set.
------------------


In [3]:
# Column dtypes and non-null counts for the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52366 entries, 0 to 52365
Data columns (total 24 columns):
_id                  52366 non-null object
bill_status          52366 non-null object
body                 52366 non-null object
committee            51870 non-null object
congress_id          52366 non-null object
cosponsors           0 non-null object
cosponsors_url       37342 non-null object
desc                 52366 non-null object
intro_date           52366 non-null datetime64[ns]
leg_id               52366 non-null object
leg_type             52366 non-null object
leg_url              52366 non-null object
num_of_amendments    52366 non-null int64
num_of_cosponsors    52366 non-null int64
sponsor              52366 non-null object
sponsor_district     33908 non-null object
sponsor_party        52366 non-null object
sponsor_state        52366 non-null object
bill_char_counts     52366 non-null int64
intro_month          52366 non-null int64
intro_year           52366 non-

In [4]:
# Preview the first five rows.
data.head()

Unnamed: 0,_id,bill_status,body,committee,congress_id,cosponsors,cosponsors_url,desc,intro_date,leg_id,...,sponsor,sponsor_district,sponsor_party,sponsor_state,bill_char_counts,intro_month,intro_year,session,char_count_bucket,labels
0,5c2d51551417de4b3aaa8d87,Became Law,[Congressional Bills 115th Congress] [From the...,House - Small Business | Senate - Small Busine...,115,,https://www.congress.gov/bill/115th-congress/h...,Small Business Runway Extension Act of 2018,2018-07-11,H R 6330,...,"Knight, Stephen",25,R,CA,981,7,2018,2,less than 1000,1
1,5c2d51bd1417de4b3aaa8dce,Became Law,[Congressional Bills 115th Congress] [From the...,House - Judiciary | Senate - Judiciary,115,,https://www.congress.gov/bill/115th-congress/h...,Protecting Access to the Courts for Taxpayers Act,2017-10-10,H R 3996,...,"Issa, Darrell E.",49,R,CA,990,10,2017,1,less than 1000,1
2,5c2d522d1417de4b3aaa8e3a,Became Law,[Congressional Bills 115th Congress] [From the...,House - Natural Resources | Senate - Indian Af...,115,,https://www.congress.gov/bill/115th-congress/h...,"To repeal the Act entitled ""An Act to confer j...",2017-02-15,H R 1074,...,"Blum, Rod",1,R,IA,927,2,2017,1,less than 1000,1
3,5c2d52361417de4b3aaa8e41,Became Law,[115th Congress Public Law 111] [From the U.S....,House - Natural Resources | Senate - Energy an...,115,,,To facilitate the addition of park administrat...,2017-02-03,H R 863,...,"Larson, John B.",1,D,CT,948,2,2017,1,less than 1000,1
4,5c2d8f671417de4ef25de42c,Became Law,[115th Congress Public Law 74] [From the U.S. ...,House - Financial Services,115,,https://www.congress.gov/bill/115th-congress/h...,Providing for congressional disapproval under ...,2017-07-20,H J Res 111,...,"Rothfus, Keith J.",12,R,PA,965,7,2017,1,less than 1000,1


In [5]:
# Number of labelled bills per congressional session.
data['session'].value_counts()

1    33862
2    18504
Name: session, dtype: int64

In [6]:
# Same per-session breakdown for the in-progress frame.
in_progress['session'].value_counts()

1    6849
2    4100
Name: session, dtype: int64

In [None]:
# Label balance for bills from the 115th Congress, session 2.
# NOTE(review): despite the name `last_month`, the filter selects a whole
# session rather than a calendar month.
in_115th_congress = data['congress_id'].eq('115')
in_second_session = data['session'].eq(2)
last_month = data.loc[in_115th_congress & in_second_session]

last_month['labels'].value_counts()

In [None]:
# Write the corpus and its labels to a JSONL file so the documents only have
# to be preprocessed once.
# NOTE(review): `X` and `y` are not defined in any visible cell above, so this
# cell fails on Restart & Run All. Presumably they come from `process_corpus`
# (imported but never called in this notebook) — confirm and restore that cell.
outfile_path = '../data/nlp/corpus_with_labels.jsonl'
# Reset the file before writing records.
# Presumably write_jsonl_file overwrites and write_json_file appends — verify in my_tools.
write_jsonl_file([''], outfile_path)
# One record per document; the label is stringified before it is written.
for i in range(len(X)):
    output = {'document': X[i], 'label': str(y[i])}
    write_json_file(output, outfile_path)

In [None]:
# Create a stratified train-test split, then fit TF-IDF on the training
# documents only so the test documents never influence the vocabulary or
# the IDF weights.
print('-------------------')
print('Doing train-test split...')
# Fixed seed: without it the split changes on every run, so the metrics,
# the feature-importance plot and the pickled artifacts are not reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=123
)

print('-------------------')
print('Vectorizing...')
# 1- to 5-grams, vocabulary capped at 1,000,000 terms.
tfvect = TfidfVectorizer(ngram_range=(1, 5), max_features=1000000)
X_train_vec = tfvect.fit_transform(X_train)
X_test_vec = tfvect.transform(X_test)

# Vocabulary: dict mapping each term to its column index in the TF-IDF matrix.
vocab = tfvect.vocabulary_

In [None]:
# Persist the fitted TfidfVectorizer so it can be reloaded without refitting.
print('Pickling the TfidfVectorizer...')
tfvect_pickle_path = 'pickle_files/tfidfVectorizer.pkl'
joblib.dump(tfvect, tfvect_pickle_path)

In [None]:
# Fit a random forest on the TF-IDF training matrix, using all CPU cores.
print('Training Random Forest Classifier with vectorized results...')
# Fixed seed so the fitted forest (and therefore the scores, the
# feature importances and the pickled model) is reproducible across runs.
rf = RandomForestClassifier(n_jobs=-1, random_state=123)
rf.fit(X_train_vec, y_train)

In [None]:
# Persist the trained Random Forest Classifier alongside the vectorizer.
print('Pickling the Random Forest Classifier...')
rf_pickle_path = 'pickle_files/nlp_randomForest.pkl'
joblib.dump(rf, rf_pickle_path)

In [None]:
# Hard class predictions and per-class probabilities for the held-out test matrix.
rf_y_pred = rf.predict(X_test_vec)
rf_y_pred_proba = rf.predict_proba(X_test_vec)

In [None]:
# Report recall, precision and accuracy of the test-set predictions.
metric_report = (
    ('Recall Score:\t\t{:.4f}', recall_score),
    ('Precision Score:\t{:.4f}', precision_score),
    ('Accuracy Score:\t\t{:.4f}', accuracy_score),
)
for line_template, metric_fn in metric_report:
    print(line_template.format(metric_fn(y_test, rf_y_pred)))

In [None]:
# rf_y_pred_proba[0]

In [None]:
# Plot the 50 most important features of the fitted forest.
#
# `vocab` maps term -> column index. Passing the dict itself as the index
# labels the importances in dict (insertion) order, which is not the column
# order of `rf.feature_importances_`, so scores end up next to the wrong
# terms. Sorting the terms by their column index restores the alignment.
feature_names = sorted(vocab, key=vocab.get)
feat_scores = pd.Series(rf.feature_importances_, index=feature_names)

# Ascending sort + tail keeps the top 50, with the largest bar drawn at the
# top of the horizontal bar chart.
feat_scores = feat_scores.sort_values().tail(50)
ax = feat_scores.plot(kind='barh',
                      figsize=(10, 8),
                      color='b')
ax.set_title('Average Gini Importance (Top 50 features)')
ax.set_xlabel('Average contribution to information gain');

In [None]:
# The single-split recall looked abnormally high, so repeat the whole
# split -> vectorize -> fit -> score pipeline on several fresh random splits
# and average the recall to see whether the result holds up.
# NOTE: this cell rebinds X_train/X_test/y_train/y_test, tfvect, rf and
# rf_y_pred, so afterwards they no longer refer to the objects pickled above.
recall_scores = []

for i in range(1, 6):
    print('-------------')
    print('Performing train-test split and vectorizing for iteration {}...'.format(i))
    # Deliberately unseeded: every iteration should see a different slice.
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

    # Same vectorizer settings as the main pipeline so the check is comparable.
    tfvect = TfidfVectorizer(ngram_range=(1, 5), max_features=1000000)
    X_train_vec = tfvect.fit_transform(X_train)
    X_test_vec = tfvect.transform(X_test)

    print('\tFitting data to Random Forest Classifier...')
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' no longer exists in
    # newer scikit-learn releases.
    rf = RandomForestClassifier(max_features='sqrt', n_jobs=-1)
    # Fit and score on the TF-IDF matrices; the raw text cannot be consumed
    # by the forest.
    rf.fit(X_train_vec, y_train)

    rf_y_pred = rf.predict(X_test_vec)
    recall_scores.append(recall_score(y_test, rf_y_pred))

# Report the number of scores actually collected, not the loop counter.
print('Mean recall score after {} iterations: {}'.format(len(recall_scores), np.mean(recall_scores)))