In [None]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
from my_tools import get_bill_data, process_corpus
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB#, ComplementNB unreleased as of 12/14
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_score, accuracy_score, confusion_matrix


In [None]:
# Connect with pymongo's default client settings and select the
# `bill_info` collection of the `bills` database.
client = MongoClient()
db = client['bills']
bill_info = db['bill_info']

# Regex filter that matches string values containing at least one character.
non_empty = {'$regex': '(.+)'}

# Monitoring progress of data into Mongo: each count is queried and reported
# in turn so partial output is still useful if a later query is slow.
total_records = bill_info.count_documents({})
print('Number of documents in database:\t{}'.format(total_records))

records_with_text = bill_info.count_documents({'body': non_empty})
print('Documents with bill text:\t\t{}'.format(records_with_text))

# A `None` filter value matches documents where the field is null or absent.
records_wo_text = bill_info.count_documents({'body': None})
print('Documents without bill text:\t\t{}'.format(records_wo_text))

records_with_amend_count = bill_info.count_documents({'num_of_amendments': non_empty})
print('Documents with amend count:\t\t{}'.format(records_with_amend_count))

records_wo_amend_count = bill_info.count_documents({'num_of_amendments': None})
print('Documents without amend count:\t\t{}'.format(records_wo_amend_count))


In [None]:
# Break the amendment-count coverage down by congress (ids 110 through 115,
# stored as strings in the `congress_id` field).
print('Amendment Counts')
print('cong_id\tWith\tWithout')

row_template = '{}: \t{} \t{}'
for cong_id in map(str, range(110, 116)):
    # Both queries share the congress filter; only the amendment clause differs.
    by_congress = {'congress_id': cong_id}
    with_amend = bill_info.count_documents(
        dict(by_congress, num_of_amendments={'$regex': '(.+)'})
    )
    wo_amend = bill_info.count_documents(
        dict(by_congress, num_of_amendments=None)
    )
    print(row_template.format(cong_id, with_amend, wo_amend))
    

In [None]:
# Load the bill records through the project helper imported from `my_tools`.
# NOTE(review): presumably returns a pandas DataFrame, given the .head()/.info()
# calls in the following cells -- confirm against my_tools.
data = get_bill_data()



In [None]:
# Quick look at the first rows of the loaded data to sanity-check the columns.
data.head()

In [None]:
# Summarise the loaded data (columns, non-null counts, dtypes) to spot missing values.
data.info()

In [None]:
# Turn the 'body' column into model inputs with the `my_tools` helper.
# X is later fed to the TF-IDF vectorizer and y is used as the class labels
# (it is also the stratification target for the train/test split).
# NOTE(review): the exact cleaning done by process_corpus is not visible here.
X, y = process_corpus(data, 'body')

In [None]:
print('-------------------')
print('Doing train-test split...')
# Stratified split: class proportions in y are preserved in both partitions,
# and the fixed seed makes the partition repeatable between runs.
split_parts = train_test_split(
    X,
    y,
    stratify=y,
    random_state=123,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
print('-------------------')
print('Training...')
# TF-IDF features over word n-grams of length 1 through 4.
ngram_bounds = (1, 4)
tfvect = TfidfVectorizer(ngram_range=ngram_bounds)

# Learn the vocabulary/IDF weights from the training documents only, then
# project the held-out documents into that same feature space.
X_train_vec = tfvect.fit_transform(X_train)
X_test_vec = tfvect.transform(X_test)

In [None]:
# Fitted vocabulary: per the scikit-learn docs this is a dict mapping each
# term (n-gram) to its column index in the vectorized matrices. The dict's
# iteration order is not guaranteed to follow the column order.
vocab = tfvect.vocabulary_

In [None]:
# Echoing the whole term -> column-index dict would flood the notebook: with
# 1- to 4-gram features the vocabulary can be very large. Report its size and
# show only the first few entries instead.
print('Vocabulary size: {}'.format(len(vocab)))
# zip() against a short range stops after 10 items without copying the dict.
[entry for entry, _ in zip(vocab.items(), range(10))]

In [None]:
# Shape of the training matrix: (number of training documents, number of n-gram features).
X_train_vec.shape

In [None]:
# Random forest on the TF-IDF features, using all available cores.
# The forest is seeded (same seed as the train/test split) so that the scores,
# confusion matrix and feature importances below are reproducible on a
# Restart & Run All; without a seed every run yields a different model.
rf = RandomForestClassifier(n_jobs=-1, random_state=123)
rf.fit(X_train_vec, y_train)

In [None]:
# Predict class labels for the held-out documents with the fitted forest.
rf_y_pred = rf.predict(X_test_vec)

In [None]:
# Held-out scores, printed one per line in the order: recall, precision, accuracy.
for score_fn in (recall_score, precision_score, accuracy_score):
    print(score_fn(y_test, rf_y_pred))

In [None]:
# Confusion matrix on the held-out set; per the scikit-learn convention rows
# are the true classes and columns are the predicted classes.
confusion_matrix(y_test, rf_y_pred)

In [None]:
# Plot the feature importance
# `vocab` is a dict of term -> column index, and iterating it does not yield
# terms in column order. Passing the dict itself as the Series index would
# therefore pair each importance with the wrong n-gram. Order the terms by
# their column index so that label i belongs to rf.feature_importances_[i].
feature_names = sorted(vocab, key=vocab.get)
feat_scores = pd.Series(rf.feature_importances_,
                        index=feature_names)
# Keep the 20 largest importances, in ascending order so the most important
# feature is drawn as the top bar of the horizontal chart.
feat_scores = feat_scores.sort_values().tail(20)
ax = feat_scores.plot(kind='barh',
                      figsize=(10,8),
                      color='b')
ax.set_title('Average Gini Importance (Top 20 features)')
ax.set_xlabel('Average contribution to information gain')