In [6]:
%%writefile make_predictions.py
'''
----------------------------------------------
This script loads the data and pickled models to make predictions on the bills that are still
in progress. The bill text is vectorized with sklearn's TfidfVectorizer and scored by a Random
Forest; the numerical features are scaled and scored by a Gradient Boosting classifier; the two
probabilities are then blended using nlp_weight. The results are loaded into the Mongo
collection predictions for use in the Flask app.

----------------------------------------------
'''
import numpy as np
import pandas as pd
from pymongo import MongoClient
from my_tools import get_bill_data, process_corpus
from sklearn.externals import joblib

from datetime import datetime, date
import matplotlib.pyplot as plt

# Share of the blended probability that comes from the NLP (bill text) model;
# the numerical model contributes the remaining 1 - nlp_weight.
nlp_weight = 0.5

# Open a Mongo client with its default settings and grab the collections
# this script reads from and writes to.
client = MongoClient()
db = client['bills']
predictions = db['predictions']
prev_predictions = db['prev_predictions']

banners = ('---------------', '---------------', 'Loading new data to make predictions...')
print(*banners, sep='\n')
# Only in_progress is used further down in this script; data is unpacked
# because get_bill_data returns both.
data, in_progress = get_bill_data()


# Load the four pickled artifacts: the NLP vectorizer and classifier, and the
# numerical model's scaler and classifier.
# NOTE(review): joblib.load unpickles arbitrary objects -- only point it at trusted files.
# NOTE(review): this file imports joblib via sklearn.externals, which newer scikit-learn
# releases no longer ship -- confirm the pinned version or switch to the standalone package.
def _load_pickled(label, path):
    '''Print a separator and the progress label, then return the object unpickled from path.'''
    print('---------------')
    print(label)
    return joblib.load(path)


print('---------------')
print('Loading pickled models...')
vectorizer = _load_pickled('\t... Vectorizer for NLP...', 'pickle_files/tfidfVectorizer.pkl')
rf = _load_pickled('\t... Classifier for NLP...', 'pickle_files/nlp_randomForest.pkl')
sc = _load_pickled('\t... Scaler for numerical model...', 'pickle_files/num_scaler.pkl')
gb = _load_pickled('\t... Classifier for numerical model...', 'pickle_files/num_gradientBoost.pkl')
print('Pickled models loaded.')



# NLP stage 1: preprocess the text of the bills that are still in progress.
print('---------------\nPreprocessing bill text...')
corpus = process_corpus(in_progress, 'bill_text')

# NLP stage 2: vectorize the processed text with the pickled vectorizer.
print('---------------\nVectorizing bill text...')
corpus_vec = vectorizer.transform(corpus)

# NLP stage 3: score the vectors and keep column 1 of predict_proba
# (presumably the positive class -- confirm against rf.classes_).
print('---------------\nCalculating predicted probabilities for nlp portion of model...')
nlp_class_probas = rf.predict_proba(corpus_vec)
nlp_pred_proba = nlp_class_probas[:, 1]

# Store the NLP probability alongside each bill.
in_progress['nlp_pred_proba'] = nlp_pred_proba


print('------------------\nFitting numerical data...')
# The numerical model was trained on bills that moved past the introduction stage,
# so the frame is split on bill_status here and the two pieces are concatenated
# again once predictions have been made.
bill_status = in_progress['bill_status']
intro = in_progress[bill_status == 'Introduced']
beyond_intro = in_progress[bill_status != 'Introduced']

# The data handed to the numerical model must carry the same features it was trained on:
# three numeric columns, the intro_month dummies for months 1-11, and session_1.
model_cols = (
    ['num_of_cosponsors', 'num_of_amendments', 'bill_char_counts']
    + ['intro_month_{}'.format(month) for month in range(1, 12)]
    + ['session_1']
)

# Raw columns pulled from the bill data before encoding
# (sponsor, sponsor_party and sponsor_state are currently left out).
cols_to_use = [
    'num_of_cosponsors',
    'num_of_amendments',
    'bill_char_counts',
    'intro_month',
    'session',
]

# Categorical columns that get expanded into dummy columns.
dummy_columns = ['intro_month', 'session']

# Select the raw feature columns for the bills that are past introduction.
data_feats = beyond_intro.loc[:, cols_to_use]

# One-hot encode the columns listed in dummy_columns (intro_month and session).
data_dumm = pd.get_dummies(data_feats, columns=dummy_columns, drop_first=False)

# Align the frame to the model's feature layout in one step. reindex
#   * adds any dummy column missing from this batch, filled with 0,
#   * drops dummy columns that are not in model_cols (e.g. intro_month_12 or session_2,
#     which would otherwise change the feature count passed to the scaler), and
#   * puts the columns in model_cols order, so a column appended late cannot end up
#     in the wrong position.
# NOTE(review): assumes model_cols lists the features in the order used at training
# time -- confirm against the training script.
data_dumm = data_dumm.reindex(columns=model_cols, fill_value=0)



print('-------------------')
print('Scaling and getting predictions...')
# Scale the numerical features and keep column 1 of predict_proba
# (presumably the positive class -- confirm against gb.classes_).
# When no bill has moved beyond introduction there is nothing to score; the transform
# and predict calls are skipped because scikit-learn's input validation typically
# rejects zero-row input.
if len(data_dumm) > 0:
    data_dumm = sc.transform(data_dumm)
    gb_pred_proba = gb.predict_proba(data_dumm)[:, 1]
else:
    gb_pred_proba = np.array([], dtype=float)

# intro and beyond_intro are boolean-mask selections of in_progress. .assign returns
# new frames, so the new column is not written onto a slice (the pattern that raises
# pandas' SettingWithCopyWarning).
beyond_intro = beyond_intro.assign(num_pred_proba=gb_pred_proba)
# Bills still at the introduction stage are not scored by the numerical model;
# they receive a fixed probability of .05.
intro = intro.assign(num_pred_proba=.05)

pred_df = pd.concat([intro, beyond_intro], axis=0)

# Blend the two model probabilities using the configured NLP weight.
pred_df['pred_proba'] = nlp_weight * pred_df['nlp_pred_proba'] + (1 - nlp_weight) * pred_df['num_pred_proba']

# Stamp every row with the date this prediction run was made.
pred_df['pred_date'] = date.today().strftime('%m/%d/%Y')

print('---------------\nFormatting and inserting the predicted probabilities into Mongo...')


# Format columns for the Flask app.
def _as_mdy(stamp):
    '''Render a date-like value as MM/DD/YYYY.'''
    return stamp.strftime('%m/%d/%Y')


pred_df['intro_date'] = pred_df['intro_date'].apply(_as_mdy)
for proba_col in ('nlp_pred_proba', 'num_pred_proba', 'pred_proba'):
    pred_df[proba_col] = pred_df[proba_col].round(5)

# Drop the _id carried over from the source documents; the target collection
# assigns a new one on insert.
pred_df = pred_df.drop(columns=['_id'])


# Archive the predictions currently being served by copying them into prev_predictions.
# Nothing in this script clears prev_predictions, so it accumulates across runs.
# One insert_many replaces a round trip per document; it is skipped when there is
# nothing to archive because insert_many raises on an empty document list.
priors = list(predictions.find())
if priors:
    prev_predictions.insert_many(priors)

# Replace the old predictions with the new ones for the Flask app.
# The collection is dropped either way; the insert is guarded so that a run with no
# in-progress bills finishes cleanly instead of failing after the drop.
predictions.drop()
new_records = pred_df.to_dict('records')
if new_records:
    predictions.insert_many(new_records)

print('---------------')
print('Loaded {} predictions. Script complete. DATA SCIENCE!!!'.format(len(pred_df)))

Overwriting make_predictions.py


In [4]:
# exploration: open handles on the bills database collections for ad-hoc queries
from pymongo import MongoClient

client = MongoClient()
db = client['bills']
predictions = db['predictions']
prev_predictions = db['prev_predictions']
bill_info = db['bill_info']

# cursor over the predictions currently stored
priors = predictions.find()

In [3]:
# Look up a single bill by id and congress; bill_info comes from the exploration cell,
# which has to be run first.
bill_filter = {'leg_id': 'H R 423', 'congress_id': '116'}
bill_info.find_one(bill_filter)

{'_id': ObjectId('5c379e801417de0a6dd6af53'),
 'leg_id': 'H R 423',
 'leg_type': 'BILL',
 'leg_url': 'https://www.congress.gov/bill/116th-congress/house-bill/423?s=1&r=1',
 'intro_date': '01/09/2019',
 'congress_id': '116',
 'desc': 'For the relief of Robert Feiss.',
 'sponsor': 'Brownley, Julia',
 'sponsor_party': 'D',
 'sponsor_state': 'CA',
 'sponsor_district': '26',
 'num_of_cosponsors': '0',
 'cosponsors_url': None,
 'cosponsors': None,
 'num_of_amendments': '0',
 'committee': 'House - Judiciary',
 'bill_status': 'Introduced',
 'body': "[Congressional Bills 116th Congress] [From the U.S. Government Publishing Office] [H.R. 423 Introduced in House (IH)] <DOC> 116th CONGRESS 1st Session H. R. 423 For the relief of Robert Feiss. _______________________________________________________________________ IN THE HOUSE OF REPRESENTATIVES January 9, 2019 Ms. Brownley of California introduced the following bill; which was referred to the Committee on the Judiciary ____________________________

In [29]:
# Peek at one archived prediction document; prev_predictions comes from the exploration cell.
prev_predictions.find_one()

{'_id': ObjectId('5c377f011417de079ba6b36e'),
 'bill_status': 'Introduced',
 'bill_text': "Proposing a balanced budget amendment to the Constitution of the United States. Resolved by the Senate and House of Representatives of the United States of America in Congress assembled (two-thirds of each House concurring therein), That the following article is proposed as an amendment to the Constitution of the United States, which shall be valid to all intents and purposes as part of the Constitution when ratified by the legislatures of three-fourths of the several States within seven years after the date of its submission for ratification: ``Article-- ``Section 1. Total outlays for any fiscal year shall not exceed total receipts for that fiscal year, unless three-fifths of the whole number of each House of Congress shall provide by law for a specific excess of outlays over receipts by a rollcall vote. ``Section 2. The limit on the debt of the United States held by the public shall not be incr