In [1]:
# %%writefile prediction_pipeline.py
from pymongo import MongoClient
from my_tools import get_bill_data, read_jsonl_file, process_corpus
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
from sklearn.externals import joblib

import matplotlib.pyplot as plt

# initialize Mongo client
# MongoClient() is called with no arguments, so pymongo's default connection
# settings are used.
client = MongoClient()
# Handles to the `bills` database and its `predictions` collection.
# NOTE(review): `predictions` is not used anywhere else in the visible cells.
db = client.bills
predictions = db.predictions

print('---------------')
print('Loading new data...')
# get_bill_data() returns two objects. Judging by the preview below, the
# second is a DataFrame of bills whose `labels` value is 'in_progress'.
# NOTE(review): `data` is presumably the labelled (already decided) bills --
# confirm in my_tools.get_bill_data.
data, in_progress = get_bill_data()

# Preview the first three in-progress bills.
in_progress.head(3)

------------------
------------------
Data includes bills, joints resolutions, and laws with text from the 110th Congress (2007) to present
Make changes in my_tools.get_bill_data to modify the data set.
------------------


Unnamed: 0,_id,bill_status,body,committee,congress_id,cosponsors,cosponsors_url,desc,intro_date,leg_id,...,sponsor,sponsor_district,sponsor_party,sponsor_state,bill_char_counts,intro_month,intro_year,session,char_count_bucket,labels
0,5c2fb9b81417de116c271892,Introduced,[Congressional Bills 116th Congress] [From the...,House - Small Business,116,,https://www.congress.gov/bill/116th-congress/h...,To clarify the primary functions and duties of...,2019-01-03,H R 128,...,"Comer, James",1,R,KY,1913,1,2019,1,1001 - 2000,in_progress
1,5c2fb9ca1417de116c271910,Introduced,[Congressional Bills 116th Congress] [From the...,House - Judiciary,116,,https://www.congress.gov/bill/116th-congress/h...,Proposing an amendment to the Constitution of ...,2019-01-03,H J Res 4,...,"Arrington, Jodey C.",19,R,TX,2144,1,2019,1,2001 - 3000,in_progress
2,5c2fb9ca1417de116c271913,Passed House,[Congressional Bills 116th Congress] [From the...,House - Appropriations,116,,,Making further continuing appropriations for t...,2019-01-03,H J Res 1,...,"Lowey, Nita M.",17,D,NY,4832,1,2019,1,4001 - 5000,in_progress


In [2]:
print('---------------')
print('Loading pickled vectorizer and classifier...')
# Both artifacts are deserialized with joblib, which unpickles the file and
# can therefore execute arbitrary code on load -- only load files from a
# trusted source.
# NOTE(review): the file names suggest a fitted TF-IDF vectorizer and a
# gradient-boosting classifier; confirm against the notebook that wrote them.
vectorizer = joblib.load('pickle_files/tfidfVectorizer.pkl')
classifier = joblib.load('pickle_files/nlp_gradientBoost.pkl')

---------------
Loading pickled vectorizer and classifier...


  from numpy.core.umath_tests import inner1d


In [3]:
# Step 1: run the my_tools preprocessing over the `body` column of the
# in-progress bills.
print('---------------', 'Preprocessing bill text...', sep='\n')
corpus = process_corpus(in_progress, 'body')

# Step 2: turn the preprocessed documents into features with the
# already-loaded vectorizer (transform only, no fitting).
print('---------------', 'Vectorizing bill text...', sep='\n')
corpus_vec = vectorizer.transform(corpus)

# Step 3: hard class predictions and per-class probabilities for every bill.
print('---------------', 'Calculating predicted probabilities...', sep='\n')
y_pred = classifier.predict(corpus_vec)
y_pred_proba = classifier.predict_proba(corpus_vec)

---------------
Preprocessing bill text...
------------------
Step 1 of 4: Creating corpus...
------------------
Step 2 of 4: Tokenizing...
------------------
Step 3 of 4: Stripping out stop words, punctuation, and numbers...
------------------
Step 4 of 4: Lemmatizing...
------------------
NLP preprocessing complete ...
---------------
Vectorizing bill text...
---------------
Calculating predicted probabilities...


In [4]:
# Inspect the predict_proba output: one row per in-progress bill, one column
# per class.
# NOTE(review): every row shown in the saved output is (almost) identical,
# which suggests the features are not separating these bills -- worth checking
# that preprocessing and the vectorizer vocabulary match what the model was
# trained with.
y_pred_proba

array([[9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.99862783e-01, 1.37217337e-04],
       [9.997809

In [15]:
# Spot-check one preprocessed document (index 6) to see the text that is fed
# to the vectorizer.
corpus[6]

"h.j re doc congress session j. re proposing balanced budget amendment constitution united state representative january byrne submitted following joint resolution referred committee judiciary joint resolution proposing balanced budget amendment constitution united state resolved representative united state america congress assembled two-thirds concurring therein following article proposed amendment constitution united state valid intent purpose part constitution ratified legislature three-fourths several state within seven year date submission ratification `` article `` section total outlay fiscal exceed total receipt fiscal unless three-fifths whole number congress provide law specific excess outlay receipt rollcall vote `` section total outlay fiscal exceed one-fifth economic output united state unless two-thirds congress provide specific increase outlay amount `` section limit debt united state held public increased unless three-fifths whole number provide law increase rollcall vote

In [None]:
# Stratified train/test split performed separately for each Congress and then
# stitched back together, so every Congress contributes to both sets with its
# own label balance preserved.
# TODO: wrap this in train_test_split_by_feature(df, split_on, stratify).

# Fixed seed so that Restart & Run All reproduces the same split.
SPLIT_SEED = 42

# One sub-frame per Congress; congress_id is compared as a string.
# NOTE(review): range(110, 115) stops at the 114th Congress -- confirm that
# leaving out the 115th is intentional.
df_list = []

for c in range(110, 115):
    cong_id = str(c)
    sub_df = data[data['congress_id'] == cong_id].copy()
    df_list.append(sub_df)

# Collect the per-Congress pieces in lists and concatenate once after the
# loop, instead of growing a DataFrame with pd.concat on every iteration.
X_train_parts = []
X_test_parts = []
y_train_parts = []
y_test_parts = []

for df in df_list:
    X = df['body']
    y = df['labels']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=SPLIT_SEED
    )

    X_train_parts.append(X_train)
    X_test_parts.append(X_test)
    y_train_parts.append(y_train)
    y_test_parts.append(y_test)

# Single-column frames that keep the original row index. The column is named
# 0 so that cells indexing `y_train_df[0]` / `y_test_df[0]` keep working.
X_train_df = pd.concat(X_train_parts).to_frame(0)
X_test_df = pd.concat(X_test_parts).to_frame(0)
y_train_df = pd.concat(y_train_parts).to_frame(0)
y_test_df = pd.concat(y_test_parts).to_frame(0)

# No drop_duplicates here: de-duplicating by VALUE would collapse the label
# frames to one row per distinct label and would remove text rows without
# removing their labels. The per-Congress slices are disjoint, so there is
# nothing to de-duplicate by row; just verify texts and labels stay aligned.
assert X_train_df.index.equals(y_train_df.index), 'train texts/labels misaligned'
assert X_test_df.index.equals(y_test_df.index), 'test texts/labels misaligned'

In [None]:
# Shape of the combined training frame built by the per-Congress split.
X_train_df.shape

In [None]:
# Shape of the combined test frame.
X_test_df.shape

In [None]:
# Total number of rows across the combined train and test frames.
X_train_df.shape[0] + X_test_df.shape[0]

In [None]:
# Label distribution in the combined training labels (stored in column 0).
y_train_df[0].value_counts()

In [None]:
# Label distribution in the combined test labels (stored in column 0).
y_test_df[0].value_counts()

In [None]:
# Label distribution of the full `data` frame, for comparison with the above.
data['labels'].value_counts()

In [None]:
# NOTE: X_train / X_test are the variables left over from the LAST iteration
# of the split loop, so this counts the rows of a single Congress only.
X_train.shape[0] + X_test.shape[0]

In [None]:
# Same caveat: y_train / y_test come from the last loop iteration only.
y_train.shape[0] + y_test.shape[0]

In [None]:
# Row count of the full `data` frame.
data.shape[0]

In [None]:
# NOTE(review): this repeats the process_corpus(in_progress, 'body') call whose
# result is already stored in `corpus`, and it rebinds `X`, which the split
# loop used for a single Congress's bill bodies.
X = process_corpus(in_progress, 'body')

In [None]:
# Spot-check the first preprocessed document.
X[0]

In [None]:
# Distinct bill_status values present among the in-progress bills.
in_progress.bill_status.unique()

In [None]:
# Number of preprocessed documents.
len(X)

In [None]:
print('---------------')
# NOTE(review): the message below says "fitting", but nothing is fitted in
# this cell -- the loaded vectorizer only transforms and the loaded classifier
# only predicts.
print('Vectorizing corpus and fitting to classifier...')
X_vec = vectorizer.transform(X)

# Overwrites y_pred / y_pred_proba from the earlier prediction cell.
y_pred = classifier.predict(X_vec)
y_pred_proba = classifier.predict_proba(X_vec)


In [None]:
# Second column of the predict_proba output.
# NOTE(review): presumably the probability of the positive class -- confirm
# the column order with classifier.classes_.
y_pred_proba[:, 1]

In [None]:
# Adds the column to `in_progress` in place; relies on y_pred_proba having
# been computed from `in_progress` rows in the same order.
in_progress['pred_proba'] = y_pred_proba[:, 1]

In [None]:
# Preview the frame with the new pred_proba column.
in_progress.head()

In [None]:
# Distinct predicted probabilities -- shows how many different scores the
# model actually produced.
in_progress.pred_proba.unique()

In [None]:
# Legislation id of the bill to look up in the next two cells.
leg_id = 'H R 5759'

In [None]:
# Rows of `data` for that bill in Congress '115' (congress_id compared as a
# string).
data[(data['congress_id'] == '115') & (data['leg_id'] == leg_id)]

In [None]:
# Rows of `in_progress` with the same legislation id (any Congress).
in_progress[in_progress['leg_id'] == leg_id]

In [None]:
def plot_scores(x, y_list, name, labels):
    """Plot several score series against a shared x axis on one figure.

    Parameters
    ----------
    x : sequence
        x coordinates shared by every series.
    y_list : sequence of sequences
        One series per line to draw; each is plotted against ``x``.
    name : str
        Figure title.
    labels : sequence of str
        Legend entries, in the same order as ``y_list``.

    Returns
    -------
    None
        The figure is displayed as a side effect.
    """
    fig = plt.figure(figsize=(16, 8))
    ax = fig.add_subplot(111)
    ax.set_title(name, fontdict={'fontsize': 20})
    # Draw on the axes created above rather than on pyplot's implicit
    # "current axes".
    for y in y_list:
        ax.plot(x, y)
    ax.legend(labels)

    # The original referenced `plt.show` without calling it, which is a no-op.
    plt.show()

In [None]:
# Score the pickled vectorizer + classifier on 20 random train/test splits of
# (X, y), recording accuracy, recall and precision on each test split. The
# splits are deliberately unseeded so every iteration sees a different one.
# NOTE(review): `X` was last assigned from process_corpus(in_progress, 'body')
# while `y` is the labels Series left over from the per-Congress split loop.
# They come from different frames; train_test_split needs X and y to describe
# the same rows, so confirm the intended inputs before running this cell.
a_list = []
r_list = []
p_list = []

for i in range(1, 21):
    print('***************')
    print('Iteration {}'.format(i))

    print('---------------')
    print('Vectorizing corpus and fitting to classifier...')
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Only the test portion is used: the model is already trained, so it is
    # just transformed and scored.
    X_test_vec = vectorizer.transform(X_test)

    y_pred = classifier.predict(X_test_vec)


    r_score = recall_score(y_test, y_pred)
    p_score = precision_score(y_test, y_pred)
    a_score = accuracy_score(y_test, y_pred)

    print('Recall Score:\t\t{:.4f}'.format(r_score))
    print('Precision Score:\t{:.4f}'.format(p_score))
    print('Accuracy Score:\t\t{:.4f}'.format(a_score))

    print('---------------')
    print('Appending scores.')
    r_list.append(r_score)
    p_list.append(p_score)
    a_list.append(a_score)



In [None]:
# show graph of scores vs. iteration number
# One x value per recorded iteration. Deriving the range from the score list
# keeps x the same length as the series (a hard-coded 1..10 does not match the
# 20 scores collected by the loop and makes matplotlib raise).
x = np.arange(1, len(a_list) + 1)
plot_list = [a_list, r_list, p_list]
graph_name = 'Recall, Precision, and Accuracy Scores over Multiple Iteration of Pickled Model'
labels = ['accuracy score', 'recall score', 'precision score']

plot_scores(x, plot_list, graph_name, labels)