In [1]:
# %%writefile random_forest_model.py
import numpy as np
import pandas as pd
from pymongo import MongoClient
import pprint
import string
import re
import datetime
import copy

from my_tools import get_bill_data

import statsmodels.api as sm
import scipy.stats as scs
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import normalize
from sklearn.metrics import recall_score, precision_score, accuracy_score, confusion_matrix


from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB#, ComplementNB unreleased as of 12/14

import matplotlib.pyplot as plt
plt.style.use('ggplot')

  from numpy.core.umath_tests import inner1d


In [2]:
# Load the bill data set through the project helper my_tools.get_bill_data().
# The result is indexed DataFrame-style further down (data['labels'],
# data.loc[...]); presumably a pandas DataFrame -- confirm in my_tools.
data = get_bill_data()

------------------
------------------
Data is from the 110th Congress (2007) to present
Alter masking in my_tools.get_bill_data to get a different data set.
------------------


In [None]:
# Overlay two histograms of introduction dates: every bill, and only the rows
# whose label equals 1 (kept in passed_df and drawn in green).
passed_df = data.loc[data['labels'] == 1]

fig, ax = plt.subplots(figsize=(30, 12))
ax.set_title(
    'Number of Bills Introduced (red) and Passed (green) vs. Time',
    fontdict={'fontsize': 24},
)

# No explicit colour for the full data set: the "red" in the title presumably
# comes from the ggplot style's default colour cycle -- verify if the style changes.
# NOTE(review): the two series use different bin counts (500 vs. 1000), so bar
# heights are not directly comparable between them.
ax.hist(data['intro_date'], bins=500, alpha=.5)
ax.hist(passed_df['intro_date'], bins=1000, color='g')

# Cap the y-axis at 400.
ax.set_ylim(0, 400)
plt.show()

In [None]:
# Feature selection history:
#   * First Random Forest iteration: congress_id (115th) ranked as a top feature
#     by average Gini importance. Users will always be asking about the most
#     recent Congress, so congress_id is left out.
#   * Second iteration: the original note says num_of_cosponsors and
#     bill_char_counts were removed. bill_char_counts is indeed excluded (the
#     bucketed char_count_bucket is used instead).
# NOTE(review): num_of_cosponsors is still selected below, which contradicts
# the note above -- confirm which one is intended.
feature_columns = [
    'num_of_cosponsors',
    'sponsor_party',
    'sponsor_state',
    'char_count_bucket',
    'intro_month',
    'session',
    'labels',
]

data_features = data.loc[:, feature_columns]

In [None]:
# One-hot encode the categorical columns. drop_first=True drops the first level
# of each encoded column. congress_id is not encoded because it is not part of
# data_features.
categorical_columns = [
    'intro_month',
    'sponsor_party',
    'sponsor_state',
    'session',
    'char_count_bucket',
]

data_dumm = pd.get_dummies(
    data_features,
    columns=categorical_columns,
    drop_first=True,
)

In [None]:
# Preview the first rows of the dummy-encoded frame to eyeball the new columns.
data_dumm.head()

In [None]:
def plot_it(x, y_list, name, labels):
    """Draw one or more series against a shared x-axis on a single figure.

    Parameters
    ----------
    x : sequence
        X coordinates shared by every series.
    y_list : iterable of sequences
        One y-series per line to draw; each is plotted against ``x``.
    name : str
        Title placed on the axes.
    labels : sequence of str
        Legend entries, in the same order as ``y_list``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    fig = plt.figure(figsize=(16, 8))
    ax = fig.add_subplot(111)
    ax.set_title(name, fontdict={'fontsize': 20})

    # Draw on the axes created above rather than on pyplot's implicit
    # "current axes".
    for y in y_list:
        ax.plot(x, y)
    ax.legend(labels)

    # The previous version referenced `plt.show` without calling it, so the
    # figure was only displayed when the notebook backend rendered it
    # implicitly at the end of the cell.
    plt.show()

In [None]:
# Split the target off the feature frame and cast it to an integer array.
# NOTE: pop() removes 'labels' from data_dumm in place, so this cell can only
# run once per data_dumm build; re-running it raises KeyError until the
# get_dummies cell is executed again.
target_column = data_dumm.pop('labels')
y = target_column.values.astype(int)

In [None]:
# Hold out a test split. Stratifying on y keeps the class ratio the same in
# both splits, and the fixed seed makes the split reproducible. No test_size is
# given, so train_test_split's default applies.
X_train, X_test, y_train, y_test = train_test_split(
    data_dumm,
    y,
    stratify=y,
    random_state=123,
)

In [None]:
# Sweep n_estimators over a narrow range and record the recall of each forest
# so the curve can be plotted in the next cell.
# NOTE(review): candidates are scored on X_test, the same split used for the
# final report further down, so those final metrics are optimistically biased.
# Consider scoring on a validation split or cross-validating on X_train.
n_list = range(600, 640, 2)
r_list = []

print('----------------')
print('Iterating through several n_estimators for a Random Forest model...')

for n in n_list:
    print('--> n = {}'.format(n))
    # 'sqrt' is what max_features='auto' meant for classifiers; 'auto' was
    # deprecated in scikit-learn 1.1 and removed in 1.3, so spell it out.
    rf = RandomForestClassifier(n_estimators=n,
                                max_features='sqrt',
                                random_state=123,
                                n_jobs=-1)
    rf.fit(X_train, y_train)

    rf_y_pred = rf.predict(X_test)
    r_list.append(recall_score(y_test, rf_y_pred))


In [None]:
# Show recall score vs. n_estimators for the sweep above.
# The x-axis is derived from n_list (the values actually iterated) instead of
# re-typing the range, so it cannot drift out of sync with r_list.
x = np.array(n_list)
plot_list = [r_list]
graph_name = 'Random Forest n_estimators vs. Recall Score'
labels = ['recall score']

plot_it(x, plot_list, graph_name, labels)

In [None]:
# np.argmax returns a position within r_list, not a tree count. Map it back
# through n_list so the cell displays the n_estimators value with the highest
# recall (the first one, if several tie).
best_position = int(np.argmax(r_list))
n_list[best_position]

In [None]:
# Final random forest classifier.
# n_estimators is hard-coded to 610 -- presumably the winner of the sweep
# above; re-check it whenever the sweep is re-run on different data.
# max_features='sqrt' is the explicit form of the former 'auto' setting for
# classifiers ('auto' was deprecated in scikit-learn 1.1 and removed in 1.3).
rf = RandomForestClassifier(n_estimators=610,
                            max_features='sqrt',
                            random_state=123,
                            n_jobs=-1)
rf.fit(X_train, y_train)

In [None]:
# Score the fitted forest on the held-out split and print a short report.
rf_y_pred = rf.predict(X_test)

divider = '----------------------'
print(divider)
print(divider)
print('Random Forest Classification')
print(divider)

# Each entry pairs a pre-aligned output template with the metric that fills it.
report_rows = (
    ('Accuracy score:  {:.2f}', accuracy_score),
    ('Precision score: {:.2f}', precision_score),
    ('Recall score:    {:.2f}', recall_score),
)
for template, metric in report_rows:
    print(template.format(metric(y_test, rf_y_pred)))


In [None]:
# Chart the 20 largest feature importances of the fitted forest as horizontal
# bars. The scores are sorted ascending and the last 20 kept, which leaves the
# top 20 still in ascending order.
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
feat_scores = importances.sort_values().tail(20)

ax = feat_scores.plot(kind='barh', figsize=(10, 8), color='b')
ax.set_title('Average Gini Importance (Top 20 features)')
ax.set_xlabel('Average contribution to information gain')