In [61]:
%%writefile my_tools.py
import json
import codecs
import copy

import pandas as pd
from pymongo import MongoClient
import datetime

from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet  import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams, skipgrams
import string
import re




def read_jsonl_file(path):
    '''
    Turn a jsonl file (one JSON record per line) into a list of objects.
    --------------------
    Parameters: path - path of a UTF-8 encoded jsonl file.
    --------------------
    Returns: list with one decoded object per non-blank line, in file order.
    '''
    records = []
    # The built-in text reader only breaks lines on \n, \r and \r\n. The codecs
    # reader also breaks on characters such as U+2028, which may legally appear
    # unescaped inside a JSON string and would split one record in two.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # tolerate blank lines (e.g. a trailing empty line at end of file)
            if line:
                records.append(json.loads(line))
    return records


def read_json_file(path):
    '''
    Turn a normal json file (a single JSON document) into an object.
    --------------------
    Parameters: path - path of a UTF-8 encoded json file.
    --------------------
    Returns: the decoded JSON document.
    '''
    # the context manager closes the handle even if reading fails
    with codecs.open(path, 'r', 'utf-8') as f:
        text = f.read()
    return json.loads(text)


def write_jsonl_file(list_of_objects, path):
    '''
    Dump a list of objects out as a jsonl file (one JSON record per line).

    Any existing file at path is overwritten.
    --------------------
    Parameters: list_of_objects - iterable of JSON-serializable objects.
                path - path of the file to write (UTF-8).
    --------------------
    Returns: None.
    '''
    # the context manager closes the handle even if serialization fails part-way
    with codecs.open(path, 'w', 'utf-8') as f:
        for row in list_of_objects:
            f.write(json.dumps(row, ensure_ascii=False) + '\n')

    
def write_json_file(obj, path):
    '''
    Dump an object and write it out as json to a file.

    The file is opened in append mode, so each call adds one more JSON
    record (followed by a newline) to whatever the file already contains.
    --------------------
    Parameters: obj - JSON-serializable object.
                path - path of the file to append to (UTF-8).
    --------------------
    Returns: None.
    '''
    # the original wrote `f.close` without parentheses, so the handle was never
    # explicitly closed; the context manager closes it deterministically
    with codecs.open(path, 'a', 'utf-8') as f:
        f.write(json.dumps(obj, ensure_ascii=False) + '\n')
    
    

def _clean_bill_data(data):
    '''Filter out unwanted legislation types and derive the length/date/session columns.'''
    # filter out simple resolutions, concurrent resolutions, and amendments (for prelim model)
    excluded_types = ['RESOLUTION', 'CONCURRENT RESOLUTION', 'AMENDMENT']
    data = data[~data['leg_type'].isin(excluded_types)].copy()

    # character count of the bill text
    data['bill_char_counts'] = data['body'].map(len)

    # the first 10 characters of intro_date hold the date as mm/dd/YYYY
    data['intro_date'] = data['intro_date'].apply(
        lambda x: datetime.datetime.strptime(x[:10], '%m/%d/%Y'))
    data['intro_month'] = data['intro_date'].apply(lambda x: x.month)

    # get session from year (odd years are Session 1, even years are Session 2).
    # This was previously derived from the parity of the congress number, which
    # gave every bill of a congress the same session regardless of its year.
    data['session'] = data['intro_date'].apply(lambda x: 2 if x.year % 2 == 0 else 1)

    # filter out non-numeric num_of_cosponsors values
    bad_cosponsor_values = ['S. Rept. 110-184', 'TXT', 'All Actions']
    data = data[~data['num_of_cosponsors'].isin(bad_cosponsor_values)].copy()

    # correction for mislabeled sponsor_state and sponsor_party: swap the two columns
    state = data['sponsor_state'].copy()
    data['sponsor_state'] = data['sponsor_party']
    data['sponsor_party'] = state

    return data


def _add_char_count_bucket(data):
    '''Return a copy of data with bill_char_counts binned into 1000-character buckets.'''
    edges = [float('-inf')] + list(range(1000, 10001, 1000)) + [float('inf')]
    bucket_names = ['less than 1000', '1001 - 2000', '2001 - 3000', '3001 - 4000',
                    '4001 - 5000', '5001 - 6000', '6001 - 7000', '7001 - 8000',
                    '8001 - 9000', '9001 - 10000', 'greater than 10000']
    data = data.copy()
    # intervals are closed on the right, e.g. (1000, 2000] -> '1001 - 2000';
    # cast back to plain strings so the column is not categorical
    data['char_count_bucket'] = pd.cut(data['bill_char_counts'],
                                       bins=edges,
                                       labels=bucket_names).astype(object)
    return data


def _label_bills(data):
    '''Attach a 'labels' column (1, 0 or 'in_progress') based on bill status and congress.'''
    # bills that became law are the positive class
    is_law = data['bill_status'].isin(['Became Law', 'Became Private Law'])
    became_law = data[is_law].copy()
    became_law['labels'] = 1
    others = data[~is_law]

    # anything not signed into law before the current (115th) congress is dead
    is_current = others['congress_id'] == '115th'
    prev_cong = others[~is_current].copy()
    prev_cong['labels'] = 0
    current_cong = others[is_current]

    # label To President and Resolving Differences with 1; everything else is on the floor
    near_law = current_cong['bill_status'].isin(['To President', 'Resolving Differences'])
    to_pres = current_cong[near_law].copy()
    to_pres['labels'] = 1
    on_floor = current_cong[~near_law]

    # bills on the floor whose status starts with 'Failed' are labeled 0
    has_failed = on_floor['bill_status'].str.startswith('Failed')
    failed = on_floor[has_failed].copy()
    failed['labels'] = 0
    not_failed = on_floor[~has_failed]

    # bills that were only introduced are 'in_progress' and are dropped later
    is_introduced = not_failed['bill_status'] == 'Introduced'
    introduced = not_failed[is_introduced].copy()
    introduced['labels'] = 'in_progress'
    beyond_intro = not_failed[~is_introduced]

    passed_house = beyond_intro['bill_status'] == 'Passed House'
    passed_senate = beyond_intro['bill_status'] == 'Passed Senate'
    from_house = beyond_intro['leg_id'].str.startswith('H')
    from_senate = beyond_intro['leg_id'].str.startswith('S')

    # bills that started in one chamber and have already passed the other get a 1
    passed_opp_chamber = beyond_intro[(passed_house & from_senate) |
                                      (passed_senate & from_house)].copy()
    passed_opp_chamber['labels'] = 1

    # bills that have only passed the chamber they were introduced in are 'in_progress'
    in_orig_chamber = beyond_intro[(passed_house & from_house) |
                                   (passed_senate & from_senate)].copy()
    in_orig_chamber['labels'] = 'in_progress'

    # rows of beyond_intro with any other status match neither mask and are dropped
    return pd.concat([became_law, prev_cong, to_pres, failed, introduced,
                      passed_opp_chamber, in_orig_chamber])


def get_bill_data():
    '''
    Query data from mongo db bills.bill_details and return a pandas dataframe.
    
    The data relevant to this project is currently set up to be limited to the 
    110th Congress forward.

    The 'session' column is derived from the year of intro_date (odd year -> 1,
    even year -> 2).
    --------------------
    Parameters: None.
    --------------------    
    Returns: pandas dataframe with relevant data and corresponding labels
             (column 'labels': 1 or 0). Rows labeled 'in_progress' are removed.
             The pre-reset index is kept in a column named 'index'.
                
    '''
    # connect to mongodb
    client = MongoClient() # defaults to localhost
    try:
        bill_details = client.bills.bill_details

        # keep only records whose body matches the regex 'e', and materialize
        # the cursor before the connection is closed
        records_with_text = bill_details.find({'body': {'$regex': 'e'}})
        data = pd.DataFrame(list(records_with_text))
    finally:
        client.close()

    # DATA CLEANUP
    data = _clean_bill_data(data)
    data = _add_char_count_bucket(data)

    # LABELING
    data_l = _label_bills(data)

    # filter out those that are still in progress
    df = data_l[data_l['labels'] != 'in_progress']

    # filter for most recent congress_ids
    recent_congresses = ['115th', '114th', '113th', '112th', '111th', '110th']
    small_df = df[df['congress_id'].isin(recent_congresses)].copy()
    
    print('------------------')
    print('------------------')
    print('Data is from the 110th Congress (2007) to present')
    print('Alter masking in my_tools.get_bill_data to get a different data set.')
    print('------------------')
    
    small_df = small_df.reset_index()
    
    return small_df



def process_corpus(df, corpus_col_name, labels_col_name):
    '''
    Processes the text in df[corpus_col_name] to return a corpus (list) and the series of 
    corresponding labels in df[labels_col_name].
    
    The intent of this function is to feed the output into a stratified train-test split.
    -------------------
    Parameters: df - pandas dataframe
                corpus_col_name - name of column in df that contains the text to be processed.
                labels_col_name - name of column in df that contains the labels.
    -------------------
    Returns: X - a list of documents (each a single space-joined string of processed tokens)
             y - a pandas series of corresponding labels, cast to int
    '''
    # create a corpus
    print('------------------')
    print('Creating corpus...')

    # replace every run of commas, underscores and digits with a single space
    strip_pattern = re.compile(r'[,_\d]+')
    documents = [strip_pattern.sub(' ', doc) for doc in df[corpus_col_name]]

    # clip the intro of each bill: drop everything up to and including the
    # 'Office]' marker plus the one character that follows it. Documents that
    # do not contain the marker are kept whole instead of raising ValueError.
    marker = 'Office]'
    clipped = []
    for doc in documents:
        marker_pos = doc.find(marker)
        clipped.append(doc[marker_pos + 8:] if marker_pos != -1 else doc)
    documents = clipped

    # tokenize the corpus
    print('------------------')
    print('Tokenizing...')
    corpus = [word_tokenize(content.lower()) for content in documents]

    # strip out the stop words and single-character punctuation tokens
    print('------------------')
    print('Stripping out stop words, punctuation, and numbers...')
    # a set makes each membership test O(1) instead of a scan of the whole list
    stop_words = set(stopwords.words('english'))
    stop_words.update(['mr', 'ms', 'mrs', 'said', 'year', 'would', 'could', 'also', 'shall', '_______________________________________________________________________'])
    punc = set(string.punctuation)
    corpus = [[token for token in doc if token not in stop_words and token not in punc]
              for doc in corpus]

    # lemmatize
    print('------------------')
    print('Lemmatizing...')
    lemmer = WordNetLemmatizer()
    corpus = [[lemmer.lemmatize(word) for word in doc] for doc in corpus]

    # rejoin each doc in corpus so each doc is a single string
    corpus = [' '.join(tokens) for tokens in corpus]

    print('------------------')
    print('NLP preprocessing complete ...')

    X = corpus
    y = df[labels_col_name].astype('int')
    
    return X, y

Overwriting my_tools.py


In [55]:
# from my_tools import *
# Load the cleaned and labeled bill data frame built by get_bill_data().
data = get_bill_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


became_law: (3107, 21)
prev_cong: (53104, 21)
to_pres: (31, 21)
failed: (4, 21)
introduced: (9900, 21)
passed_opp_chamber: (14, 21)
in_orig_chamber: (879, 21)
data_l: (67039, 21)
df: (56260, 21)
small_df: (52240, 21)
------------------
------------------
Data is from the 110th Congress (2007) to present
------------------


In [56]:
data.shape

(52240, 22)

In [57]:
data.head()

Unnamed: 0,index,_id,bill_status,body,committee,congress_id,cosponsors,cosponsors_url,desc,intro_date,...,leg_url,num_of_cosponsors,sponsor,sponsor_district,sponsor_party,sponsor_state,bill_char_counts,intro_month,session,labels
0,865,5c182d4c1417de23a825d5d2,Became Law,[114th Congress Public Law 319] [From the U.S....,,114th,,https://www.congress.gov/bill/114th-congress/h...,Foreign Cultural Exchange Jurisdictional Immun...,2016-12-08,...,https://www.congress.gov/bill/114th-congress/h...,3,"Rep. Chabot, Steve",1,R,OH,5366,12,2,1
1,890,5c182d4c1417de23a825d5eb,Became Law,[114th Congress Public Law 327] [From the U.S....,"House - Natural Resources, Science, Space, and...",114th,,https://www.congress.gov/bill/114th-congress/h...,Ensuring Access to Pacific Fisheries Act,2016-12-07,...,https://www.congress.gov/bill/114th-congress/h...,2,"Rep. Radewagen, Aumua Amata Coleman",At Large,R,AS,68574,12,2,1
2,891,5c182d4c1417de23a825d5ec,Became Law,[114th Congress Public Law 318] [From the U.S....,"House - Oversight and Government Reform, Trans...",114th,,https://www.congress.gov/bill/114th-congress/h...,Federal Property Management Reform Act of 2016,2016-12-07,...,https://www.congress.gov/bill/114th-congress/h...,1,"Rep. Denham, Jeff",10,R,CA,24935,12,2,1
3,892,5c182d4c1417de23a825d5ed,Became Law,[114th Congress Public Law 317] [From the U.S....,House - Oversight and Government Reform,114th,,https://www.congress.gov/bill/114th-congress/h...,Inspector General Empowerment Act of 2016,2016-12-07,...,https://www.congress.gov/bill/114th-congress/h...,2,"Rep. Chaffetz, Jason",3,R,UT,33853,12,2,1
4,911,5c182d4c1417de23a825d600,Became Law,[114th Congress Public Law 316] [From the U.S....,House - Judiciary,114th,,https://www.congress.gov/bill/114th-congress/h...,"Promoting Travel, Commerce, and National Secur...",2016-12-02,...,https://www.congress.gov/bill/114th-congress/h...,23,"Rep. Kuster, Ann M.",2,D,NH,3465,12,2,1


In [49]:
# length of each bill body, stored alongside the data
bill_lengths = [len(body) for body in data['body']]
data['bill_char_counts'] = bill_lengths

In [41]:
# create a corpus
print('------------------')
print('Creating corpus...')
documents = list(data['body'])

# replace every run of commas, underscores and digits with a single space
# (raw string: '\d' in a normal string literal is an invalid escape sequence)
strip_pattern = re.compile(r'[,_\d]+')
documents = [strip_pattern.sub(' ', doc) for doc in documents]

len(documents)

------------------
Creating corpus...


50039