In [None]:
from pymongo import MongoClient
import numpy as np
import pandas as pd
from my_tools import get_bill_data, process_corpus, read_jsonl_file
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, confusion_matrix

from sklearn.externals import joblib

# Fetch bill data through the project helper. Both returned objects are kept
# at notebook scope; the cells shown here do not use them further.
data, in_progress = get_bill_data()

# Read the labeled corpus (JSON Lines) and materialize the records as a DataFrame.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
corpus_with_labels = read_jsonl_file('/home/ubuntu/galvanize_capstone/data/nlp/corpus_with_labels.jsonl')
corpus_df = pd.DataFrame(list(corpus_with_labels))

# Features are the 'document' column; targets are the 'label' column cast to int.
X, y = corpus_df['document'], corpus_df['label'].astype(int)

In [None]:
# Stratified split so both partitions keep the label proportions of y.
# No random_state is passed (the original `random_state = 123` stays disabled),
# so every run produces a different split.
print('-------------------')
print('Doing train-test split...')
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# TF-IDF over 1- to 4-grams, keeping at most 6,000,000 vocabulary terms.
#   norm='l2'         -> each document vector is scaled to unit Euclidean length (default)
#   use_idf=True      -> terms/n-grams that are rare across the corpus get more weight (default)
#   sublinear_tf=True -> term frequency enters as 1 + log(tf), reducing the bias of document length
print('-------------------')
print('Vectorizing...')
tfidf_settings = {
    'ngram_range': (1, 4),
    'max_features': 6000000,
    'norm': 'l2',
    'use_idf': True,
    'sublinear_tf': True,
}
tfvect = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary and IDF weights on the training documents only,
# then apply the same fitted transform to the test documents.
X_train_vec = tfvect.fit_transform(X_train)
X_test_vec = tfvect.transform(X_test)

# Fitted mapping of term -> column index in the vectorized matrices.
vocab = tfvect.vocabulary_

In [None]:
print('-------------------')
print('Training Random Forest Classifier with vectorized results...')
# 100 trees with no depth limit and the minimal split/leaf sizes, built on all cores.
# (A `max_features = 500000` setting is commented out in the original cell.)
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,
)
rf.fit(X_train_vec, y_train)

# Hard class predictions and per-class probabilities for the held-out documents.
rf_y_pred = rf.predict(X_test_vec)
rf_y_pred_proba = rf.predict_proba(X_test_vec)

# Print each metric on the held-out split, one line per metric.
for report_line, metric_fn in (
    ('F1 Score:\t\t{:.4f}', f1_score),
    ('Recall Score:\t\t{:.4f}', recall_score),
    ('Precision Score:\t{:.4f}', precision_score),
    ('Accuracy Score:\t\t{:.4f}', accuracy_score),
):
    print(report_line.format(metric_fn(y_test, rf_y_pred)))

In [None]:
# Plot the 50 most important features of the fitted random forest.
#
# `vocab` maps term -> column index, but using the dict itself as an index
# yields its keys in insertion order, not in column order.
# rf.feature_importances_ is ordered by column, so the terms are sorted by
# their column index first; otherwise each importance would be labeled with
# the wrong n-gram.
feature_names = sorted(vocab, key=vocab.get)
feat_scores = pd.Series(rf.feature_importances_, index=feature_names)

# Keep the 50 largest scores, reversed to ascending order so that the most
# important feature is drawn at the top of the horizontal bar chart.
feat_scores = feat_scores.nlargest(50).iloc[::-1]
ax = feat_scores.plot(kind='barh',
                      figsize=(10, 8),
                      color='b')
ax.set_title('Average Gini Importance (Top 50 features)')
ax.set_xlabel('Average contribution to information gain');

In [None]:
# Display the number of trees configured on the random forest.
rf.n_estimators

In [None]:
# Confusion matrix on the held-out split (rows: true labels, columns: predicted labels).
confusion_matrix(y_test, rf_y_pred)

In [None]:
# Distinct values in column 1 of the predicted probabilities
# (presumably the positive class — confirm against rf.classes_).
pd.Series(rf_y_pred_proba[:, 1]).unique()

In [None]:
# Spot-check: column-1 probability for the test row at position 45.
rf_y_pred_proba[45, 1]

In [None]:
# Persist the fitted TF-IDF vectorizer to disk with joblib.
# The path is relative to the notebook's working directory.
print('Pickling the TfidfVectorizer...')
tfidf_pickle_path = 'pickle_files/tfidfVectorizer.pkl'
joblib.dump(tfvect, tfidf_pickle_path)

In [None]:
# Persist the fitted random forest classifier to disk with joblib.
# The path is relative to the notebook's working directory.
print('Pickling the Random Forest Classifier...')
rf_pickle_path = 'pickle_files/nlp_randomForest.pkl'
joblib.dump(rf, rf_pickle_path)

In [None]:
# rf_y_pred_proba[0]

In [None]:
# The recall scores seem abnormally high, so repeat the experiment on several
# fresh stratified splits and average the recall to build confidence in the results.
#
# NOTE: this cell rebinds X_train/X_test/y_train/y_test, X_train_vec/X_test_vec,
# tfvect, rf and rf_y_pred, so after it runs those names no longer refer to the
# objects created earlier in the notebook.
# NOTE(review): the vectorizer/forest settings here differ from the ones used
# earlier (ngram_range=(1, 4), max_features, sublinear_tf, n_estimators=100) —
# confirm that is intended for this sanity check.
n_iterations = 5
recall_scores = []

for i in range(1, n_iterations + 1):
    print('-------------')
    print('Performing train-test split and vectorizing for iteration {}...'.format(i))
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        stratify=y)

    # Fit the vectorizer on the training documents only, then reuse it for the test set.
    tfvect = TfidfVectorizer(ngram_range=(1, 5))
    X_train_vec = tfvect.fit_transform(X_train)
    X_test_vec = tfvect.transform(X_test)

    print('\tFitting data to Random Forest Classifier...')
    # 'sqrt' is what 'auto' meant for classifiers; newer scikit-learn releases
    # no longer accept 'auto'.
    rf = RandomForestClassifier(max_features='sqrt', n_jobs=-1)

    # Train and predict on the TF-IDF matrices: the forest needs numeric
    # features, not the raw document strings in X_train / X_test.
    rf.fit(X_train_vec, y_train)
    rf_y_pred = rf.predict(X_test_vec)
    recall_scores.append(recall_score(y_test, rf_y_pred))

# Report the number of scores actually collected rather than a loop counter.
print('Mean recall score after {} iterations: {}'.format(len(recall_scores), np.mean(recall_scores)))