In [62]:
import dill as pickle
import pandas as pd
import os
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
import re
from scipy import stats
from sklearn import metrics
from sklearn.feature_selection import SelectPercentile, SelectFromModel
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.dummy import DummyClassifier
#in order to use SMOTE, you've got to import Pipeline from imblearn
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
%matplotlib inline

# Unpickle Models

In [51]:
# Pickle files holding the classifiers to compare (judging by the file names,
# each was selected with a different scorer).
models = [
    'best_clf_scott_accuracy.pkl',
    'best_clf_scott_fbeta.pkl',
    'best_clf_scott_precision.pkl',
    'best_clf_scott_roc_auc.pkl',
]

# Every file name starts out mapped to None; each placeholder is then replaced
# by the object unpickled from that file.
# NOTE(review): unpickling executes code stored in the file, so only load
# pickle files from a trusted source.
model_map = dict.fromkeys(models)
for model_file in models:
    with open(model_file, 'rb') as fh:
        model_map[model_file] = pickle.load(fh)

# Import Training Data

In [16]:
def create_labeled_df(labeled_data_path):
    '''
    Create a pandas DataFrame with the labeled attachment texts.

    Every regular, non-hidden file directly inside the directory becomes one row.
    The label is the part of the file name before the first underscore
    (e.g. 'RED_FA8773-10-R-0086.txt' -> 'RED').

    Arguments:
        labeled_data_path (str): the directory for the labeled attachment text files.

    Returns:
        labeled_df (pandas DataFrame): a dataframe with a column for the file name,
                                       the text, and the label (green, yellow or red).
    '''

    files = []
    texts = []
    labels = []
    # NOTE(review): os.listdir returns entries in arbitrary order, so the row order
    # (and therefore any seeded train/test split made from it) can differ between
    # machines. The order is left untouched here so existing splits do not change.
    for file in os.listdir(labeled_data_path):
        file_path = os.path.join(labeled_data_path, file)
        # skip hidden files and anything that is not a regular file (e.g. sub-directories)
        if file.startswith('.') or not os.path.isfile(file_path):
            continue
        # force utf-8 (independent of the platform default), ignoring undecodable bytes
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
        files.append(file)
        texts.append(text)
        labels.append(file.split('_')[0])

    # Build the frame column-wise; `columns` pins the column order explicitly.
    labeled_df = pd.DataFrame({'file': files, 'text': texts, 'label': labels},
                              columns=['file', 'text', 'label'])

    return labeled_df

labeled_df = create_labeled_df('labeled_fbo_docs')
# Recode the string labels to a binary numeric target:
# GREEN -> 0, while YELLOW and RED are both collapsed into 1.
# Labels that are not keys of this mapping come out of .map() as NaN.
label_codes = {'GREEN':0,'YELLOW':1,'RED':1}
labeled_df['target'] = labeled_df['label'].map(label_codes)
labeled_df.head()

Unnamed: 0,file,text,label,target
0,RED_FA8773-10-R-0086.txt,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSTATEMENT OF W...,RED,1
1,RED_NAMA-10-Q-0119.txt,\nThis is a combined synopsis/solicitation for...,RED,1
2,RED_EA1330-12-RQ-0249.txt,AMENDMENT OF SOLICITATION/MODIFICATION OF CONT...,RED,1
3,RED_FA7014-12-T-1016.txt,\n\n\n\n\n\n\n\n\n\n\n\n\n\nSAF/FMB \n\n\n\n\n...,RED,1
4,GREEN_1055521.txt,\n\nStatement of Work:\n\n1.0 BACKGROUND\nFD...,GREEN,0


# Normalize Text

In [18]:
stop_words = set(stopwords.words('english'))
# Whole-token filter. Inside the character class the '^' is a literal caret (it is
# not in first position), so a token passes when it consists only of ASCII letters
# and/or the characters '^', '5', '0', '8'.
# NOTE(review): presumably meant to let "508" through as a token -- confirm.
no_nonsense_re = re.compile(r'^[a-zA-Z^508]+$')
def strip_nonsense(doc):
    """
    Returns the lowercased whitespace-separated tokens of a string that pass `no_nonsense_re`,
    are not English stop words, and are between 3 and 17 characters long (inclusive).

    Tokens containing digits other than 5, 0 and 8 are dropped by the regex. The kept tokens
    are returned as one string in which every token is followed by a single space (so a
    non-empty result ends with a space; no kept tokens gives '').

    Tokens are NOT stemmed: the previous version of this function computed a Porter stem for
    every token but discarded it, so the output has always been the unstemmed token. That dead
    (and costly) work has been removed without changing the output.
    NOTE(review): enabling stemming would change the text fed to the already-trained
    models -- confirm against the training code before doing so.
    """
    kept = []
    for word in doc.lower().split():
        # The pattern is anchored at both ends, so a match is always the entire token.
        if not no_nonsense_re.match(word):
            continue
        if word in stop_words:
            continue
        if 3 <= len(word) <= 17:
            kept.append(word)
    # Same output format as before: each token followed by one space.
    return ''.join(token + ' ' for token in kept)

labeled_df['normalized_text'] = labeled_df['text'].apply(strip_nonsense)

# Print Classification Reports for Each Model

In [60]:
def print_clf_report(models, X_test, y_test):
    """
    Print a banner-delimited classification report for each classifier.

    Arguments:
        models (dict): maps a display name (printed between two banner lines) to an
                       object with a `predict` method.
        X_test: the inputs passed to each classifier's `predict`.
        y_test: the true targets the predictions are compared against.

    The two target values are reported under the names 'green' and 'red'.
    """
    banner = "="*80
    for name, clf in models.items():
        print(banner)
        print(name)
        print(banner)
        report = metrics.classification_report(y_test,
                                               clf.predict(X_test),
                                               target_names=['green', 'red'])
        print(report)

In [61]:
X = labeled_df['normalized_text']
y = labeled_df['target']
# Hold out 20% of the documents, stratified on the target so both splits keep the
# same class balance; the fixed seed keeps the split repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=123)

# Call print_clf_report, the reporting helper defined above. The name previously
# called here, print_model_results, is not defined in the visible cells of this
# notebook, so on a fresh kernel this cell raised NameError.
print_clf_report(model_map, X_test, y_test)

best_clf_scott_accuracy.pkl
             precision    recall  f1-score   support

      green       0.82      0.57      0.67        54
        red       0.86      0.95      0.90       145

avg / total       0.85      0.85      0.84       199

best_clf_scott_fbeta.pkl
             precision    recall  f1-score   support

      green       0.84      0.39      0.53        54
        red       0.81      0.97      0.88       145

avg / total       0.82      0.81      0.79       199

best_clf_scott_precision.pkl
             precision    recall  f1-score   support

      green       0.90      0.33      0.49        54
        red       0.80      0.99      0.88       145

avg / total       0.83      0.81      0.78       199

best_clf_scott_roc_auc.pkl
             precision    recall  f1-score   support

      green       0.61      0.80      0.69        54
        red       0.91      0.81      0.86       145

avg / total       0.83      0.81      0.82       199



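The classifier selected with `roc_auc` as the scorer is the best of the four: it has by far the highest recall on the minority `green` class (0.80, versus 0.57, 0.39 and 0.33) and the highest `green` f1-score (0.69), while the other three mostly predict `red`.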

# Compare Results to Dummy Classifiers

In [63]:
def print_dummy_clf_report(X_train, y_train, X_test, y_test, random_state=123):
    """
    Fit a DummyClassifier for each baseline strategy and print its classification report.

    The strategies tried are 'stratified', 'most_frequent', 'prior' and 'uniform'. Each
    report is preceded by the strategy name between two banner lines, and the two target
    values are reported under the names 'green' and 'red'.

    Arguments:
        X_train, y_train: data each dummy classifier is fit on.
        X_test, y_test: data each dummy classifier is evaluated on.
        random_state (int): seed passed to every DummyClassifier. The 'stratified' and
                            'uniform' strategies draw random predictions, so without a
                            seed their reports change on every run.
    """
    banner = "="*80
    for strategy in ['stratified','most_frequent','prior','uniform']:
        print(banner)
        print(strategy)
        print(banner)
        dummy = DummyClassifier(strategy=strategy, random_state=random_state)
        dummy.fit(X_train, y_train)
        y_pred = dummy.predict(X_test)
        print(metrics.classification_report(y_test, y_pred, target_names=['green', 'red']))

In [65]:
y = labeled_df['target']
# All-zero placeholder features with the same shape as labeled_df (one row per document).
X = np.zeros(labeled_df.shape)
# Same split settings as the model evaluation: 20% test, stratified on y, seed 123.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=123
)
print_dummy_clf_report(X_train, y_train, X_test, y_test)

stratified
             precision    recall  f1-score   support

      green       0.30      0.24      0.27        54
        red       0.74      0.79      0.76       145

avg / total       0.62      0.64      0.63       199

most_frequent
             precision    recall  f1-score   support

      green       0.00      0.00      0.00        54
        red       0.73      1.00      0.84       145

avg / total       0.53      0.73      0.61       199

prior
             precision    recall  f1-score   support

      green       0.00      0.00      0.00        54
        red       0.73      1.00      0.84       145

avg / total       0.53      0.73      0.61       199

uniform
             precision    recall  f1-score   support

      green       0.29      0.46      0.35        54
        red       0.74      0.57      0.65       145

avg / total       0.62      0.54      0.57       199



  'precision', 'predicted', average, warn_for)
