In [1]:
import dill as pickle
import pandas as pd
import os
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
import re
from scipy import stats
from sklearn import metrics
from sklearn.feature_selection import SelectPercentile, SelectFromModel
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.dummy import DummyClassifier
#in order to use SMOTE, you've got to import Pipeline from imblearn
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
%matplotlib inline

# Unpickle Models

In [2]:
models = ['clf_csmcallister_accuracy.pkl',
          'clf_csmcallister_precision.pkl',
          'clf_csmcallister_roc_auc.pkl']
model_map = {k:None for k in models}
for m in models:
    with open(m, 'rb') as f:
        pickled_model = pickle.load(f)
        model_map[m] = pickled_model

# Import Training Data

In [6]:
def create_labeled_df(labeled_data_path):
    '''
    Create a pandas DataFrame with the labled attachment texts.
    
    Arguments:
        labeled_data_path (str): the directory for the labeled attachment text files.
        
    Returns:
        labeled_df (pandas DataFrame): a dataframe with a column for the file name, 
                                       the text, and the label (green, yellow or red).
    '''
    
    texts = []
    files = []
    labels = []
    for file in os.listdir(labeled_data_path):
        if file.startswith('.'):
            continue
        else:
            files.append(file)
            label = file.split('_')[0]
            labels.append(label)
            file_path = os.path.join(labeled_data_path,file)
            #foce utf-8, ignoring erros
            with open(file_path, 'r', errors='ignore') as f:
                text = f.read()
                texts.append(text)
    labeled_df = pd.DataFrame(data=[files,texts,labels]).transpose()
    labeled_df.columns = ['file','text','label']
    
    return labeled_df

labeled_df = create_labeled_df('labeled_fbo_docs')
#recode labels to numeric
labeled_df['target'] = labeled_df['label'].map({'GREEN':1,'YELLOW':0,'RED':0})
labeled_df.head()

Unnamed: 0,file,text,label,target
0,RED_FA8773-10-R-0086.txt,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSTATEMENT OF W...,RED,0
1,RED_NAMA-10-Q-0119.txt,\nThis is a combined synopsis/solicitation for...,RED,0
2,RED_EA1330-12-RQ-0249.txt,AMENDMENT OF SOLICITATION/MODIFICATION OF CONT...,RED,0
3,RED_FA7014-12-T-1016.txt,\n\n\n\n\n\n\n\n\n\n\n\n\n\nSAF/FMB \n\n\n\n\n...,RED,0
4,GREEN_1055521.txt,\n\nStatement of Work:\n\n1.0 BACKGROUND\nFD...,GREEN,1


# Normalize Text

In [7]:
stop_words = set(stopwords.words('english'))
no_nonsense_re = re.compile(r'^[a-zA-Z^508]+$')
def strip_nonsense(doc):
    """
    Returns stemmed lowercased alpha-only substrings from a string that are b/w 3 and 17 chars long. 
    It keeps the substring `508`.
    
    Parameters:
        doc (str): the text of a single FBO document.
        
    Returns:
        words (str): a string of space-delimited lower-case alpha-only words (except for `508`)
    """
    
    doc = doc.lower()
    doc = doc.split()
    words = ''
    for word in doc:
        m = re.match(no_nonsense_re, word)
        if m:
            match = m.group()
            if match in stop_words:
                continue
            else:
                match_len = len(match)
                if match_len <= 17 and match_len >= 3:
                    porter = PorterStemmer()
                    stemmed = porter.stem(match)
                    words += stemmed + ' '
    return words

labeled_df['normalized_text'] = labeled_df['text'].apply(strip_nonsense)

# Plot Results of Each Model

In [17]:
def print_clf_report(models, X_test, y_test):
    for k, clf in models.items():
        print("="*80)
        print(k)
        print("="*80)
        y_pred = clf.predict(X_test)
        print(metrics.classification_report(y_test, y_pred, target_names=['red', 'green']))

In [18]:
X = labeled_df['normalized_text']
y = labeled_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=123)

print_clf_report(model_map, X_test, y_test)

clf_csmcallister_accuracy.pkl
             precision    recall  f1-score   support

        red       0.89      0.92      0.90       145
      green       0.76      0.70      0.73        54

avg / total       0.86      0.86      0.86       199

clf_csmcallister_precision.pkl
             precision    recall  f1-score   support

        red       0.90      0.92      0.91       145
      green       0.76      0.72      0.74        54

avg / total       0.86      0.86      0.86       199

clf_csmcallister_roc_auc.pkl
             precision    recall  f1-score   support

        red       0.93      0.88      0.90       145
      green       0.71      0.83      0.77        54

avg / total       0.87      0.86      0.87       199



# Compare Results to Dummy Classifiers

In [21]:
def print_dummy_clf_report(X_train, y_train, X_test, y_test):
    for strategy in ['stratified','most_frequent','prior','uniform']:
        print("="*80)
        print(strategy)
        print("="*80)
        dummy = DummyClassifier(strategy=strategy)
        dummy.fit(X_train, y_train)
        y_pred = dummy.predict(X_test)
        print(metrics.classification_report(y_test, y_pred, target_names=['green', 'red']))

In [22]:
X = np.zeros(shape=labeled_df.shape)
y = labeled_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=123)
print_dummy_clf_report(X_train, y_train, X_test, y_test)

stratified
             precision    recall  f1-score   support

      green       0.73      0.71      0.72       145
        red       0.28      0.30      0.29        54

avg / total       0.61      0.60      0.60       199

most_frequent
             precision    recall  f1-score   support

      green       0.73      1.00      0.84       145
        red       0.00      0.00      0.00        54

avg / total       0.53      0.73      0.61       199

prior
             precision    recall  f1-score   support

      green       0.73      1.00      0.84       145
        red       0.00      0.00      0.00        54

avg / total       0.53      0.73      0.61       199

uniform
             precision    recall  f1-score   support

      green       0.78      0.57      0.66       145
        red       0.33      0.56      0.41        54

avg / total       0.65      0.57      0.59       199



  'precision', 'predicted', average, warn_for)
