In [18]:
import csv
import os
import sys
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from imblearn.pipeline import Pipeline
import dill as pickle
import warnings
from pathlib import Path
from scipy import stats

warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
# Load the FY23 labeled training set and expose the `attachment` column under
# the name `text`, which is what the rest of the notebook refers to.
labeled_df = pd.read_csv('../data/fy23_training_labels.csv').rename(
    columns={'attachment': 'text'}
)
labeled_df.head(5)


Unnamed: 0,text,label
0,"8/21/23, 5:36 PM\n\nSAM.gov\n\n""REGISTER OF WA...",RED
1,"ANL-71-COM (February 13, 2023)\nAPPENDIX A\nAr...",RED
2,"ANL-70 (February 22, 2013)\n| ...",RED
3,"ANL-70B (June 06, 2022)\n\nPRE-AWARD INFORMATI...",RED
4,ANL 562IP (June 2020)\n\nAPPENDIX D-8-C\nINTEL...,RED


In [3]:
# Recode labels to a binary target, making the minority GREEN class the
# positive class (1); YELLOW and RED both collapse into the negative class (0).
LABEL_TO_TARGET = {'GREEN': 1, 'YELLOW': 0, 'RED': 0}
labeled_df['target'] = labeled_df['label'].map(LABEL_TO_TARGET)

# `.map` yields NaN for any label missing from the mapping (typo, different
# casing, blank cell). Fail here with a clear message instead of letting NaN
# targets reach the stratified split and the model fit.
unmapped_mask = labeled_df['target'].isna()
if unmapped_mask.any():
    unexpected_labels = sorted(
        labeled_df.loc[unmapped_mask, 'label'].astype(str).unique()
    )
    raise ValueError(
        f"{int(unmapped_mask.sum())} row(s) have labels outside "
        f"{sorted(LABEL_TO_TARGET)}: {unexpected_labels}"
    )

# EDA

In [4]:
# Summary statistics of document length in characters, formatted without
# decimals for display.
char_counts = labeled_df['text'].apply(len)
char_counts.describe().apply(lambda stat: '%.f' % stat)

count      3506
mean      39415
std       59171
min          11
25%        3463
50%       10682
75%       58576
max      893848
Name: text, dtype: object

In [5]:
# Summary statistics of distinct whitespace-delimited tokens per raw document,
# formatted without decimals for display.
raw_vocab_sizes = labeled_df['text'].apply(lambda text: len(set(text.split())))
raw_vocab_sizes.describe().apply(lambda stat: '%.f' % stat)

count     3506
mean      1462
std       1616
min          2
25%        282
50%        663
75%       2372
max      12432
Name: text, dtype: object

# Normalize Text

In [6]:
# Download stopwords: fetch the NLTK stopwords corpus that the normalization
# step reads via `stopwords.words('english')`. When the package is already
# present locally the call just reports it as up to date and returns True.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/adambuckingham/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# English stopwords held in a set for fast membership tests during normalization.
stop_words = set(stopwords.words('english'))
# NOTE(review): inside a character class, `^` (when not the first character) and
# each of `5`, `0`, `8` are literal single characters. This pattern therefore
# accepts any token made up solely of ASCII letters, '^', '5', '0' and '8'
# (e.g. '885' or 'a^b'), not "letters only, or the literal token 508".
# Presumably `^(?:[a-zA-Z]+|508)$` was intended — confirm against the inference
# pipeline before changing it, because the model's features depend on this filter.
no_nonsense_re = re.compile(r'^[a-zA-Z^508]+$')
def strip_nonsense(doc):
    """
    Normalize a document into space-delimited, stemmed, lower-case tokens.

    The text is lower-cased and split on whitespace. A token is kept only if it
    fully matches the module-level `no_nonsense_re` pattern, is not in
    `stop_words`, and is between 3 and 17 characters long (inclusive). Kept
    tokens are Porter-stemmed and emitted in their original order.

    NOTE(review): `no_nonsense_re` is `^[a-zA-Z^508]+$`, a character class that
    admits letters plus the single characters '^', '5', '0' and '8'. Tokens such
    as '508' pass, but so do '885' or 'a^b'; the filter is not strictly
    alpha-only. Behavior is left as-is here because the model's features depend
    on it.

    Parameters:
        doc (str): the text of a single FBO document.

    Returns:
        words (str): the stemmed tokens, each followed by a single space (so a
            non-empty result ends with a trailing space); an empty string when
            no token survives the filters.
    """
    # One stemmer per document rather than a fresh instance for every token.
    porter = PorterStemmer()

    stemmed_tokens = []
    for word in doc.lower().split():
        # The pattern is anchored at both ends, so a match is the whole token.
        if not no_nonsense_re.match(word):
            continue
        if word in stop_words:
            continue
        if 3 <= len(word) <= 17:
            stemmed_tokens.append(porter.stem(word))

    # Build the result once instead of growing a string inside the loop; the
    # trailing space after every token is preserved for output compatibility.
    return ''.join(token + ' ' for token in stemmed_tokens)

In [8]:
# Normalize every document with strip_nonsense. This is the slow step of the
# notebook (the original author notes it takes a while but is worth it).
labeled_df['normalized_text'] = labeled_df['text'].map(strip_nonsense)

In [9]:
# Summary statistics of distinct tokens per document after normalization,
# formatted without decimals for display.
normalized_vocab_sizes = labeled_df['normalized_text'].apply(
    lambda text: len(set(text.split()))
)
normalized_vocab_sizes.describe().apply(lambda stat: '%.f' % stat)

count    3506
mean      451
std       416
min         0
25%       121
50%       278
75%       751
max      2704
Name: normalized_text, dtype: object

# Import Model

In [10]:
# NOTE(review): this rebinds `pickle` to the standard-library module, replacing
# the `dill as pickle` alias from the imports cell for every later cell in a
# top-to-bottom run (including the final `pickle.dump`) — confirm which
# serializer is intended.
import pickle

# Load the existing pickled model. `pickle.load` can execute arbitrary code, so
# only point this at a trusted binary. The `with` block guarantees the file
# handle is closed once the model has been read.
with open('../src/fbo_scraper/binaries/clf_ajbuckingham_roc_auc.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [11]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Features are the normalized text; the target is the binary GREEN-vs-rest flag.
X, y = labeled_df['normalized_text'], labeled_df['target']

# Stratified 80/20 hold-out split with a fixed seed so the split is repeatable.
split_options = dict(stratify=y, test_size=0.2, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

len(X), len(y)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


(3506, 3506)

In [12]:
# Fit model with new data: refit the unpickled model on the training split only,
# leaving X_test / y_test untouched for the sanity check that follows.
# NOTE(review): presumably `fit` retrains from the estimator's stored
# configuration rather than warm-starting from the pickled weights (the usual
# scikit-learn contract) — confirm for this estimator.
model.fit(X_train,y_train)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


# Sanity Check

In [13]:
# Predict on the held-out 20% split produced by train_test_split.
y_pred = model.predict(X_test)

In [14]:
from sklearn import metrics

# Held-out performance. Class 0 is displayed as 'red' but covers both YELLOW and
# RED documents; class 1 ('green') is GREEN.
test_report = metrics.classification_report(
    y_test,
    y_pred,
    target_names=['red', 'green'],
)
print(test_report)

              precision    recall  f1-score   support

         red       0.98      0.97      0.98       631
       green       0.75      0.86      0.80        71

    accuracy                           0.96       702
   macro avg       0.87      0.91      0.89       702
weighted avg       0.96      0.96      0.96       702



  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [15]:
# Predict on the training split as well, so the train and test reports can be
# compared (a large gap between them would suggest overfitting).
y_train_pred = model.predict(X_train)

In [16]:
# Training-split performance, for comparison against the held-out report.
# Class 0 is displayed as 'red' but covers both YELLOW and RED documents.
train_report = metrics.classification_report(
    y_train,
    y_train_pred,
    target_names=['red', 'green'],
)
print(train_report)

              precision    recall  f1-score   support

         red       0.99      0.98      0.98      2518
       green       0.82      0.93      0.87       286

    accuracy                           0.97      2804
   macro avg       0.91      0.95      0.93      2804
weighted avg       0.97      0.97      0.97      2804



  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [None]:
# Persist the retrained model as `<pickle_file>.pkl` in the current working
# directory.
# NOTE(review): `pickle` here is whichever module was bound most recently (the
# `dill` alias from the imports cell or the stdlib module imported later) —
# confirm which serializer the consumer of this file expects.
pickle_file = 'clf_retrain_ajbuckingham_roc_auc'
pickle_path = os.path.join(os.getcwd(), f'{pickle_file}.pkl')

with open(pickle_path, 'wb') as model_out:
    pickle.dump(model, model_out)