In [1]:
import csv
import os
import sys
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from imblearn.pipeline import Pipeline
import dill as pickle
import warnings
from pathlib import Path
from scipy import stats

warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
#labeled_df = pd.read_csv('../data/fy23_training_labels.csv')
labeled_df = pd.read_csv('../data/fy22_23_training_labels.csv')

labeled_df.rename(columns={'attachment': 'text', 'label': 'label'}, inplace=True)
labeled_df.head(5)


Unnamed: 0,text,label
0,Statement of Need\n1. Purpose: Addition of SSD...,RED
1,REVISION: 2\n\n*HISTORY*\n\nENGINEERING DATA L...,RED
2,"406 SCMS/GULAA-Hill\n\nOctober 1, 2015\n\nStat...",RED
3,MANUFACTURING QUALIFICATION REQUIREMENTS\n6685...,RED
4,U.S Department of State (DOS)\n\nBureau of Int...,RED


In [3]:
#recode labels to numeric, making the minority Green class the positive class
labeled_df['target'] = labeled_df['label'].map({'GREEN':1,'YELLOW':0,'RED':0})

# EDA

In [4]:
# describe the number of chars per text
labeled_df['text'].apply(lambda x: len(x)).describe().apply(lambda x: '%.f' % x)

count      92963
mean       48610
std        86216
min            1
25%         3696
50%        10961
75%        60302
max      2334243
Name: text, dtype: object

In [5]:
labeled_df['text'].apply(lambda x: len(set(x.split()))).describe().apply(lambda x: '%.f' % x)

count    92963
mean      1631
std       2045
min          1
25%        302
50%        642
75%       2416
max      47722
Name: text, dtype: object

# Normalize Text

In [6]:
# Download stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/adambuckingham/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stop_words = set(stopwords.words('english'))
no_nonsense_re = re.compile(r'^[a-zA-Z^508]+$')
def strip_nonsense(doc):
    """
    Returns stemmed lowercased alpha-only substrings from a string that are b/w 3 and 17 chars long. 
    It keeps the substring `508`.
    
    Parameters:
        doc (str): the text of a single FBO document.
        
    Returns:
        words (str): a string of space-delimited lower-case alpha-only words (except for `508`)
    """
    
    doc = doc.lower()
    doc = doc.split()
    words = ''
    for word in doc:
        m = re.match(no_nonsense_re, word)
        if m:
            match = m.group()
            if match in stop_words:
                continue
            else:
                match_len = len(match)
                if match_len <= 17 and match_len >= 3:
                    porter = PorterStemmer()
                    stemmed = porter.stem(match)
                    words += stemmed + ' '
    return words

In [8]:
# this takes awhile, but is totally worth it
labeled_df['normalized_text'] = labeled_df['text'].apply(strip_nonsense)

labeled_df.to_pickle('fy22_fy23_normalized_training.pkl')


In [9]:
labeled_df['normalized_text'].apply(lambda x: len(set(x.split()))).describe().apply(lambda x: '%.f' % x)

count    92963
mean       483
std        491
min          0
25%        121
50%        264
75%        765
max       5667
Name: normalized_text, dtype: object

# Import Model

In [10]:
import pickle
import pandas as pd

model = pickle.load(open('../src/fbo_scraper/binaries/clf_ajbuckingham_roc_auc.pkl','rb'))

labeled_df = pd.read_pickle('fy22_fy23_normalized_training.pkl')

In [11]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV

X = labeled_df['normalized_text']
y = labeled_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y,
                                                        stratify=y,
                                                        test_size=0.2,
                                                        random_state=123)
len(X), len(y)

(92963, 92963)

In [12]:
# Fit model with new data
model.fit(X_train,y_train)

: 

# Sanity Check

In [13]:
y_pred = model.predict(X_test)

In [14]:
from sklearn import metrics

print(metrics.classification_report(y_test, y_pred, target_names=['red', 'green']))

              precision    recall  f1-score   support

         red       0.98      0.97      0.98       631
       green       0.75      0.86      0.80        71

    accuracy                           0.96       702
   macro avg       0.87      0.91      0.89       702
weighted avg       0.96      0.96      0.96       702



  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [15]:
y_train_pred = model.predict(X_train)

In [1]:
print("Classification Report:")
print(metrics.classification_report(y_train, y_train_pred, target_names=['red', 'green']))

Classification Report:


NameError: name 'metrics' is not defined

In [None]:
pickle_file = 'clf_retrain_ajbuckingham_roc_auc'

pickle_path = os.path.join(os.getcwd(),pickle_file+'.pkl')
with open(pickle_path, 'wb') as f: 
    pickle.dump(model, f) 