In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
#Read in data from CSV files.
df = pd.read_csv('raw_data/PO_Dataset.csv')
name_mapping = pd.read_csv('clean_data/Clean_Code_Master_list.csv')
df.head()


Unnamed: 0,Company #,Purchase Order,Item,Vendor,Description,Unit of Measure,Units,Unit Cost,Cost,Cost Code
0,8,1200-001,1,Paragon Electrical Installations Ltd.,Additional smoke detector/re-verification,LS,0,0.0,1444.0,26-20-20
1,8,1200-002,1,Accurate Aluminum Ltd,S&I railing as per quote Aug. 13 2015,LS,0,0.0,500.0,05-52-20
2,8,1200-003,1,Dura Productions,S&I metal ramp,LS,0,0.0,795.0,05-52-20
3,8,1200-004,1,Friesen Floors & Window Fashions Ltd,S&I hardwood flooring for enclosed balcony area,LS,0,0.0,2314.0,09-64-33
4,8,1209-1-01,1,Alba Painting Ltd.,Painting of two offices,LS,0,0.0,900.0,09-91-40


In [3]:
#The Convert the Units column to float
df['Units'] = pd.to_numeric(df['Units'], errors='coerce').fillna(0)
df['Units'] = df['Units'].astype('float64')

#Drop lines with null values
df.dropna(inplace=True)

#Read in Master list of valid cost codes
df_ml = pd.read_csv('raw_data/Code_Master_list.csv')

#Drop rows where the cost code is not in the master list
df = df[df['Cost Code'].isin(df_ml['Cost Code'])].dropna()

#Create a new dataframe that takes only the 90th quartile of data from the 3 numerical columns.
df_90 = df[df['Cost'] < df['Cost'].quantile(.90)]
df_90 = df_90[df_90['Units'] < df_90['Units'].quantile(.90)]
df_90 = df_90[df_90['Unit Cost'] < df_90['Unit Cost'].quantile(.90)]

# It's a good practice to scale numerical data
# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler() 
numerical = ['Units','Unit Cost','Cost']

df_90[numerical] = scaler.fit_transform(df_90[numerical])

# When splitting for training and testing later, we'll need a minimum of 2 examples of each cost code.
# Assign cost code to a variable
df_count = df_90['Cost Code'].value_counts()

#New dataframe only includes lines with cost codes with a count of 2 or greater
df_90 = df_90[~df_90['Cost Code'].isin(df_count[df_count < 2].index)]


#One Hot Encode categorical features
categorical = ['Vendor', 'Unit of Measure']
df_90 = pd.get_dummies(df_90, columns = categorical )

#Numerically encode cost codes.
le = LabelEncoder()
cost_code = df_90['Cost Code']
df_90['Cost Code Encoded'] = le.fit_transform(cost_code)

#drop features I won't be using
df_90 = df_90.drop(['Company #','Purchase Order', 'Item'], axis = 1)

df = df_90


In [4]:
cost_codes = df['Cost Code Encoded']
features = df.drop(['Cost Code','Cost Code Encoded'], axis=1)


In [5]:
#Use sklearn train test split to split the data into training and testing sets. 
#Testing set is 20% of total dataset size.
#Stratify the data so we don't introduce bias in the sets.

X_train, X_test, y_train, y_test = train_test_split(features,
                                                    cost_codes,
                                                    test_size = 0.2,
                                                    stratify = cost_codes
                                                   )



In [6]:
print(X_train.shape)
print(y_train.shape)

(22739, 487)
(22739,)


In [7]:
#Split out X_train and X_test Descriptions for use in sepearate model.
X_train_desc = X_train['Description'].copy()
X_train = X_train.drop('Description', axis=1)

X_test_desc = X_test['Description'].copy()
X_test = X_test.drop('Description', axis=1)

In [9]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV


pipeline = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(random_state=42, tol = 1e-3)),
               ])
parameters = {
    #'clf__loss':['hinge','log'],
    #'clf__penalty':['l1','l2'],
    #'clf__alpha':[1e-3,1e-4],
    #'clf__max_iter':[15,20,25]
    'clf__penalty':['l2'],
    'clf__alpha':[1e-4],
    'clf__max_iter':[20]
}

CV = GridSearchCV(pipeline, parameters, scoring = 'f1', n_jobs=1, cv = 5)

CV.fit(X_train_desc, y_train)
print('Best score and parameter combination = ')
print(CV.best_score_)    
print(CV.best_params_) 


y_pred = CV.predict(X_test_desc)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))



Best score and parameter combination = 
0.07346552835688824
{'clf__alpha': 0.0001, 'clf__max_iter': 20, 'clf__penalty': 'l2'}
accuracy 0.43447669305189096
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           2       0.83      0.95      0.89       112
           3       0.44      0.88      0.58         8
           4       0.89      0.83      0.86        29
           5       0.89      0.84      0.86        19
           6       1.00      0.60      0.75         5
           7       0.06      0.13      0.08        23
           9       0.60      0.60      0.60         5
          10       0.00      0.00      0.00         1
          11       0.32      0.33      0.32        46
          12       0.66      0.73      0.69       156
          13       0.00      0.00      0.00         1
          14       0.15      0.38      0.22        29
          15       0.00      0.00      0.00         2
          16       0.33      0.33 

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


SGDC Classifier Results: 

Best Parameters:clf__alpha': 0.0001, 'clf__loss': 'hinge', 'clf__max_iter': 20, 'clf__penalty': 'l2'}
accuracy 0.44802110817941954

In [10]:
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(random_state=42,multi_class='multinomial')),
               ])
parameters = {
    'clf__C':[100],
    'clf__solver':['lbfgs'],
    'clf__max_iter':[1000]
}

CV = GridSearchCV(pipeline, parameters, scoring = 'f1', n_jobs=1, cv = 5)

CV.fit(X_train_desc, y_train)
print('Best score and parameter combination = ')
print(CV.best_score_)    
print(CV.best_params_) 


y_pred = CV.predict(X_test_desc)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))



Best score and parameter combination = 
0.18659155552723491
{'clf__C': 100, 'clf__max_iter': 1000, 'clf__solver': 'lbfgs'}
accuracy 0.4712401055408971
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         2
           2       0.93      0.96      0.94       112
           3       0.62      0.62      0.62         8
           4       0.83      0.69      0.75        29
           5       0.77      0.89      0.83        19
           6       1.00      0.60      0.75         5
           7       0.41      0.57      0.47        23
           9       0.67      0.40      0.50         5
          10       0.00      0.00      0.00         1
          11       0.47      0.41      0.44        46
          12       0.57      0.79      0.66       156
          13       0.00      0.00      0.00         1
          14       0.59      0.45      0.51        29
          15       0.00      0.00     

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Linear regression results: 
'clf__C': 100, 'clf__max_iter': 1000, 'clf__solver': 'lbfgs'
accuracy 0.47634124890061563


In [11]:
X_test['Desc Pred'] = y_pred


In [12]:
X_train['Desc Pred'] = CV.predict(X_train_desc)

In [14]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

parameters = {'max_depth': [200],
              'min_samples_split': [2],
              'min_samples_leaf': [2],
              'n_estimators': [1000]
             }

CV = GridSearchCV(clf, parameters, scoring = 'f1', n_jobs=1, cv = 5)

CV.fit(X_train, y_train)
print('Best score and parameter combination = ')
print(CV.best_score_)    
print(CV.best_params_) 


y_pred = CV.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))





Best score and parameter combination = 
0.5520303932745646
{'max_depth': 500, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}
accuracy 0.5282321899736148
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           2       0.80      0.97      0.88       112
           3       1.00      0.88      0.93         8
           4       0.95      0.62      0.75        29
           5       0.94      0.79      0.86        19
           6       1.00      0.60      0.75         5
           7       0.80      0.17      0.29        23
           9       1.00      1.00      1.00         5
          10       0.00      0.00      0.00         1
          11       0.58      0.30      0.40        46
          12       0.67      0.78      0.72       156
          13       0.00      0.00      0.00         1
          14       0.91      0.34      0.50        29
          15       0.00      0.00      0.00         2
          16  

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [None]:
from xgboost import XGBClassifier

clf = XGBClassifier()

parameters = {'booster': ['gbtree', 'gblinear']
             }

CV = GridSearchCV(clf, parameters, scoring = 'f1', n_jobs=1, cv = 5)

CV.fit(X_train, y_train)
print('Best score and parameter combination = ')
print(CV.best_score_)    
print(CV.best_params_) 


y_pred = CV.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))




In [28]:
df_output = pd.DataFrame({'Text':X_test_desc ,'Prediction':le.inverse_transform(y_pred), 'Actual':le.inverse_transform(y_test)})

In [29]:
name_mapping.head()

Unnamed: 0,Cost Code,Description
0,00-10-17,geotechnical consultant
1,00-61-13,subtrade bonds
2,01-30-12,project manager
3,01-30-14,project coordinator
4,01-30-23,finishing superintendent


In [41]:
out = df_output.merge(name_mapping, left_on='Actual',right_on='Cost Code')
out.rename(index=str, columns={'Description':'Actual Desc.'}, inplace = True)
out.drop(['Cost Code'], axis = 1, inplace = True)
out.head()

Unnamed: 0,Text,Prediction,Actual,Actual Desc.
0,Danger,01-52-16,01-52-17,safety equipment & supplies
1,Air horns,01-52-17,01-52-17,safety equipment & supplies
2,92.600 LRG,01-52-16,01-52-17,safety equipment & supplies
3,annual inspect extinguisher,01-52-17,01-52-17,safety equipment & supplies
4,Strips Bandage,01-52-16,01-52-17,safety equipment & supplies


In [42]:
out = out.merge(name_mapping, left_on='Prediction',right_on='Cost Code')
out.drop(['Cost Code'], axis = 1, inplace=True)
out.rename(index=str, columns={'Description':'Predicted Desc.'}, inplace=True)
out.head()

Unnamed: 0,Text,Prediction,Actual,Actual Desc.,Predicted Desc.
0,Danger,01-52-16,01-52-17,safety equipment & supplies,first aid supplies
1,92.600 LRG,01-52-16,01-52-17,safety equipment & supplies,first aid supplies
2,Strips Bandage,01-52-16,01-52-17,safety equipment & supplies,first aid supplies
3,danger taper red,01-52-16,01-52-17,safety equipment & supplies,first aid supplies
4,batteries AAA (box of24),01-52-16,01-52-17,safety equipment & supplies,first aid supplies


In [43]:
cols = ['Text', 'Prediction','Predicted Desc.', 'Actual', 'Actual Desc.']
out = out[cols]

In [45]:
out.tail(50)

Unnamed: 0,Text,Prediction,Predicted Desc.,Actual,Actual Desc.
5635,INV#321539 - materials for layout of walls and...,06-10-60,framing back framing,06-10-60,framing back framing
5636,chicken wire,10-51-16,wood lockers,01-56-30,temporary walkways supporting trailers
5637,"CHICKEN WIRE 1""X36""X25'",10-51-16,wood lockers,10-51-16,wood lockers
5638,SCAFFOLDING QUOTE# 3141-REV1,32-93-50,soft landscaping,01-56-30,temporary walkways supporting trailers
5639,Mini excavator for storm pipe trenching and gr...,03-21-30,concrete dowels,03-31-46,concrete material - sog
5640,30 x 42 mirror,08-83-13,mirrors,10-28-16,toilet & bath accessories
5641,02-Apr - Deficiencies on L31 L32,01-30-51,deficiency technician,01-30-51,deficiency technician
5642,Feb 21 2019 - Deficiencies - L16 17 20,01-30-51,deficiency technician,01-30-51,deficiency technician
5643,Feb 12 - Deficiencies - Prep L19,01-30-51,deficiency technician,01-30-51,deficiency technician
5644,19-Feb-2019 - Deficiencies - L19/L20,01-30-51,deficiency technician,01-30-51,deficiency technician
