In [None]:
!pip install \
    numpy \
    pandas \
    sklearn \
    matplotlib \
    openpyxl \
    tensorflow \
    tensorflow_hub \
    nltk

In [None]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_hub as hub

In [None]:
# load the purchase order records from the Excel workbook into a DataFrame
raw_data = pd.read_excel(io='purchase_orders_final.xlsx')

In [None]:
# keep only the first row seen for each contract name:
# the same contract appearing in both training and validation data
# would leak information and lead to inflated accuracy scores
raw_data = raw_data[
    ~raw_data.duplicated(subset=['CONTRACT_NAME'])
]

In [None]:
data = pd.DataFrame()

# create a single combined feature based on the three fields with most obvious predictive power
# missing cells are replaced by '' before the string cast: a bare astype(str) would turn
# NaN into the literal word 'nan', which would then end up in the feature text
data['feature'] = raw_data['CONTRACT_NAME'].fillna('').astype(str) + ' ' + \
                  raw_data['EXTENDED_DESCRIPTION'].fillna('').astype(str) + ' ' + \
                  raw_data['VENDOR_NAME'].fillna('').astype(str)
# ignoring other fields for now because a simpler model is more easily trained, retrained, and explained
# and there is no point making a more complicated model when a simpler one will be sufficient

In [None]:
# normalise the combined feature text
# << TODO: investigate relevance of digits or punctuation for further iterations >>

# the steps run in this order:
# - runs of digits become a single space
# - runs of characters that are neither word characters nor whitespace become a single space
# - runs of whitespace collapse to a single space
# - everything is converted to lowercase
data['feature'] = (
    data['feature']
    .str.replace(r'\d+', ' ', regex=True)
    .str.replace(r'[^\w\s]+', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.lower()
)

print(data['feature'])

In [None]:
# remove english stop words which will contain little predictive power
# and will distract from the more important vocabulary within the feature

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# keep the stop words in a set: every token of every row is tested for membership,
# and a set lookup avoids scanning the whole word list each time
stop = set(stopwords.words('english'))

def remove_stopwords(sentence, stop_words=None):
    """Return `sentence` with every stop word removed.

    The sentence is split on single spaces and the remaining tokens are joined
    back together with single spaces. Empty tokens (from leading, trailing or
    doubled spaces) are not treated specially.

    Args:
        sentence: the text to filter.
        stop_words: container of words to drop. When omitted, the
            notebook-level `stop` collection is used.

    Returns:
        The filtered text as a single string.
    """
    excluded = stop if stop_words is None else stop_words
    return " ".join(word
                    for word in sentence.split(" ")
                    if word not in excluded)

# strip the stop words from every row of the combined feature
data['feature'] = data['feature'].map(remove_stopwords)

print(data['feature'])

In [None]:
# list every distinct (code, description) pair to see how many labels there are
codes = raw_data.loc[:, ['COMMODITY_CODE', 'COMMODITY_DESCRIPTION']].drop_duplicates()
print(codes)

In [None]:
# the commodity codes are simply IDs, so for the purpose of simpler model training
# encode them as one indicator column per code (one hot encoding)
onehot = pd.get_dummies(data=raw_data['COMMODITY_CODE'])

print(onehot)

In [None]:
# put the text feature and its one hot label columns side by side in one dataframe
# (the join matches rows on the index)
data = data[['feature']].join(onehot)

print(data)

In [None]:
# split data into train and test samples
# a fixed random_state makes the shuffle, and therefore the split, repeatable between runs
train, test = train_test_split(data, test_size=0.2, random_state=42)


# split data into input and label
# the label columns are selected by commodity code, in the order the codes appear in `codes`
labels = codes['COMMODITY_CODE'].tolist()

train_input = train['feature']
train_labels = train[labels]

test_input = test['feature']
test_labels = test[labels]

In [None]:
# check whether a GPU is available to TensorFlow
gpu_devices = tf.config.list_physical_devices('GPU')
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"
print("GPU is", gpu_status)

In [None]:
# download text embedding layer from TensorFlow hub
# << TODO: investigate performance using other available embedding layers >>
# << TODO: investigate extending embedding to perform better with less specific vocabulary in data >>
# the URL gets its own name: `model` is used further down for the Keras model,
# so storing the URL under that name would overwrite the model if this cell were re-run
embedding_url = "https://tfhub.dev/google/nnlm-en-dim50/2"
hub_layer = hub.KerasLayer(embedding_url, input_shape=[], dtype=tf.string, trainable=True)

# output two sample feature embeddings
print(hub_layer(train_input[:2]))

In [None]:
# build simple two layer sequential Keras model using
# - text embedding layer
# - simple dense layer outputting to a node for each commodity code
#
# the dense layer has no activation on purpose: the loss is configured with
# from_logits=True and therefore expects raw, unbounded scores. A relu here
# would clip every negative logit to zero before the loss sees it.
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(len(labels)))

model.summary()

In [None]:
# compile model with choice of standard optimizer and loss functions
# << TODO: investigate performance using different optimizer and loss functions >>

# the labels are one hot (at most one commodity code per row), so accuracy is measured as
# "does the highest-scoring output match the labelled code". An element-wise binary accuracy
# would also count every correctly rejected code, so it stays high even when the chosen
# code is wrong. The metric keeps the name 'accuracy' so the history keys are unchanged.
model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.metrics.CategoricalAccuracy(name='accuracy')])

In [None]:
EPOCHS = 20

# train the model
# fit the model to the training data using the Keras method .fit()
# validation_split is the fraction of the training data held back for validation
# (i.e. NOT trained on), so 0.2 trains on 80% and validates on the remaining 20%
history = model.fit(train_input,
                    train_labels,
                    epochs=EPOCHS,
                    batch_size=256,
                    shuffle=True,
                    validation_split=0.2,
                    verbose=1)

In [None]:
# score the trained model on the held-out test data

test_loss, test_accuracy = model.evaluate(x=test_input, y=test_labels)

for caption, value in (("Test Loss: ", test_loss), ("Test Accuracy: ", test_accuracy)):
    print(caption, value)

In [None]:
# look at the model output for one hand-picked test example

eg = 10

example = test_input.iloc[eg]
print("Example input feature (combined contract name, description and vendor: \n", example)

prediction = model.predict([example])

# one model output per commodity code, listed in the same code order as the label columns
print("\nPrediction: ")
scores = prediction[0]
for code, score in zip(codes["COMMODITY_CODE"].tolist(), scores):
    print(f"{code:8}", f"{score:.4f}")

# the one hot label row of the same example, for comparison
print("\nAnswer: ")
print(test_labels.iloc[eg])

In [None]:
# pull the per-epoch training and validation curves out of the model history object

history_dict = history.history

acc, val_acc, loss, val_loss = (
    history_dict[key]
    for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)

# x axis values for the plots: one entry per training epoch
epochs = range(EPOCHS)

In [None]:
# plot loss against epochs of training
# (training values as red dots, validation values as a red line)

fig, ax = plt.subplots()
ax.plot(epochs, loss, 'ro', label='Training loss')
ax.plot(epochs, val_loss, 'r', label='Validation loss')
ax.set_title('Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()

plt.show()

In [None]:
# plot model accuracy against epochs of training
# (training values as blue dots, validation values as a blue line)

fig, ax = plt.subplots()
ax.plot(epochs, acc, 'bo', label='Training accuracy')
ax.plot(epochs, val_acc, 'b', label='Validation accuracy')
ax.set_title('Accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()

plt.show()