In [1]:
# CSCE723 Final Project Code

# Author: Joshua White
# Sources: 
# https://realpython.com/python-keras-text-classification/

In [2]:
# Imports:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras import layers
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Variable setup:
train_filepath = "training_set_0.csv"
test_filepath = "test_set_0.csv"

In [4]:
# Load the pandas dataframes
df_train = pd.read_csv(train_filepath)
df_test = pd.read_csv(test_filepath)


In [5]:
print(df_train.iloc[0])

job_id                                                       401521
business_title                           Human Resources Generalist
category                                                          1
processed_text    human resource generalist deputy director huma...
Name: 0, dtype: object


In [6]:
# The .values takes a df series and turns it into a numpy array. 
documents_train = df_train['processed_text'].values
categories_train = df_train['category'].values

documents_test = df_test['processed_text'].values
categories_test = df_test['category'].values

In [7]:
# Now that we have the baseline model above we need to modify the categories_train & categories_test
#    to work with the keras Sequential model. Right now each entry is just the integer of the class 
#    when it should be a one hot vector. So lets encode the output variable to make it work with keras. 
# Source:
# https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/
encoder = LabelEncoder()
encoder.fit(categories_train)
encoded_train = encoder.transform(categories_train)
encoded_test = encoder.transform(categories_test)
categorical_train = np_utils.to_categorical(encoded_train)
categorical_test = np_utils.to_categorical(encoded_test)

In [8]:
# We can use this function to graph the accuracy and loss for the training
#    and test data based on the history callback (which is the output of 
#    the fit() method of keras).

plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

In [9]:
# First we created the neural networks using sparse matrix's of the vocabulary. Now lets use 
#    word embeddings in the neural net. The index 0 is reserved and is not assigned to any
#    word. 

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(documents_train)

X_train = tokenizer.texts_to_sequences(documents_train)
X_test = tokenizer.texts_to_sequences(documents_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

# Can print an example here
#print(documents_train[2])
#print(X_train[2])

In [10]:
# Now because the lengths of our new X_train is all different we need some way to normalize them.
#    One way to solve this issue is to pad the vectors smaller than some max length with a zero. 

maxlen_train = len(max(documents_train).split(' '))
maxlen_test = len(max(documents_test).split(' '))
maxlen = max(maxlen_train, maxlen_test)

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [11]:
# Now that we have checked out a few different types of neural networks let apply
#    a grid search to optimize the hyperparameters for a few different types of
#    models. First lets create a few different functions to create models. 

def create_model_conv_pool(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
    model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    #model.add(layers.Flatten()) # lets try to see if we can flatten instead of pool. 
    model.add(layers.Dense(36, activation='relu'))
    #model.add(layers.Dense(24, activation='sigmoid'))
    model.add(layers.Dense(12, activation='softmax'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [12]:
# Main settings:
epochs = 20
embedding_dim = 100
output_file = 'data/output_conv_pool_1.txt'

In [13]:
# Parameter grid for grid search
param_grid = dict(num_filters=[64, 128, 256],
                  kernel_size=[3, 5, 7, 10],
                  vocab_size=[vocab_size],
                  embedding_dim=[embedding_dim],
                  maxlen=[maxlen])

model = KerasClassifier(build_fn=create_model_conv_pool,
                        epochs=epochs, batch_size=10,
                        verbose=False)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=2)
grid_result = grid.fit(X_train, categorical_train)

# Evaluate testing set
test_accuracy = grid.score(X_test, categorical_test)






In [14]:
s = ('Best Accuracy : {:.6f}\n{}\nTest Accuracy : {:.6f}\n')
output_string = s.format(grid_result.best_score_,grid_result.best_params_,test_accuracy)
print(output_string)

Best Accuracy : 0.236024
{'embedding_dim': 100, 'kernel_size': 10, 'maxlen': 238, 'num_filters': 256, 'vocab_size': 5490}
Test Accuracy : 0.697548

