In [None]:
# CSCE723 Final Project Code

# Author: Joshua White
# Sources: 
# https://realpython.com/python-keras-text-classification/

In [None]:
# Imports:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras import layers
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [None]:
# Input files: the CSV holding the training split and the one holding the test split.
train_filepath, test_filepath = "training_set_0.csv", "test_set_0.csv"

In [None]:
# Read both CSV splits into pandas DataFrames (training first, then test).
df_train, df_test = (pd.read_csv(path) for path in (train_filepath, test_filepath))


In [None]:
# Show the first row of the training frame as a quick sanity check of the load.
print(df_train.iloc[0])

In [None]:
# Pull the text and label columns out of each frame. `.values` converts a
#    DataFrame column (a Series) into a numpy array.
documents_train, categories_train = (
    df_train[column].values for column in ('processed_text', 'category')
)
documents_test, categories_test = (
    df_test[column].values for column in ('processed_text', 'category')
)

In [None]:
# Turn the documents into sparse document-term count matrices. The vocabulary is
#    learned from the training documents only; fit_transform() learns it and encodes
#    the training set in a single pass instead of tokenizing that corpus twice.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(documents_train)

# The test documents are encoded with the vocabulary learned above.
X_test = vectorizer.transform(documents_test)
X_train

In [None]:
# Baseline model: fit a logistic regression on the count features and report its
#    score on the test split.
# Note: max_iter is raised to 500 because the fit was not converging with the
#    default of 100.
classifier = LogisticRegression(max_iter=500).fit(X_train, categories_train)
score = classifier.score(X_test, categories_test)

print("Accuracy:", score)

In [None]:
# The baseline above consumed the raw category labels directly. The Keras models
#    below are trained with categorical cross-entropy, which needs one-hot targets,
#    so map every label to an integer id and then to a one-hot row.
# Source:
# https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/
encoder = LabelEncoder()
encoded_train = encoder.fit_transform(categories_train)
encoded_test = encoder.transform(categories_test)

# Pin the one-hot width to the number of classes the encoder learned. Without it,
#    to_categorical() sizes each matrix from the largest id present in that array,
#    so a test split that happens to lack the highest class would come out narrower
#    than the training targets.
num_classes = len(encoder.classes_)
categorical_train = np_utils.to_categorical(encoded_train, num_classes=num_classes)
categorical_test = np_utils.to_categorical(encoded_test, num_classes=num_classes)

In [None]:
# Display the one-hot encoded training targets to check the encoding.
categorical_train

In [None]:
# First network: a single hidden ReLU layer on the bag-of-words features,
#    followed by a softmax over the classes.
input_dim = X_train.shape[1]  # one input per vocabulary term
# Take the output width from the one-hot targets instead of hard-coding 12, so the
#    model keeps matching the labels if the set of classes changes.
num_classes = categorical_train.shape[1]

model = Sequential()
model.add(layers.Dense(48, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(num_classes, activation='softmax'))

In [None]:
# Configure training (Adam optimizer, categorical cross-entropy loss, accuracy as
#    the reported metric) and print the layer summary.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 7 epochs in mini-batches of 10. The test split is passed as validation
#    data so the returned history also records per-epoch validation metrics.
history = model.fit(
    x=X_train,
    y=categorical_train,
    batch_size=10,
    epochs=7,
    validation_data=(X_test, categorical_test),
    verbose=False,
)

In [None]:
# Report the final accuracy on each split. Both evaluate() calls are silenced so
#    that only the two summary lines are printed, as in the later evaluation cells.
loss, accuracy = model.evaluate(X_train, categorical_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, categorical_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
# We can use this function to graph the accuracy and loss for the training
#    and validation data based on the history object (which is the output of
#    the fit() method of keras).

plt.style.use('ggplot')

def plot_history(history):
    """Plot per-epoch accuracy and loss curves of one Keras training run.

    Draws a figure with two side-by-side panels: accuracy on the left and
    loss on the right, training in blue and validation in red.

    Args:
        history: The object returned by ``model.fit()``. Its ``history`` dict
            must contain ``'accuracy'`` and ``'loss'``. The ``'val_accuracy'``
            and ``'val_loss'`` curves are drawn only when those keys are
            present, so a run fitted without validation data can be plotted.
    """
    metrics = history.history
    epochs = range(1, len(metrics['accuracy']) + 1)

    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))
    panels = (
        (ax_acc, 'accuracy', 'acc', 'Training and validation accuracy', 'Accuracy'),
        (ax_loss, 'loss', 'loss', 'Training and validation loss', 'Loss'),
    )
    for ax, key, short_name, title, ylabel in panels:
        ax.plot(epochs, metrics[key], 'b', label='Training ' + short_name)
        val_key = 'val_' + key
        if val_key in metrics:
            ax.plot(epochs, metrics[val_key], 'r', label='Validation ' + short_name)
        ax.set_title(title)
        # Label the axes so each panel can be read on its own.
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.legend()

In [None]:
# List the metric names recorded by fit(); plot_history() looks up
#    'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
print(history.history.keys())

In [None]:
# Plot the accuracy and loss curves of the most recent training run.
plot_history(history)

In [None]:
# The networks so far consumed sparse bag-of-words matrices built from the
#    vocabulary. From here on we use word embeddings, which need every document
#    as a sequence of integer word ids. The index 0 is reserved and is not
#    assigned to any word.

# More imports:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(documents_train)

# Encode both splits with the word index learned from the training documents.
X_train, X_test = (tokenizer.texts_to_sequences(documents)
                   for documents in (documents_train, documents_test))

# One slot per word in the tokenizer's index, plus one for the reserved 0 index.
vocab_size = 1 + len(tokenizer.word_index)

# To inspect an example, compare a raw document with its encoded sequence:
#print(documents_train[2])
#print(X_train[2])

In [None]:
# The X_train created by the tokenizer is different from the one created by CountVectorizer.
#    CountVectorizer produces one vector per document whose length is the size of the entire
#    vocabulary, whereas the tokenizer produces one sequence per document whose length follows
#    the length of that text. The numbers in each sequence are the word indices from the
#    dictionary tokenizer.word_index.

In [None]:
# The tokenized documents all have different lengths, but the network needs
#    fixed-size input. So pad every sequence with trailing zeros (index 0 is not
#    assigned to any word) up to the length of the longest one.

from keras.preprocessing.sequence import pad_sequences

# Measure the tokenized sequences themselves. The built-in max() applied to an
#    array of strings returns the alphabetically last document, not the longest
#    one, so it cannot be used to find the maximum length.
maxlen_train = max(len(sequence) for sequence in X_train)
maxlen_test = max(len(sequence) for sequence in X_test)
maxlen = max(maxlen_train, maxlen_test)

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [None]:
# Up to here every document is only a padded list of integer word ids. The
#    Embedding layer of Keras maps each id to a dense vector whose values are
#    learned together with the rest of the network.

# This will be the size of the dense vector (the word embedding) that we are creating.
embedding_dim = 50
# Take the output width from the one-hot targets instead of hard-coding 12, so the
#    model keeps matching the labels if the set of classes changes.
num_classes = categorical_train.shape[1]

# Embedding -> flatten all positions into one vector -> hidden ReLU layer -> softmax.
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(48, activation='relu'))
model.add(layers.Dense(num_classes, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
# Fit for 30 epochs in mini-batches of 10, with the test split as validation data
#    so the history records per-epoch validation metrics.
history = model.fit(
    X_train,
    categorical_train,
    batch_size=10,
    epochs=30,
    validation_data=(X_test, categorical_test),
    verbose=False,
)

# Print the final accuracy on each split, then graph the learning curves.
for report_line, features, targets in (
    ("Training Accuracy: {:.4f}", X_train, categorical_train),
    ("Testing Accuracy:  {:.4f}", X_test, categorical_test),
):
    loss, accuracy = model.evaluate(features, targets, verbose=False)
    print(report_line.format(accuracy))
plot_history(history)

In [None]:
# Same embedding network as before, but with a global max-pooling layer in place
#    of Flatten between the embedding and the dense layers.
embedding_dim = 50
# Take the output width from the one-hot targets instead of hard-coding 12, so the
#    model keeps matching the labels if the set of classes changes.
num_classes = categorical_train.shape[1]

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           input_length=maxlen))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(48, activation='relu'))
model.add(layers.Dense(num_classes, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
# Fit the pooled model for 50 epochs (batch size 10), validating on the test split.
history = model.fit(
    x=X_train,
    y=categorical_train,
    epochs=50,
    batch_size=10,
    verbose=False,
    validation_data=(X_test, categorical_test),
)

# Final accuracy on the training split...
loss, accuracy = model.evaluate(x=X_train, y=categorical_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
# ...and on the test split, followed by the learning curves.
loss, accuracy = model.evaluate(x=X_test, y=categorical_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

In [None]:
# Now lets use precomputed word embeddings in our neural net. 
import numpy as np

# We will use this function to retrieve the embedding matrix for the words from our documents.
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix for ``word_index`` from a text file of word vectors.

    Every non-blank line of the file is read as a word followed by its
    whitespace-separated vector components.

    Args:
        filepath: Path to the UTF-8 encoded vector file.
        word_index: Mapping from word to its integer row index. Row 0 is left
            for the reserved 0 index.
        embedding_dim: Number of leading components of each vector to keep.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` array. Rows of words that
        do not occur in the file stay all-zero.

    Raises:
        ValueError: If the vector of a word in ``word_index`` has fewer than
            ``embedding_dim`` components, or a component is not a number.
    """
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            word, vector = fields[0], fields[1:]
            if word not in word_index:
                continue
            if len(vector) < embedding_dim:
                raise ValueError(
                    "vector for {!r} has {} components, expected at least {}".format(
                        word, len(vector), embedding_dim))
            # Slice before converting so only the kept components are parsed.
            embedding_matrix[word_index[word]] = np.array(
                vector[:embedding_dim], dtype=np.float32)

    return embedding_matrix

In [None]:
# Build the pretrained embedding matrix for the vocabulary of the tokenizer we
#    already set up.
# NOTE(review): the file name suggests 100-dimensional vectors while embedding_dim
#    is 50, so create_embedding_matrix() keeps only the first 50 components of each
#    vector. Confirm this truncation is intended (vs. a 50-d file or 100 dims).
embedding_dim = 50
embedding_matrix = create_embedding_matrix(
    filepath='glove.6B.100d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

In [None]:
# Lets see how many of the embedding vectors are nonzero, which is how well the
#    pretrained vocabulary covers our corpus vocabulary.
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
# Divide by the number of actual words. Row 0 belongs to the reserved 0 index, which
#    is never assigned a word, so counting it would understate the coverage.
nonzero_elements / len(tokenizer.word_index)

In [None]:
# Now we will use the pretrained word embeddings in the network. The embedding
#    layer is initialized from embedding_matrix and frozen (trainable=False), so
#    only the dense layers are trained.

# Take the vector width from the matrix so the layer cannot disagree with the
#    weights it is given.
embedding_dim = embedding_matrix.shape[1]
# Take the output width from the one-hot targets instead of hard-coding 12, so the
#    model keeps matching the labels if the set of classes changes.
num_classes = categorical_train.shape[1]

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim,
                           weights=[embedding_matrix],
                           input_length=maxlen,
                           trainable=False))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(36, activation='relu'))
model.add(layers.Dense(num_classes, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
# Fit the frozen-embedding model for 60 epochs in mini-batches of 10, with the
#    test split as validation data.
history = model.fit(
    X_train,
    categorical_train,
    batch_size=10,
    epochs=60,
    validation_data=(X_test, categorical_test),
    verbose=False,
)

# Print the final accuracy on each split, then graph the learning curves.
for report_line, features, targets in (
    ("Training Accuracy: {:.4f}", X_train, categorical_train),
    ("Testing Accuracy:  {:.4f}", X_test, categorical_test),
):
    loss, accuracy = model.evaluate(features, targets, verbose=False)
    print(report_line.format(accuracy))
plot_history(history)

In [None]:
# Pretrained embeddings again, but this time the embedding layer is left
#    trainable so its vectors are updated during training. The embedding output
#    is flattened and fed through two hidden layers.

# Take the vector width from the matrix so the layer cannot disagree with the
#    weights it is given.
embedding_dim = embedding_matrix.shape[1]
# Take the output width from the one-hot targets instead of hard-coding 12, so the
#    model keeps matching the labels if the set of classes changes.
num_classes = categorical_train.shape[1]

# Now lets set up the model layers:
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim,
                           weights=[embedding_matrix],
                           input_length=maxlen,
                           trainable=True))
model.add(layers.Flatten())
model.add(layers.Dense(38, activation='relu'))
model.add(layers.Dense(24, activation='sigmoid'))
model.add(layers.Dense(num_classes, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
# Fit the fine-tuned embedding model for 20 epochs (batch size 10), validating on
#    the test split.
history = model.fit(
    x=X_train,
    y=categorical_train,
    epochs=20,
    batch_size=10,
    verbose=False,
    validation_data=(X_test, categorical_test),
)

# Final accuracy on the training split...
loss, accuracy = model.evaluate(x=X_train, y=categorical_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
# ...and on the test split, followed by the learning curves.
loss, accuracy = model.evaluate(x=X_test, y=categorical_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

In [None]:
# Now try adding a convolutional layer to our neural net and see how
#    that affects the score. The embedding here is learned from scratch (no
#    pretrained weights are passed).

embedding_dim = 100
# Take the output width from the one-hot targets instead of hard-coding 12, so the
#    model keeps matching the labels if the set of classes changes.
num_classes = categorical_train.shape[1]

# Embedding -> 1D convolution (128 filters, window of 5) -> global max pooling
#    -> hidden ReLU layer -> softmax.
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(36, activation='relu'))
model.add(layers.Dense(num_classes, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
# Fit the convolutional model for 20 epochs in mini-batches of 10, with the test
#    split as validation data.
history = model.fit(
    X_train,
    categorical_train,
    batch_size=10,
    epochs=20,
    validation_data=(X_test, categorical_test),
    verbose=False,
)

# Print the final accuracy on each split (six decimals here), then graph the
#    learning curves.
for report_line, features, targets in (
    ("Training Accuracy: {:.6f}", X_train, categorical_train),
    ("Testing Accuracy:  {:.6f}", X_test, categorical_test),
):
    loss, accuracy = model.evaluate(features, targets, verbose=False)
    print(report_line.format(accuracy))
plot_history(history)

In [None]:
# Collect the per-epoch metrics of the last run in a DataFrame and show the best
#    validation accuracy reached in any epoch (Series.max() is vectorized and
#    skips NaN entries).
# NOTE: the validation data of that run is the test split, so this best-epoch
#    figure is an optimistic estimate of test performance.
hist_df = pd.DataFrame(history.history)
hist_df['val_accuracy'].max()