In [None]:
#CSCE723 Final Project Code

#Author: Joshua White
#Sources: 
#https://realpython.com/python-keras-text-classification/

In [None]:
# Imports:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras import layers
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [None]:
# Input locations: CSV files for the training split and the test split.
train_filepath, test_filepath = "training_set_0.csv", "test_set_0.csv"

In [None]:
# Read both CSV files into pandas DataFrames (training split first, then test).
df_train, df_test = pd.read_csv(train_filepath), pd.read_csv(test_filepath)


In [None]:
# Peek at the first training row to check which columns were loaded.
first_train_row = df_train.iloc[0]
print(first_train_row)

In [None]:
# Pull the text and label columns out of each DataFrame via .values, which
#    hands back the column's underlying array instead of a pandas Series.
# Grouped by column: documents for both splits first, then the categories.
documents_train, documents_test = (
    df_train['processed_text'].values,
    df_test['processed_text'].values,
)
categories_train, categories_test = (
    df_train['category'].values,
    df_test['category'].values,
)

In [None]:
# Bag-of-words features: learn the vocabulary from the training documents
#    only, then encode both splits as sparse count matrices that share it.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(documents_train)
X_test = vectorizer.transform(documents_test)
X_train

In [None]:
# Baseline model: fit a logistic regression on the count features and report
#    its accuracy on the test split.
# max_iter is raised to 500 (the default is 100) because the fit was not
#    converging with the default.
classifier = LogisticRegression(max_iter=500).fit(X_train, categories_train)
score = classifier.score(X_test, categories_test)

print("Accuracy:", score)

In [None]:
# The baseline above consumed the raw category labels directly. The keras
#    Sequential model is trained with categorical_crossentropy, so its targets
#    must be one-hot vectors. Encode the labels as integer indices first, then
#    expand those indices into one-hot rows.
# Source:
# https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/
encoder = LabelEncoder()
encoder.fit(categories_train)
encoded_train = encoder.transform(categories_train)
encoded_test = encoder.transform(categories_test)

# Pin the one-hot width to the number of classes seen in training. Without
#    num_classes, to_categorical sizes each array from its own largest label,
#    so a test split that happens to lack the highest class would come out
#    narrower than the training targets.
num_classes = len(encoder.classes_)
categorical_train = np_utils.to_categorical(encoded_train, num_classes=num_classes)
categorical_test = np_utils.to_categorical(encoded_test, num_classes=num_classes)

In [None]:
# Display the one-hot encoded training labels built by to_categorical above.
categorical_train

In [None]:
# Build a small feed-forward classifier over the bag-of-words features.
#
# Each document is one flat count vector, so a batch is 2-D: (samples, features).
#    Conv1D expects 3-D input (samples, steps, channels) and cannot consume these
#    vectors directly, so the Dense layer -- the one that declares input_dim --
#    is the input layer here.
input_dim = X_train.shape[1]               # number of features per document
num_classes = categorical_train.shape[1]   # width of the one-hot label vectors

model = Sequential()
model.add(layers.Dense(36, input_dim=input_dim, activation='relu'))
# One softmax unit per class; sized from the targets rather than hard-coded so
#    the output layer always matches the labels it is trained against.
model.add(layers.Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train the network. The test split is passed as validation_data, so the
#    per-epoch validation metrics in `history` are computed on the same data
#    that produces the final testing accuracy below.
history = model.fit(
    X_train,
    categorical_train,
    batch_size=10,
    epochs=20,
    validation_data=(X_test, categorical_test),
    verbose=False,
)

# The model was compiled with a single 'accuracy' metric, so evaluate()
#    yields the loss followed by the accuracy.
loss, accuracy = model.evaluate(X_train, categorical_train, verbose=False)
print("Training Accuracy: {:.6f}".format(accuracy))

loss, accuracy = model.evaluate(X_test, categorical_test)
print("Testing Accuracy:  {:.6f}".format(accuracy))

In [None]:
# Helper for graphing accuracy and loss per epoch from the history object
#    that keras' fit() method returns.

plt.style.use('ggplot')

def plot_history(history):
    """Plot per-epoch accuracy and loss curves side by side.

    Draws one figure with two panels: accuracy on the left, loss on the
    right. Training curves are blue; validation curves are red and are drawn
    only when the matching ``val_*`` entry exists, so a history from a fit
    without validation data can still be plotted.

    Args:
        history: Object whose ``history`` attribute is a dict mapping metric
            names to per-epoch value sequences. Must contain ``'accuracy'``
            and ``'loss'``; ``'val_accuracy'`` and ``'val_loss'`` are optional.

    Raises:
        KeyError: If ``'accuracy'`` or ``'loss'`` is missing.
    """
    metrics = history.history
    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(metrics['accuracy']) + 1)

    _, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))
    panels = (
        (acc_ax, 'accuracy', 'acc', 'Accuracy'),
        (loss_ax, 'loss', 'loss', 'Loss'),
    )
    for ax, key, short_name, y_label in panels:
        ax.plot(epochs, metrics[key], 'b', label='Training ' + short_name)
        val_key = 'val_' + key
        if val_key in metrics:
            ax.plot(epochs, metrics[val_key], 'r', label='Validation ' + short_name)
        ax.set_title('Training and validation ' + key)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()

In [None]:
# List the metric names recorded during training; plot_history looks these up.
recorded_metrics = history.history.keys()
print(recorded_metrics)

In [None]:
# Draw the accuracy and loss curves for the training run above.
plot_history(history=history)