In [None]:
import pandas as pd
from pathlib import Path

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from keras import models
from keras import layers
from keras.utils.np_utils import to_categorical


In [None]:
# Read the labelled training rows and the unlabelled test rows from the input folder.
train_df, test_df = (
    pd.read_csv('../input/train.csv'),
    pd.read_csv('../input/test.csv'),
)


In [None]:
# Class balance: number of training rows per author label.
train_df.author.value_counts()


In [None]:
# Store the character count of every training text in a new column.
train_df['text_length'] = train_df.text.str.len()


In [None]:
# Histogram of per-row character counts in the training set.
# The column is named explicitly: a bare `train_df.hist()` plots every numeric
# column, so re-running this cell after `author_num` has been added would
# silently change the figure.
train_df.hist(column='text_length')
plt.show()


In [None]:
# Store the character count of every test text in a new column.
test_df['text_length'] = test_df.text.str.len()


In [None]:
# Histogram of per-row character counts in the test set, for comparison with
# the training distribution. The column is named explicitly so the plot does
# not depend on which other numeric columns exist when the cell is run.
test_df.hist(column='text_length')
plt.show()


In [None]:
# Encode the author codes as integer class ids: EAP -> 0, HPL -> 1, MWS -> 2.
train_df['author_num'] = train_df['author'].map(dict(EAP=0, HPL=1, MWS=2))
# Preview the first five rows to confirm the new column matches `author`.
train_df.head()


In [None]:
# Keep the untouched text under `original_text` and work with a copy capped at
# 700 characters in `text`.
# The rename is guarded so the cell is safe to re-run: renaming unconditionally
# a second time would rename the truncated `text` column to `original_text` as
# well, leaving two columns with that name and breaking the `.str` access below.
if 'original_text' not in train_df.columns:
    train_df = train_df.rename(columns={'text': 'original_text'})
train_df['text'] = train_df['original_text'].str[:700]
# Refresh the length column so it describes the truncated text.
train_df['text_length'] = train_df['text'].str.len()


In [None]:
# Apply the same 700-character cap to the test set, keeping the untouched text
# under `original_text`.
# The rename is guarded so the cell is safe to re-run: renaming unconditionally
# a second time would rename the truncated `text` column to `original_text` as
# well, leaving two columns with that name and breaking the `.str` access below.
if 'original_text' not in test_df.columns:
    test_df = test_df.rename(columns={'text': 'original_text'})
test_df['text'] = test_df['original_text'].str[:700]
# Refresh the length column so it describes the truncated text.
test_df['text_length'] = test_df['text'].str.len()


In [None]:
# Model input is the (truncated) text; the target is the integer author id.
X, y = train_df['text'], train_df['author_num']


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)
# Shapes of the four pieces, in the order train X, train y, test X, test y.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))


In [None]:
# Per-class row counts in the training and held-out portions of the split.
train_class_counts = y_train.value_counts()
holdout_class_counts = y_test.value_counts()
print(train_class_counts, '\n', holdout_class_counts)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words vectorizer that keeps the original casing and treats word tokens
# plus the punctuation marks , . ; : as features.
# Variants tried earlier: the default CountVectorizer, a case-preserving
# word-only token pattern, and a wider punctuation set that also kept ? ! and '.
vect = CountVectorizer(
    lowercase=False,
    token_pattern=r'(?u)\b\w+\b|\,|\.|\;|\:',
)
vect


In [None]:
# Fit the vocabulary on the training texts and build their document-term
# matrix, converted from sparse to a dense array in the same step.
X_train_dtm = vect.fit_transform(X_train).toarray()
# Display the resulting matrix.
X_train_dtm


In [None]:
# One-hot encode the integer author ids for the softmax output layer.
# `num_classes` is fixed to the three authors: when it is omitted the width is
# inferred from the largest label present, so a split that happens to miss the
# highest class would yield a narrower matrix that no longer matches the
# 3-unit output layer (or the other split).
onehot_y_train = to_categorical(y_train, num_classes=3)
onehot_y_test = to_categorical(y_test, num_classes=3)


In [None]:
# Encode the held-out texts with the vocabulary already fitted on the training
# texts, converting the sparse result to a dense array in the same step.
X_test_dtm = vect.transform(X_test).toarray()
# Display the resulting matrix.
X_test_dtm


In [None]:
# Feature-matrix and label-matrix shapes: training split first, then hold-out.
for features, targets in ((X_train_dtm, onehot_y_train), (X_test_dtm, onehot_y_test)):
    print(features.shape, targets.shape)


In [None]:
# Feed-forward classifier: three ReLU hidden layers (32, 16, 16 units) and a
# 3-way softmax output, one unit per author.
# The input width is read from the training matrix rather than hard-coded, so
# the model still builds when the vocabulary size changes (different data,
# tokenizer pattern, or truncation length).
model = models.Sequential()
model.add(layers.Dense(32, activation='relu', input_shape=(X_train_dtm.shape[1],)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))


In [None]:
# Print the layer-by-layer summary of the network defined above.
model.summary()


In [None]:
# Configure training: RMSprop optimizer, categorical cross-entropy loss on the
# one-hot labels, and accuracy reported as the tracked metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)


In [None]:
# Train for 20 epochs in batches of 512, scoring the held-out split after each
# epoch; the returned history feeds the loss/accuracy plots.
history = model.fit(
    X_train_dtm,
    onehot_y_train,
    batch_size=512,
    epochs=20,
    validation_data=(X_test_dtm, onehot_y_test),
)


In [None]:
# Per-epoch training and validation loss recorded by the fit above.
loss, val_loss = (history.history[key] for key in ('loss', 'val_loss'))
epochs = range(1, 1 + len(loss))

# Blue dots for the training curve, a solid blue line for the validation curve.
for series, marker, legend_label in (
    (loss, 'bo', 'Training Loss'),
    (val_loss, 'b', 'Validation Loss'),
):
    plt.plot(epochs, series, marker, label=legend_label)

plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()


In [None]:
# Start from an empty figure so nothing from the previous plot carries over.
plt.clf()
# The history key for the accuracy metric depends on the Keras version: older
# releases record it as 'acc', newer ones as 'accuracy'. Use whichever is
# present instead of failing with a KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
acc = history.history[acc_key]
val_acc = history.history['val_' + acc_key]
epochs = range(1, len(acc)+1)
# Blue dots for the training curve, a solid blue line for the validation curve.
plt.plot(epochs, acc, 'bo', label='Training Accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


In [None]:
# Rebuild the same network from scratch and train it for only 5 epochs instead
# of 20, still scoring the held-out split after each epoch.
# The input width is read from the training matrix rather than hard-coded, so
# the model still builds when the vocabulary size changes.
model = models.Sequential()
model.add(layers.Dense(32, activation='relu', input_shape=(X_train_dtm.shape[1],)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))

model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
             metrics=['accuracy'])

model.fit(X_train_dtm, onehot_y_train, epochs=5, batch_size=512,
          validation_data=(X_test_dtm, onehot_y_test))


In [None]:
# Score the 5-epoch model on the held-out split; with the compile settings
# used above this yields the loss followed by the accuracy.
results = model.evaluate(x=X_test_dtm, y=onehot_y_test)
print(results)


In [None]:
# Refit the vocabulary on the entire labelled set (train + hold-out) and build
# its document-term matrix, converted from sparse to dense in the same step.
X_dtm = vect.fit_transform(X).toarray()
# Display the matrix built from the full training data X.
X_dtm


In [None]:
# One-hot encode the labels of the full training set.
# `num_classes` is fixed to the three authors so the width always matches the
# 3-unit softmax output instead of being inferred from the largest label seen.
onehot_y = to_categorical(y, num_classes=3)

# Feature matrix and label matrix should have the same number of rows.
print(X_dtm.shape, onehot_y.shape)


In [None]:
# Train the DNN model on the entire training set using X_dtm and onehot_y.
# The input width is read from X_dtm rather than hard-coded: refitting the
# vectorizer on all rows changes the vocabulary size, and a stale constant
# would make the first Dense layer reject the input.

model = models.Sequential()
model.add(layers.Dense(32, activation='relu', input_shape=(X_dtm.shape[1],)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))

model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
             metrics=['accuracy'])

# No validation data here: every labelled row is used for fitting.
model.fit(X_dtm, onehot_y, epochs=5, batch_size=512)


In [None]:
# Training-set score of the final model (loss followed by accuracy). This is
# measured on the same rows the model was fitted on, so it is not an estimate
# of performance on unseen text.
results = model.evaluate(x=X_dtm, y=onehot_y)
print(results)


In [None]:
# Texts of the unlabelled competition test set.
test = test_df['text']
# Encode them with the vocabulary fitted on the full training data, converting
# the sparse result to a dense array in the same step.
test_dtm = vect.transform(test).toarray()
# Display the resulting matrix.
test_dtm


In [None]:
# Shape of the test document-term matrix (rows, vocabulary size).
print(test_dtm.shape)


In [None]:
# Run the final model on the test matrix; each output row comes from the
# 3-unit softmax layer, i.e. one probability per author class.
dnn_predictions = model.predict(x=test_dtm)
print(dnn_predictions.shape)


In [None]:
# Show the first ten prediction rows as a quick sanity check.
print(dnn_predictions[:10])


In [None]:
# Assemble the submission table. The probability columns are named in the same
# order as the integer encoding used for training (EAP=0, HPL=1, MWS=2).
result = pd.DataFrame(data=dnn_predictions, columns=['EAP','HPL','MWS'])
# Put the test-row identifiers in the first column.
result.insert(loc=0, column='id', value=test_df['id'])
result.head()


In [None]:
# Write the submission CSV without the index column, formatting every
# probability with 20 decimal places.
result.to_csv(
    'rhodium_submission_17.csv',
    float_format='%.20f',
    index=False,
)
