In [None]:
import pandas as pd
from pathlib import Path

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
# keras libraries
from keras import models
from keras import layers
from keras.utils.np_utils import to_categorical
# text libraries
import re
import nltk
from nltk.corpus import stopwords
from gensim.models import word2vec


In [None]:
# load the competition's training and test sets and report their (rows, columns)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)
print(train_df.shape, test_df.shape)


In [None]:
# preview the first rows of the training data
train_df.head()


In [None]:
# check the class distribution of the author label in train_df
# (number of rows per author, most frequent first)
train_df['author'].value_counts()


In [None]:
# compute the character length of each row's text and record it in a new column
# (measured here, before clean_text is applied to the column in a later cell)
train_df['text_length'] = train_df['text'].str.len()


In [None]:
# look at the histogram plot for text length
# DataFrame.hist() with no arguments draws one panel per numeric column, so the
# plot would change if this cell were re-run after other numeric columns (e.g.
# the one-hot author indicators) have been added; pin it to 'text_length'
train_df.hist(column='text_length')
plt.show()


In [None]:
# examine the text character length in test_df and record it in a new column
test_df['text_length'] = test_df['text'].str.len()
# plot only the length column explicitly, rather than whatever numeric columns
# the frame happens to contain when the cell is run
test_df.hist(column='text_length')
plt.show()


In [None]:
# convert author labels into one-hot encodings
train_df['author'] = pd.Categorical(train_df['author'])
# force an integer dtype: newer pandas releases return boolean indicator
# columns by default, while the labels are later summed and fed to the network
df_Dummies = pd.get_dummies(train_df['author'], prefix='author').astype('uint8')
# drop indicator columns left over from a previous run of this cell so that
# re-running it does not append duplicate author_* columns
train_df = pd.concat(
    [train_df.drop(df_Dummies.columns, axis=1, errors='ignore'), df_Dummies],
    axis=1)
# Check the conversion
train_df.head()


In [None]:
# helper to normalise text before vectorisation
def clean_text(text):
    """Lower-case ``text`` and replace every non-word character with a space.

    Word characters (letters, digits, underscore) are kept as they are; each
    other character, whitespace and punctuation included, becomes exactly one
    space. The result is still a single string: no splitting into words and no
    collapsing of repeated spaces is done here.

    Parameters
    ----------
    text : str
        Raw sentence to normalise.

    Returns
    -------
    str
        The normalised sentence.
    """
    text = text.lower()
    # raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on current Python versions
    text = re.sub(r'\W', ' ', text)
    return text


In [None]:
# normalise every training sentence in place with clean_text
train_df['text'] = train_df['text'].map(clean_text)


In [None]:
# normalise every test sentence in place with clean_text
test_df['text'] = test_df['text'].map(clean_text)


In [None]:
# features: the cleaned sentences
X = train_df['text']
# labels: the one-hot author indicator columns, in the fixed order EAP, HPL, MWS
# (the submission cell labels the prediction columns in this same order)
y = train_df[['author_EAP', 'author_HPL', 'author_MWS']]


In [None]:
from sklearn.model_selection import train_test_split
# hold out 20% of the labelled rows as a dev set; the fixed random_state keeps
# the split the same from run to run
X_train, X_dev, y_train, y_dev = train_test_split(
    X, y,
    test_size=0.2,
    random_state=123,
)
print(*(part.shape for part in (X_train, y_train, X_dev, y_dev)))


In [None]:
# examine the class distribution in y_train and y_dev
# (column sums of the one-hot labels = number of rows per author)
print(y_train.sum(axis=0), y_dev.sum(axis=0), sep=' \n ')


In [None]:
# import and instantiate CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): TfidfVectorizer is imported but not used in this cell
from sklearn.feature_extraction.text import TfidfVectorizer
# bag-of-words counts with scikit-learn's default settings
vect = CountVectorizer()
# Alternative tokenisations that were tried (kept for reference, not active).
# NOTE(review): clean_text already lower-cases the text and replaces every
# non-word character with a space, so the punctuation alternatives below would
# presumably never match on the cleaned text -- verify before enabling.
# vect = CountVectorizer(lowercase=False, token_pattern=r'(?u)\b\w+\b')
# vect = CountVectorizer(lowercase=False, token_pattern=r'(?u)\b\w+\b|\,|\.|\;|\:')
# vect = CountVectorizer(lowercase=False, token_pattern=r'(?u)\b\w+\b|\,|\.|\?|\;|\:|\!|\'')
# display the configured vectorizer
vect


In [None]:
# learn the vocabulary from the training split, build its document-term matrix
# and convert the sparse result straight to a dense array
X_train_dtm = vect.fit_transform(X_train).toarray()
# examine the document-term matrix created from X_train
X_train_dtm


In [None]:
# (number of training documents, vocabulary size)
print(X_train_dtm.shape)


In [None]:
# encode the dev split with the vocabulary fitted on X_train (no re-fitting)
# and convert the sparse document-term matrix to a dense array
X_dev_dtm = vect.transform(X_dev).toarray()
# examine the document-term matrix created from X_dev
X_dev_dtm


In [None]:
# sanity check: each feature matrix should have as many rows as its label frame
print(X_train_dtm.shape, y_train.shape)
print(X_dev_dtm.shape, y_dev.shape)


In [None]:
# network dimensions: one input per document-term-matrix column and one output
# per label column
num_input_shape, num_class = X_train_dtm.shape[1], y_train.shape[1]


In [None]:
# fully connected classifier: three 64-unit ReLU layers followed by a softmax
# output with one unit per class
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(num_input_shape,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(num_class, activation='softmax'),
])


In [None]:
# print the layer-by-layer architecture and parameter counts
model.summary()


In [None]:
# multi-class set-up: categorical cross-entropy on the one-hot labels,
# optimised with RMSprop, tracking accuracy
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)


In [None]:
# train for 20 epochs, scoring the dev split after each one; the returned
# History object is kept for the learning-curve plots
history = model.fit(
    X_train_dtm,
    y_train,
    batch_size=512,
    epochs=20,
    validation_data=(X_dev_dtm, y_dev),
)


In [None]:
# learning curves: training loss as dots, validation loss as a line
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss)+1)

fig, ax = plt.subplots()
ax.plot(epochs, loss, 'bo', label='Training Loss')
ax.plot(epochs, val_loss, 'b', label='Validation Loss')
ax.set_title('Training and Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()


In [None]:
# learning curves: training accuracy as dots, validation accuracy as a line
plt.clf()
# Keras records the accuracy metric under 'acc' in older releases and under
# 'accuracy' in newer ones; use whichever key this History actually contains
# instead of failing with a KeyError
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
acc = history.history[acc_key]
val_acc = history.history['val_' + acc_key]
epochs = range(1, len(acc)+1)
plt.plot(epochs, acc, 'bo', label='Training Accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


In [None]:
# rebuild the same architecture from scratch and train it for only 3 epochs
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(num_input_shape,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(num_class, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

model.fit(
    X_train_dtm,
    y_train,
    batch_size=512,
    epochs=3,
    validation_data=(X_dev_dtm, y_dev),
)


In [None]:
# score the most recently trained `model` on the held-out dev split
# NOTE(review): with a single loss and metrics=['accuracy'] this is presumably
# [loss, accuracy] -- confirm against the Keras version in use
results = model.evaluate(X_dev_dtm, y_dev)
print(results)


In [None]:
test = test_df['text']
# encode the test sentences with the vocabulary fitted on X_train (no
# re-fitting) and convert the sparse document-term matrix to a dense array
test_dtm = vect.transform(test).toarray()
# examine the document-term matrix created from the test set
test_dtm


In [None]:
# (number of test documents, vocabulary size)
print(test_dtm.shape)


In [None]:
# make author (class) predictions for test_dtm
dnn_predictions = model.predict(test_dtm)
print(dnn_predictions.shape)


In [None]:
# inspect the predictions for the first ten test rows
print(dnn_predictions[:10])


In [None]:
# assemble the submission table: the test ids first, then one prediction
# column per author
result = pd.DataFrame(dnn_predictions, columns=['EAP','HPL','MWS'])
result = result.assign(id=test_df['id'])[['id', 'EAP', 'HPL', 'MWS']]
result.head()


In [None]:
# Generate submission file in csv format
# index=False leaves the DataFrame index out of the file; float_format='%.20f'
# writes each float as a fixed-point decimal with 20 digits after the point
result.to_csv('rhodium_submission_17.csv', index=False, float_format='%.20f')
