In [7]:
### Simple Application of Keras to NLP ###

"""Below is a two-layer network.

The first layer (which actually comes after an input layer) is called the hidden layer,
and the second one is called the output layer. We need to specify the input dimension
(input_dim) for the hidden layer. The output layer has only 1 unit because we are dealing
with a binary classification problem."""

'Below is a two-layer network.\n\nThe first layer (which actually comes after an input layer) is called the hidden layer,\nand the second one is called the output layer. We need to specify the input dimension\n(input_dim) for the hidden layer. The output layer has only 1 unit because we are dealing\nwith a binary classification problem.'

In [2]:
# In NLP, we always start by cleaning the text or corpus. 
import re
import pandas as pd
from sklearn.model_selection import train_test_split
 
def clean_review(text):
    """Return ``text`` with HTML tags and double quotes removed.

    Each HTML-like tag (a ``<...>`` run containing no other ``<``) is replaced
    by a single space. Backslash-escaped double quotes and then any remaining
    plain double quotes are deleted outright.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string.
    """
    without_tags = re.sub('<[^<]+?>', ' ', text)
    # The escaped form goes first so its backslash is removed together with
    # the quote; the second replace then catches the unescaped quotes.
    return without_tags.replace('\\"', '').replace('"', '')

# Load labeledTrainData.tsv; the 'review' and 'sentiment' columns are used below.
# NOTE(review): this absolute path is specific to one machine -- point DATA_PATH
# at your own copy of the file before running.
DATA_PATH = '/Users/tonydiana/Downloads/labeledTrainData.tsv'
# Fixed seed so the train/test split (and therefore the reported accuracy) is
# the same on every Restart & Run All.
SEED = 42

# quoting=3 is csv.QUOTE_NONE: quote characters inside reviews are read as plain text.
df = pd.read_csv(DATA_PATH, sep='\t', quoting=3)
df['cleaned_review'] = df['review'].apply(clean_review)
# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_review'], df['sentiment'], test_size=0.2, random_state=SEED)

In [3]:
# CountVectorizer converts a collection of text documents to a matrix of token counts.
# This is how we create a bag of words (BoW): a reduced and simplified representation of a
# text document built from selected parts of the text, based on specific criteria such as
# word frequency.
# A BoW ignores word order; it only records which vocabulary words occur in each document
# (and, unless binary=True, how often), and the model picks out patterns from those features.

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
 
# Configure the bag-of-words encoder and fit it on the training reviews only;
# the test reviews are encoded later with the already-fitted vectorizer.
vectorizer = CountVectorizer(
    lowercase=True,
    stop_words=stopwords.words('english'),  # NLTK's English stop-word list
    binary=True,        # record presence/absence of a word, not its count
    min_df=3,           # drop words that appear in fewer than 3 documents
    max_df=0.9,         # drop words that appear in more than 90% of documents
    max_features=5000,  # keep at most 5000 words
)
X_train_onehot = vectorizer.fit_transform(X_train)

In [8]:
from keras.models import Sequential
from keras.layers import Dense
 
# Two-layer feed-forward classifier over the bag-of-words features.
model = Sequential()

# Hidden layer: 500 ReLU units. Its input width is the size of the vocabulary
# learned by the vectorizer. len(vectorizer.vocabulary_) is used instead of
# len(vectorizer.get_feature_names()) because get_feature_names() was removed
# in scikit-learn 1.2, whereas vocabulary_ exists on every fitted CountVectorizer
# and has one entry per feature column.
model.add(Dense(units=500, activation='relu', input_dim=len(vectorizer.vocabulary_)))
# Output layer: a single sigmoid unit for the binary sentiment label.
model.add(Dense(units=1, activation='sigmoid'))
# Adam (adaptive moment estimation) is an adaptive learning-rate optimization algorithm
# designed for training deep neural networks. It can be used instead of the classical
# stochastic gradient descent procedure to update the network weights iteratively
# based on the training data.

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 500)               2500500   
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 501       
Total params: 2,501,001
Trainable params: 2,501,001
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Train for two epochs on all but the last 100 training rows; those last 100
# rows are passed as the validation set.
model.fit(
    x=X_train_onehot[:-100],
    y=y_train[:-100],
    batch_size=128,
    epochs=2,
    verbose=1,
    validation_data=(X_train_onehot[-100:], y_train[-100:]),
)

Train on 19900 samples, validate on 100 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x1a432d8e50>

In [6]:
# Encode the held-out reviews with the already-fitted vectorizer and score the model.
# scores[0] is the loss; scores[1] is the 'accuracy' metric requested in compile().
scores = model.evaluate(
    vectorizer.transform(X_test),
    y_test,
    verbose=1,
)
print("Accuracy:", scores[1])

Accuracy: 0.8679999709129333
