In [39]:
import numpy as np
import csv
import keras
import sklearn
import gensim
import random
import scipy

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers import Embedding, LSTM

from sklearn import preprocessing
from sklearn.base import BaseEstimator
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB

from gensim.models.word2vec import Word2Vec, KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# from word_movers_knn import WordMoversKNN

# size of the word embeddings
embeddings_dim = 300

# maximum number of words to consider in the representations
max_features = 30000

# maximum length of a sentence
max_sent_len = 50

# percentage of the data used for model training
percent = 0.75

# number of classes
num_classes = 2

In [40]:
resources_base_path = "/Users/dsbatista/resources/"

print ("Reading pre-trained word embeddings...")
embeddings = KeyedVectors.load_word2vec_format(
    resources_base_path+"GoogleNews-vectors-negative300.bin.gz",
    binary=True)

Reading pre-trained word embeddings...


In [41]:
embeddings.vector_size

300

In [71]:
# dataset description:
# - sentences labelled with positive or negative sentiment, extracted from reviews of products, 
#   movies, and restaurants
# - download from https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences

data_base_path = "sentiment_labelled_sentences/"

print("Reading text data for classification and building representations ...")
amazon_data = [row for row in csv.reader(open(data_base_path+"amazon_cells_labelled.txt"), delimiter='\t')]
random.shuffle(amazon_data)

imdb_data = [row for row in csv.reader(open(data_base_path+"imdb_labelled.txt"), delimiter='\t')]
random.shuffle(imdb_data)

yelp_data = [row for row in csv.reader(open(data_base_path+"yelp_labelled.txt"), delimiter='\t')]
random.shuffle(yelp_data)

Reading text data for classification and building representations ...


In [72]:
print("amazon_data: ", len(amazon_data))
print("imdb_data  : ", len(imdb_data))
print("yelp_data  : ", len(yelp_data))

amazon_data:  1000
imdb_data  :  748
yelp_data  :  1000


In [73]:
data = amazon_data

## creat training/testing splits

In [74]:
train_size = int(len(data) * percent)

train_texts = [txt.lower().strip() for (txt,label) in data[0:train_size]]
test_texts = [txt.lower().strip() for (txt,label) in data[train_size:-1]]

train_labels = [label for (txt, label) in data[0:train_size]]
test_labels = [label for (txt, label) in data[train_size:-1]]

num_classes = len(set(train_labels + test_labels))

### data preparation: tokenization and padding

In [75]:
tokenizer = Tokenizer(num_words=max_features, lower=True, split=" ")
tokenizer.fit_on_texts(train_texts)

In [76]:
train_sequences = sequence.pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=max_sent_len)
test_sequences = sequence.pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=max_sent_len)

In [77]:
train_matrix = tokenizer.texts_to_matrix(train_texts)
test_matrix = tokenizer.texts_to_matrix(test_texts)

In [78]:
embedding_weights = np.zeros((max_features, embeddings_dim))

for word,index in tokenizer.word_index.items():
    if index < max_features:
        try: 
            embedding_weights[index,:] = embeddings[word]
        except: 
            embedding_weights[index,:] = np.random.rand( 1 , embeddings_dim )

le = preprocessing.LabelEncoder( )
le.fit(train_labels + test_labels)
train_labels = le.transform( train_labels )
test_labels = le.transform( test_labels )
print("Classes that are considered in the problem : " + repr( le.classes_ ))

Classes that are considered in the problem : array(['0', '1'],
      dtype='<U1')


### Naive Bayes with bag-of-words features

In [81]:
model = MultinomialNB( )
model.fit( train_matrix , train_labels )
results = model.predict( test_matrix )
print("Accuracy = " + repr( sklearn.metrics.accuracy_score(test_labels,results)))
print()
print(sklearn.metrics.classification_report( test_labels , results ))

Accuracy = 0.79919678714859432

             precision    recall  f1-score   support

          0       0.80      0.80      0.80       124
          1       0.80      0.80      0.80       125

avg / total       0.80      0.80      0.80       249



### Linear SVM with bag-of-words features

In [88]:
model = LinearSVC( random_state=0 )
model.fit( train_matrix , train_labels )
results = model.predict( test_matrix )
print(sklearn.metrics.classification_report(test_labels,results))
print("Accuracy = " + repr(sklearn.metrics.accuracy_score(test_labels,results)))

             precision    recall  f1-score   support

          0       0.80      0.86      0.83       124
          1       0.85      0.78      0.82       125

avg / total       0.83      0.82      0.82       249

Accuracy = 0.82329317269076308


### NB-SVM with bag-of-words features

In [89]:
model = MultinomialNB(fit_prior=False)
model.fit( train_matrix,train_labels)
train_matrix = np.hstack((train_matrix, model.predict_proba(train_matrix)))
test_matrix = np.hstack((test_matrix, model.predict_proba(test_matrix)))
model = LinearSVC( random_state=0 )
model.fit( train_matrix , train_labels )
results = model.predict( test_matrix )
train_matrix = train_matrix[0: train_matrix.shape[0], 0: train_matrix.shape[1] - model.intercept_.shape[0] ]
test_matrix = test_matrix[0: train_matrix.shape[0], 0: test_matrix.shape[1] - model.intercept_.shape[0] ]
print(sklearn.metrics.classification_report(test_labels,results))
print("Accuracy = " + repr(sklearn.metrics.accuracy_score(test_labels,results)))

             precision    recall  f1-score   support

          0       0.80      0.86      0.83       124
          1       0.85      0.78      0.82       125

avg / total       0.83      0.82      0.82       249

Accuracy = 0.82329317269076308


In [None]:
print ("Method = KNN with word mover's distance as described in 'From Word Embeddings To Document Distances'")
model = WordMoversKNN(W_embed=embedding_weights , n_neighbors=3)
model.fit( train_matrix , train_labels )
results = model.predict( test_matrix )
print(sklearn.metrics.classification_report(test_labels,results))
print("Accuracy = " + repr(sklearn.metrics.accuracy_score(test_labels,results)))

### MLP with bag-of-words features

In [103]:
np.random.seed(0)
model = Sequential()
model.add(Dense(embeddings_dim, input_dim=train_matrix.shape[1], kernel_initializer='uniform', activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(embeddings_dim, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(1, activation='sigmoid'))

if num_classes == 2: 
    model.compile(loss='binary_crossentropy', optimizer='adam', class_mode='binary')
else: 
    model.compile(loss='categorical_crossentropy', optimizer='adam')

In [116]:
history = model.fit(x=train_matrix, y=train_labels, epochs=10, batch_size=32, validation_split=0.2)

Train on 600 samples, validate on 150 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [115]:
results = model.predict_classes(test_matrix)
print()
print(sklearn.metrics.classification_report(test_labels,results))
print("Accuracy = " + repr(sklearn.metrics.accuracy_score(test_labels,results)))

             precision    recall  f1-score   support

          0       0.80      0.86      0.83       124
          1       0.85      0.78      0.82       125

avg / total       0.83      0.82      0.82       249

Accuracy = 0.82329317269076308


## Stack of two LSTMs

In [119]:
np.random.seed(0)
model = Sequential()

model.add(Embedding(max_features, embeddings_dim, input_length=max_sent_len, 
                    mask_zero=True, weights=[embedding_weights]))
model.add(Dropout(0.25))
model.add(LSTM(output_dim=embeddings_dim , 
               activation='sigmoid', inner_activation='hard_sigmoid', return_sequences=True))
model.add(Dropout(0.25))
model.add(LSTM(output_dim=embeddings_dim , activation='sigmoid', inner_activation='hard_sigmoid'))
model.add(Dropout(0.25))
model.add(Dense(1))
model.add(Activation('sigmoid'))

if num_classes == 2: 
    model.compile(loss='binary_crossentropy', optimizer='adam', class_mode='binary')
else: 
    model.compile(loss='categorical_crossentropy', optimizer='adam')  

model.fit( train_sequences , train_labels , epochs=30, batch_size=32)
results = model.predict_classes( test_sequences )
print ("Accuracy = " + repr( sklearn.metrics.accuracy_score( test_labels , results )  ))
print (sklearn.metrics.classification_report( test_labels , results ))

  
  # Remove the CWD from sys.path while we load stuff.
kwargs passed to function are ignored with Tensorflow backend


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Accuracy = 0.82730923694779113
             precision    recall  f1-score   support

          0       0.85      0.80      0.82       124
          1       0.81      0.86      0.83       125

avg / total       0.83      0.83      0.83       249



## CNN from the paper: _Convolutional Neural Networks for Sentence Classification_

In [121]:
np.random.seed(0)
nb_filter = embeddings_dim

model = Graph()

model.add_input(name='input', input_shape=(max_sent_len,), dtype=int)

model.add_node(Embedding(max_features, embeddings_dim, input_length=max_sent_len, mask_zero=False, 
                         weights=[embedding_weights] ), name='embedding', input='input')

model.add_node(Dropout(0.25), name='dropout_embedding', input='embedding')


for n_gram in [3, 5, 7]:
    model.add_node(Convolution1D(nb_filter=nb_filter, filter_length=n_gram, border_mode='valid', activation='relu', subsample_length=1, input_dim=embeddings_dim, input_length=max_sent_len), name='conv_' + str(n_gram), input='dropout_embedding')
    model.add_node(MaxPooling1D(pool_length=max_sent_len - n_gram + 1), name='maxpool_' + str(n_gram), input='conv_' + str(n_gram))
    model.add_node(Flatten(), name='flat_' + str(n_gram), input='maxpool_' + str(n_gram))
model.add_node(Dropout(0.25), name='dropout', inputs=['flat_' + str(n) for n in [3, 5, 7]])
model.add_node(Dense(1, input_dim=nb_filter * len([3, 5, 7])), name='dense', input='dropout')
model.add_node(Activation('sigmoid'), name='sigmoid', input='dense')
model.add_output(name='output', input='sigmoid')

NameError: name 'Graph' is not defined

In [None]:
if num_classes == 2:
    model.compile(loss={'output': 'binary_crossentropy'}, optimizer='adam')
else: 
    model.compile(loss={'output': 'categorical_crossentropy'}, optimizer='adam') 

model.fit({'input': train_sequences, 'output': train_labels}, batch_size=32, nb_epoch=30)
results = np.array(model.predict({'input': test_sequences}, batch_size=32)['output'])

if num_classes != 2: 
    results = results.argmax(axis=-1)
else: 
    results = (results > 0.5).astype('int32')

print (sklearn.metrics.classification_report( test_labels , results ))
print ("Accuracy = " + repr( sklearn.metrics.accuracy_score( test_labels , results )  ))

In [None]:
print ("Method = Bidirectional LSTM")
np.random.seed(0)
model = Graph()
model.add_input(name='input', input_shape=(max_sent_len,), dtype=int)
model.add_node(Embedding( max_features, embeddings_dim, input_length=max_sent_len, mask_zero=True, weights=[embedding_weights] ), name='embedding', input='input')
model.add_node(LSTM(embeddings_dim, activation='sigmoid', inner_activation='hard_sigmoid', return_sequences=True), name='forward1', input='embedding')
model.add_node(Dropout(0.25), name="dropout1", input='forward1')
model.add_node(LSTM(embeddings_dim, activation='sigmoid', inner_activation='hard_sigmoid'), name='forward2', input='forward1')
model.add_node(LSTM(embeddings_dim, activation='sigmoid', inner_activation='hard_sigmoid', go_backwards=True, return_sequences=True), name='backward1', input='embedding')
model.add_node(Dropout(0.25), name="dropout2", input='backward1') 
model.add_node(LSTM(embeddings_dim, activation='sigmoid', inner_activation='hard_sigmoid', go_backwards=True), name='backward2', input='backward1')
model.add_node(Dropout(0.25), name='dropout', inputs=['forward2', 'backward2'])
model.add_node(Dense(1, activation='sigmoid'), name='sigmoid', input='dropout')
model.add_output(name='output', input='sigmoid')
if num_classes == 2: model.compile(loss={'output': 'binary_crossentropy'}, optimizer='adam')
else: model.compile(loss={'output': 'categorical_crossentropy'}, optimizer='adam')
model.fit({'input': train_sequences, 'output': train_labels}, batch_size=32, nb_epoch=30)
results = np.array(model.predict({'input': test_sequences}, batch_size=32)['output'])
if num_classes != 2: results = results.argmax(axis=-1)
else: results = (results > 0.5).astype('int32')
print ("Accuracy = " + repr( sklearn.metrics.accuracy_score( test_labels , results )  ))
print (sklearn.metrics.classification_report( test_labels , results ))

print ("Method = CNN-LSTM")
np.random.seed(0)
filter_length = 3
nb_filter = embeddings_dim
pool_length = 2
model = Sequential()
model.add(Embedding(max_features, embeddings_dim, input_length=max_sent_len, weights=[embedding_weights]))
model.add(Dropout(0.25))
model.add(Convolution1D(nb_filter=nb_filter, filter_length=filter_length, border_mode='valid', activation='relu', subsample_length=1))
model.add(MaxPooling1D(pool_length=pool_length))
model.add(LSTM(embeddings_dim))
model.add(Dense(1))
model.add(Activation('sigmoid'))
if num_classes == 2: model.compile(loss='binary_crossentropy', optimizer='adam', class_mode='binary')
else: model.compile(loss='categorical_crossentropy', optimizer='adam')  
model.fit( train_sequences , train_labels , nb_epoch=30, batch_size=32)
results = model.predict_classes( test_sequences )
print ("Accuracy = " + repr( sklearn.metrics.accuracy_score( test_labels , results ) ) )
print (sklearn.metrics.classification_report( test_labels , results ))

In [None]:
print ("Method = Linear SVM with doc2vec features")
np.random.seed(0)
class LabeledLineSentence(object):
  def __init__(self, data ): self.data = data
  def __iter__(self):
    for uid, line in enumerate( self.data ): yield TaggedDocument( line.split(" ") , ["S_%s" % uid] )
model = Doc2Vec( alpha=0.025 , min_alpha=0.025 )
sentences = LabeledLineSentence( train_texts + test_texts )
model.build_vocab( sentences )
model.train( sentences )
for w in model.vocab.keys():
  try: model[w] = embeddings[w] 
  except : continue
for epoch in range(10):
    model.train(sentences)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
train_rep = np.array( [ model.docvecs[i] for i in range( train_matrix.shape[0] ) ] )
test_rep = np.array( [ model.docvecs[i + train_matrix.shape[0]] for i in range( test_matrix.shape[0] ) ] )
model = LinearSVC( random_state=0 )
model.fit( train_rep , train_labels )
results = model.predict( test_rep )
print ("Accuracy = " + repr( sklearn.metrics.accuracy_score( test_labels , results )  ))
print (sklearn.metrics.classification_report( test_labels , results ))

In [None]:
print ("Method = Non-linear SVM with doc2vec features")
np.random.seed(0)
class LabeledLineSentence(object):
  def __init__(self, data ): self.data = data
  def __iter__(self):
    for uid, line in enumerate( self.data ): yield TaggedDocument( line.split(" ") , ["S_%s" % uid] )
model = Doc2Vec( alpha=0.025 , min_alpha=0.025 )
sentences = LabeledLineSentence( train_texts + test_texts )
model.build_vocab( sentences )
model.train( sentences )
for w in model.vocab.keys():
  try: model[w] = embeddings[w] 
  except : continue
for epoch in range(10):
    model.train(sentences)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
train_rep = np.array( [ model.docvecs[i] for i in range( train_matrix.shape[0] ) ] )
test_rep = np.array( [ model.docvecs[i + train_matrix.shape[0]] for i in range( test_matrix.shape[0] ) ] )
model = SVC( random_state=0 , kernel='poly' )
model.fit( train_rep , train_labels )
results = model.predict( test_rep )
print ("Accuracy = " + repr( sklearn.metrics.accuracy_score( test_labels , results )  ))
print (sklearn.metrics.classification_report( test_labels , results ))

In [None]:
print ("Method = MLP with doc2vec features")
np.random.seed(0)
class LabeledLineSentence(object):
  def __init__(self, data ): self.data = data
  def __iter__(self):
    for uid, line in enumerate( self.data ): yield TaggedDocument( line.split(" ") , ["S_%s" % uid] )
model = Doc2Vec( alpha=0.025 , min_alpha=0.025 )
sentences = train_texts + test_texts
sentences = LabeledLineSentence( sentences )
model.build_vocab( sentences )
model.train( sentences )
for w in model.vocab.keys():
  try: model[w] = embeddings[w]
  except : continue
for epoch in range(10):
    model.train(sentences)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
train_rep = np.array( [ model.docvecs[i] for i in range( train_matrix.shape[0] ) ] )
test_rep = np.array( [ model.docvecs[i + train_matrix.shape[0]] for i in range( test_matrix.shape[0] ) ] )
model = Sequential()
model.add(Dense(embeddings_dim, input_dim=train_rep.shape[1], init='uniform', activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(embeddings_dim, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(1, activation='sigmoid'))
if num_classes == 2: model.compile(loss='binary_crossentropy', optimizer='adam', class_mode='binary')
else: model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit( train_rep , train_labels , nb_epoch=30, batch_size=32)
results = model.predict_classes( test_rep )
print ("Accuracy = " + repr( sklearn.metrics.accuracy_score( test_labels , results )  ))
print (sklearn.metrics.classification_report( test_labels , results ))