In [27]:
from __future__ import unicode_literals
import os
import nltk
import pandas as pd
import string
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout, Activation
import numpy as np
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
np.random.seed(1337)  # For Reproducibility
import multiprocessing
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from utils import *
import glob
from nltk.stem.porter import *
import string
import unicodedata
from joblib import Parallel, delayed
import tqdm
from keras.layers import Convolution1D, GlobalMaxPooling1D

In [2]:
# Optional: reload the previously dumped (stemmed) data sets into pandas DataFrames.
# train_df = pd.read_csv('dumps/stemmered_train_mrd.csv', encoding='utf8')
# test_df = pd.read_csv('dumps/stemmered_test_mrd.csv', encoding='utf8')

In [3]:
def load_mrd_data(pos_path="../rt-polaritydata/rt_polarity.pos.utf8.txt",
                  neg_path="../rt-polaritydata/rt_polarity.neg.utf8.txt"):
    """Load the positive and negative review sentences with their labels.

    Parameters
    ----------
    pos_path : str
        UTF-8 text file with one positive sentence per line.
    neg_path : str
        UTF-8 text file with one negative sentence per line.

    Returns
    -------
    X : list of unicode
        All positive lines followed by all negative lines. Lines are returned
        exactly as read, i.e. including their trailing newline.
    y : list of int
        Label for each entry of ``X``: 1 for a positive line, 0 for a negative one.

    Raises
    ------
    IOError
        If either file cannot be opened.
    """
    import io  # kept local: `io` is not part of the notebook's import cell

    def _read_lines(path):
        # Read every line of a UTF-8 text file into a list, closing the file afterwards.
        with io.open(path, 'r', encoding='utf8') as handle:
            return list(handle)

    sentences_pos = _read_lines(pos_path)
    sentences_neg = _read_lines(neg_path)
    X = sentences_pos + sentences_neg
    y = [1] * len(sentences_pos) + [0] * len(sentences_neg)
    return X, y

In [4]:
# X: raw sentences (positive ones first, then negative); y: matching 1/0 labels.
X, y = load_mrd_data()

In [5]:
def stemmering_sentences(sentence):
    """Tokenize a sentence, drop noise tokens and Porter-stem what remains.

    A token is dropped when it is a single punctuation character, an English
    stop word, or the literal ``br``. The stop-word and ``br`` tests are done
    on the lower-cased token so that capitalised words such as "It" or "The"
    are filtered as well.

    Parameters
    ----------
    sentence : unicode
        The raw sentence.

    Returns
    -------
    list of unicode
        The lower-cased, stemmed tokens in their original order.
    """
    punctuation = set(string.punctuation)
    # Read the stop-word list once per call instead of once per token;
    # a set also makes each membership test constant-time.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()

    doc = []
    for word in nltk.word_tokenize(sentence):
        token = word.lower()
        if word in punctuation or token in stop_words or token == 'br':
            continue
        doc.append(stemmer.stem(token))
    return doc

def stemmering_sentences_mrd(X, y):
    """Stem every sentence of ``X`` using four parallel joblib workers.

    A tqdm progress bar labelled "stem" tracks how many sentences have been
    handed to the workers.

    Returns a tuple ``(stemmed, y)`` where ``stemmed`` holds the result of
    ``stemmering_sentences`` for each sentence, in input order, and ``y`` is
    returned untouched.
    """
    progress = tqdm.tqdm(X, desc="stem")
    tasks = (delayed(stemmering_sentences)(sentence) for sentence in progress)
    worker_pool = Parallel(n_jobs=4)
    return worker_pool(tasks), y

In [6]:
# Replace every raw sentence in X by its list of stemmed tokens; y is passed through unchanged.
# NOTE(review): X is rebound here, so running this cell a second time would feed
# token lists (not strings) to the tokenizer -- re-run the loading cell first.
X, y = stemmering_sentences_mrd(X, y)


stem:   0%|          | 0/10662 [00:00<?, ?it/s].59it/s]s]]]]

In [35]:
# Number of tokens in each stemmed document; the longest and the average length are printed.
item_length = [len(i) for i in X]
# numpy is already imported as `np` in the notebook's import cell.
# Single-argument print(...) produces the same output under Python 2 and is valid Python 3.
print(np.max(item_length))
print(np.mean(item_length))

39
11.0888201088


In [71]:
# set parameters:
vocab_dim = 32  # vector width: Word2Vec `size` and the Embedding layer's `output_dim`
maxlen = 40  # `maxlen` given to pad_sequences and `input_length` of the Embedding layer
n_iterations = 1  # Word2Vec `iter`; ideally more..
# Words that appear only once or twice in a billion-word corpus are probably uninteresting typos and garbage.
# Passed to Word2Vec as `min_count`.
n_exposures = 2
window_size = 5  # Word2Vec `window`
batch_size = 100  # batch size for both model.fit and model.evaluate
nb_filter = 250  # `nb_filter` of the Convolution1D layer
filter_length = 3  # `filter_length` of the Convolution1D layer
hidden_dims = 250  # units of the hidden Dense layer
nb_epoch = 3  # `nb_epoch` passed to model.fit
cpu_count = multiprocessing.cpu_count()  # used as the Word2Vec `workers` count

In [72]:
# combine_train_test_X = terms_by_doc_train + terms_by_doc_test
# The Word2Vec corpus is the complete stemmed data set; the train/test split
# is only made in a later cell.
combine_train_test_X = X

In [73]:
print('Training a Word2vec model...')
# Only hyper-parameters are given to the constructor; the corpus is supplied
# explicitly to build_vocab() and train() below.
# NOTE(review): the `size=` / `iter=` keywords and train() without extra
# arguments presumably target an older gensim release -- confirm the pinned version.
model = Word2Vec(size=vocab_dim,
                 min_count=n_exposures,
                 window=window_size,
                 workers=cpu_count,
                 iter=n_iterations)
model.build_vocab(combine_train_test_X)
# train() is the last expression of the cell, so its return value is what the cell displays.
# NOTE(review): the name `model` is rebound to the Keras network in a later cell.
model.train(combine_train_test_X)

103261

Training a Word2vec model...


In [74]:
# Feed the Word2Vec vocabulary through a gensim Dictionary so that every word gets an integer id.
gensim_dict = Dictionary()
gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
# gensim_dict.items() yields (id, word) pairs; ids are shifted by one so that
# no word is mapped to index 0 (a later cell uses 0 for out-of-vocabulary words).
w2indx = dict((token, token_id + 1) for token_id, token in gensim_dict.items())
# Trained vector of every indexed word.
w2vec = dict((token, model[token]) for token in w2indx)
print('Setting up Arrays for Keras Embedding Layer...')
n_symbols = 1 + len(w2indx)  # vocabulary size plus one slot for index 0
# Row i holds the vector of the word whose index is i; rows without a word stay all-zero.
embedding_weights = np.zeros((1 + n_symbols, vocab_dim))
for word in w2indx:
    index = w2indx[word]
    embedding_weights[index, :] = w2vec[word]

Setting up Arrays for Keras Embedding Layer...


In [75]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the documents; the fixed random_state keeps the split reproducible.
terms_by_doc_train, terms_by_doc_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=58)
# Encode every token by its vocabulary index. Words missing from w2indx map to 0.
# dict.get replaces the former bare `except:`, which also hid unrelated errors
# (and KeyboardInterrupt).
X_train = [[w2indx.get(word, 0) for word in doc] for doc in terms_by_doc_train]
X_test = [[w2indx.get(word, 0) for word in doc] for doc in terms_by_doc_test]

In [76]:
print("Pad sequences (samples x time)")
# Bring every index sequence to exactly `maxlen` entries.
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
# Labels become numpy arrays for Keras.
y_train = np.array(y_train)
y_test = np.array(y_test)
# These two lines report the label arrays, so they are captioned y_*, not X_*.
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

Pad sequences (samples x time)
(u'X_train shape:', (7463, 40))
(u'X_test shape:', (3199, 40))
(u'X_train shape:', (7463,))
(u'X_test shape:', (3199,))


In [77]:
print('Defining a Simple Keras Model...')
# NOTE: from here on `model` refers to the Keras network, no longer to the Word2Vec model.
model = Sequential()  # layers are stacked one after another with add()

# we start off with an efficient embedding layer which maps
# our vocab indices into vocab_dim dimensions
# NOTE(review): no `weights=` argument is given, so the `embedding_weights`
# matrix built from Word2Vec in an earlier cell is not used by this layer --
# confirm whether training the embedding from scratch is intended.
model.add(Embedding(input_dim = n_symbols + 1,
                    output_dim = vocab_dim,
                    input_length=maxlen,
                    dropout=0.2))

# we add a Convolution1D, which will learn nb_filter
# word group filters of size filter_length:
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode='valid',
                        activation='relu',
                        subsample_length=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# NOTE(review): the test split doubles as validation data here and is then
# evaluated again below, so there is no separate held-out set.
model.fit(X_train, y_train,
          batch_size=batch_size,
          nb_epoch=nb_epoch,
          validation_data=(X_test, y_test))

print("Evaluate...")
# With metrics=['accuracy'], evaluate() is unpacked into the loss (score) and the accuracy.
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


(u'Test score:', 0.53518686599379672)
(u'Test accuracy:', 0.74460768448110293)
 200/3199 [>.............................] - ETA: 0s

In [78]:
# load_model is imported here but only used by the following cell.
from keras.models import load_model
# Persist the trained Keras network.
model.save('./cnn_train_mrd.h5')
import pickle
# Persist the word -> index mapping as well: new text must be encoded with the
# same indices the network was trained on.
with open("./cnn_w2indx_dict_mrd.pkl", 'wb') as f:
    pickle.dump(w2indx, f)

In [84]:
# Reload the word -> index mapping and the network written by the previous cell.
# NOTE: `pickle` and `load_model` are imported in the previous cell, so that
# cell must have been run in this kernel first.
# The pickle file is one this notebook wrote itself; do not point this at untrusted files.
with open("./cnn_w2indx_dict_mrd.pkl", 'rb') as f:
    w2indx_load = pickle.load(f)
# Rebinds `model` to the network restored from disk.
model = load_model('./cnn_train_mrd.h5')

In [90]:
# A few hand-written sentences to sanity-check the reloaded model.
demo = ["good, beautiful", "It is terrible, he doesn't like the film.", "I love this film"]
# Apply the same preprocessing as for the training corpus.
demo = [stemmering_sentences(i) for i in demo]
# Encode with the reloaded vocabulary. Unknown words map to 0.
# dict.get replaces the former bare `except:`, which silenced every kind of error.
X_demo = [[w2indx_load.get(word, 0) for word in doc] for doc in demo]

In [91]:
# Pad the demo sequences to the length the network was trained with.
X_demo = sequence.pad_sequences(X_demo, maxlen=maxlen)
# One sigmoid output per demo sentence (label 1 was assigned to the positive file).
# print(...) with a single argument prints the same under Python 2 and is valid Python 3.
print(model.predict(X_demo))

[[ 0.87933546]
 [ 0.3075977 ]
 [ 0.58295035]]
