In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
# fix random seed for reproducibility
np.random.seed(7)
%matplotlib inline

In [49]:
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data()

In [50]:
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

In [4]:
embedding_vector_length = 64
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=1,
                              verbose=0, mode='auto')
callbacks_list = [early_stopping]
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=32, callbacks=callbacks_list)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 64)           320000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 150)               129000    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 151       
Total params: 449,151
Trainable params: 449,151
Non-trainable params: 0
_________________________________________________________________
None
Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x124095780>

## Working with Original Data Format

In [53]:
data = pd.read_csv('stanford_movie_data.csv')
data.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [54]:
X_train = data['review'].iloc[0:25000]
y_train = data['sentiment'].iloc[0:25000]

X_test = data['review'].iloc[25000:]
y_test = data['sentiment'].iloc[25000:]

In [55]:
len(set(imdb.get_word_index().values())) == len(imdb.get_word_index().values())

True

In [56]:
import re # regex library
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text) # Effectively removes HTML markup tags
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text

In [57]:
data['review'] = data['review'].apply(preprocessor)

In [58]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['review'])

In [59]:
len(tokenizer.word_index)

103893

In [60]:
def text_to_int_sequence(text):
    return [tokenizer.word_index[word] for word in text_to_word_sequence(text)]

In [61]:
X_train = X_train.apply(text_to_int_sequence)

In [62]:
X_test = X_test.apply(text_to_int_sequence)

In [64]:
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

In [68]:
embedding_vector_length = 64
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 10, embedding_vector_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=2,
                              verbose=0, mode='auto')
callbacks_list = [early_stopping]
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=32, callbacks=callbacks_list)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 64)           6649792   
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
Total params: 6,715,893
Trainable params: 6,715,893
Non-trainable params: 0
_________________________________________________________________
None
Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x14b43cba8>

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin

class LSTM_Sentiment_Classifier(BaseEstimator, ClassifierMixin):
    
    def __init__(self, embedding_vector_length, max_seq_length, lstm_layers, num_epochs=3):
        
        self.embedding_vector_length = embedding_vector_length
        self.max_seq_length = max_seq_length
        self.lstm_layers = lstm_layers
        self.num_epochs=3
        self.tokenizer = Tokenizer()
        
    def _text_to_int_sequence(text):
        return [self.tokenizer.word_index[word] for word in text_to_word_sequence(text)]
        
    def fit(self, X, y, validation_data):
        
        self.tokenizer.fit(X)
        
        
        