In [1]:
import os
import glob
# Root of the extracted dataset; reviews are read from
# <data_dir>/<train|test>/<pos|neg>/*.txt
data_dir='./data/aclImdb/'
trainX,testX, trainy,testy=[],[],[],[]

# Route each split name to the (texts, labels) lists it fills, so both splits
# share a single reading loop.
split_targets=(('train',(trainX,trainy)),('test',(testX,testy)))

for data_type,(texts,labels) in split_targets:
    for sentiment in ('pos','neg'):
        # Label encoding: 1 for a positive review, 0 for a negative one.
        label=1 if sentiment=='pos' else 0
        pattern=os.path.join(data_dir,data_type,sentiment,'*.txt')
        for f in glob.glob(pattern):
            with open(f,encoding='utf8') as review:
                texts.append(review.read())
            labels.append(label)

In [2]:
# Sanity check: report how many reviews were loaded for each split.
print(
    'Training data of size {}'.format(len(trainX)),
    'Testing data of size {}'.format(len(testX)),
    sep='\n',
)

Training data of size 25000
Testing data of size 25000


In [3]:
# Number of reviews per gradient update when fitting the model.
batch_size = 32
# Every review is cut (or zero-padded) to exactly this many tokens.
maxlen = 80
# Size of the embedding's input vocabulary: only the most common words get
# their own index, everything else is treated as unknown.
max_features = 20000

In [4]:
from sklearn.utils import shuffle

# Fixed seed so the shuffled order (and every result derived from it) is the
# same on each Restart & Run All instead of changing from run to run.
shuffle_seed=42

# Texts and labels are shuffled together in one call so that every review
# keeps its own label.
trainX,trainy=shuffle(trainX,trainy,random_state=shuffle_seed)
testX,testy=shuffle(testX,testy,random_state=shuffle_seed)

In [5]:
# Shuffling must not change the number of reviews in either split.
print(
    'Training data of size {}'.format(len(trainX)),
    'Testing data of size {}'.format(len(testX)),
    sep='\n',
)

Training data of size 25000
Testing data of size 25000


In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import re
from bs4 import BeautifulSoup

# Shared, lazily-initialised preprocessing resources. Building them once
# (instead of once per token / per review) is what keeps preprocessing of
# tens of thousands of reviews tractable.
_REVIEW_STOPWORDS = None  # frozenset of English stopwords
_REVIEW_STEMMER = None    # reusable PorterStemmer instance


def _review_resources():
    """Return the (stopword set, stemmer) pair, creating it on first use."""
    global _REVIEW_STOPWORDS, _REVIEW_STEMMER
    if _REVIEW_STOPWORDS is None:
        # Make sure the stopword corpus is available before reading it.
        nltk.download("stopwords", quiet=True)
        _REVIEW_STEMMER = PorterStemmer()
        # Assigned last so that a failed corpus load is retried on the next call.
        _REVIEW_STOPWORDS = frozenset(stopwords.words("english"))
    return _REVIEW_STOPWORDS, _REVIEW_STEMMER


def review_to_words(review):
    """Convert one raw review into a list of stemmed, stopword-free tokens.

    Steps: strip HTML markup, lower-case, replace every character that is not
    an ASCII letter or digit with a space, split on whitespace, drop English
    stopwords and apply Porter stemming.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    list of str
        The stemmed tokens in their original order.
    """
    stop_words, stemmer = _review_resources()

    text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Lower-case, drop punctuation
    words = text.split() # Split string into words

    # Set membership is O(1) per token; the stemmer is reused for every token.
    return [stemmer.stem(w) for w in words if w not in stop_words]

In [7]:
import pickle

# Location of the pickled preprocessing result.
cache_dir='./cache'
cache_file='preprocessed_data.pkl'

# exist_ok=True creates the directory when missing and is a no-op otherwise,
# without the check-then-create race of a separate os.path.exists test.
os.makedirs(cache_dir, exist_ok=True)

# Stays None when no usable cache is found.
cache_data=None

# NOTE: pickle.load can execute arbitrary code while deserialising. Only load
# a cache file that this notebook wrote itself.
try:
    with open(os.path.join(cache_dir,cache_file),'rb') as f:
        cache_data=pickle.load(f)
    print('Read data from cache')
except (OSError, EOFError, pickle.UnpicklingError):
    # Missing, unreadable or truncated/corrupt cache file: fall back to
    # recomputing. Other exceptions (e.g. KeyboardInterrupt) are not swallowed.
    cache_data=None
    print('Have to preprocess the data')

Read data from cache


In [8]:
if cache_data is None:
    # Cache miss: tokenise every review (slow), then persist the result.
    train_data=[review_to_words(review) for review in trainX]
    test_data=[review_to_words(review) for review in testX]

    train_label=trainy
    # The test split must be paired with the *test* labels. Pairing it with
    # trainy gives the validation set labels that belong to other reviews.
    test_label=testy

    # Release the raw texts; only the processed form is used from here on.
    trainX=testX=trainy=testy=None

    cache_data=dict(train_data=train_data,
                   test_data=test_data,
                   train_label=train_label,
                   test_label=test_label)

    with open(os.path.join(cache_dir,cache_file),'wb') as f:
        pickle.dump(cache_data,f)

else:
    # Cache hit: unpack the previously stored result.
    # NOTE: a cache file written while test_label was assigned from trainy
    # holds wrong test labels -- delete it once so it gets rebuilt.
    train_data=cache_data['train_data']
    test_data=cache_data['test_data']
    train_label=cache_data['train_label']
    test_label=cache_data['test_label']

In [None]:
import numpy as np
from collections import Counter
# Total number of distinct indices the encoding may produce, including the two
# reserved ones (0 = padding, 1 = word not in the vocabulary).
vocab_size=20000

# All training tokens in a single flat list.
flattened_data=[token for review_tokens in train_data for token in review_tokens]

# Occurrence count of every token seen in the training reviews.
word_count=Counter(flattened_data)

# Tokens ordered from most to least frequent.
sorted_words=[token for token, _count in word_count.most_common()]

# The most frequent tokens receive indices 2, 3, ...; indices 0 and 1 stay
# reserved, hence only vocab_size-2 tokens get an entry.
word_dict={
    token: index
    for index, token in enumerate(sorted_words[:vocab_size-2], start=2)
}

In [None]:
import numpy as np
def convert_and_pad_data(word_dict,sentence,padding=500):
    """Encode a token sequence as a fixed-length vector of vocabulary indices.

    Parameters
    ----------
    word_dict : dict
        Maps a token to its integer index.
    sentence : sequence of str
        Tokens of one review; only the first ``padding`` tokens are used.
    padding : int, optional
        Length of the returned vector (default 500).

    Returns
    -------
    numpy.ndarray
        Float array of length ``padding``. Tokens missing from ``word_dict``
        are encoded as 1; positions past the end of ``sentence`` stay 0.
    """
    unknown_index=1
    kept_tokens=sentence[:padding]

    # Start from all zeros so that unused trailing positions act as padding.
    encoded=np.zeros(padding)
    encoded[:len(kept_tokens)]=[word_dict.get(token,unknown_index) for token in kept_tokens]
    return encoded
            

In [None]:
import pandas as pd
# Replace every token list by its fixed-length (maxlen) index vector.
# Slice assignment overwrites the existing list objects in place.
train_data[:]=[convert_and_pad_data(word_dict,tokens,maxlen) for tokens in train_data]
test_data[:]=[convert_and_pad_data(word_dict,tokens,maxlen) for tokens in test_data]

# One row per review, one column per token position.
train_data=pd.DataFrame(train_data)
test_data=pd.DataFrame(test_data)


In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense ,LSTM ,Input

In [None]:
# Binary sentiment classifier, layers applied in the order listed.
model=Sequential([
    # Maps each of the max_features word indices to a 128-dimensional vector.
    Embedding(max_features, 128),
    # 128-unit LSTM with dropout on both its inputs and its recurrent state.
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # Single sigmoid output unit.
    Dense(1, activation='sigmoid'),
])


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss (matches the
# single sigmoid output), and accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 15 epochs; the test split is passed as validation data, so it is
# evaluated after each epoch.
model.fit(
    x=train_data,
    y=train_label,
    validation_data=(test_data, test_label),
    epochs=15,
    batch_size=batch_size,
)

Train on 25000 samples, validate on 25000 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/15
Epoch 2/15
Epoch 3/15

In [None]:
# Persist the trained model to an HDF5 file in the working directory.
model.save(filepath='kerasimdb.h5')