In [0]:
import numpy as np
import pandas as pd
import logging
from keras import Sequential
from keras.layers import Dense, Dropout, LSTM , Embedding, Bidirectional, GlobalMaxPool1D
from keras.optimizers import Adam, SGD
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from dictionary import Dictionary
from text_util import Text_Util

# Pipeline switch read by the cells below: True selects the TF-IDF features and
# the dense network, False selects padded token sequences and the BiLSTM network.
tfidf = False

# Surface INFO-level progress messages emitted through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [29]:
# Load the IMDB train and test reviews. Each split directory holds a pos.csv and
# a neg.csv, read as header-less files with the columns (review, c).
def _load_imdb_split(split_dir):
    """Read `pos.csv` and `neg.csv` from `split_dir` and stack them, positives first.

    Args:
        split_dir: directory containing the two CSV files (e.g. './train').

    Returns:
        A DataFrame with the columns 'review' and 'c'. Row indices are kept as
        read from each file, so index values repeat across the two halves.
    """
    columns = ['review', 'c']
    frames = [pd.read_csv(f'{split_dir}/{name}.csv', names=columns) for name in ('pos', 'neg')]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement and, like append, preserves each frame's own index.
    return pd.concat(frames)


imdb_train = _load_imdb_split('./train')
imdb_test = _load_imdb_split('./test')

# Plain numpy arrays of review texts (x) and their labels (y).
imdb_x_train = imdb_train.review.values
imdb_y_train = imdb_train.c.values
imdb_x_test = imdb_test.review.values
imdb_y_test = imdb_test.c.values

# ---- Preprocess the reviews and turn them into model inputs -------------------
# Produces imdb_x_train / imdb_x_val / imdb_x_test (plus the matching y arrays)
# either as TF-IDF matrices (tfidf=True) or as padded integer sequences.
logger.info('### Creating dictionary...')
sorted_labels = np.unique(imdb_y_train)
dic = Dictionary(sorted_labels)

logger.info('### Preprocessing text...')
text_util = Text_Util()
imdb_x_train = text_util.get_preprocessed_tokenized_sentences(imdb_x_train)
# Both pipelines consume the test reviews as space-joined token strings, so
# prepare them once here instead of repeating the work in each branch.
imdb_x_test = list(map(" ".join, text_util.get_preprocessed_tokenized_sentences(imdb_x_test)))

if tfidf:
    logger.info('### Updating dictionary...')
    for tokens, label in zip(imdb_x_train, imdb_y_train):
        dic.update_tokenized(tokens, label)

    # Per-label vocabulary budget passed to the Dictionary helpers below.
    n_unique_words = 1000
    n_top_words = 5000

    # Union, over all labels, of the words returned by both helpers. Kept as a
    # dict (word -> True) so membership tests are O(1).
    selected_words = {}
    for label in sorted_labels:
        u_list = dic.get_n_words_unique_to_label(label, n_unique_words)
        o_list = dic.get_n_top_words_given_label(label, n_top_words)
        for word in (*u_list, *o_list):
            selected_words[word] = True

    print(f"Size of selected words set {len(selected_words)}")

    # Drop every training token outside the selected vocabulary and re-join the
    # survivors into one string per review, the input format TfidfVectorizer expects.
    imdb_x_train = [
        " ".join(word for word in tokens if word in selected_words)
        for tokens in imdb_x_train
    ]

    # Fit on the training texts only; the test texts are projected onto the
    # vocabulary learned there.
    vectorizer = TfidfVectorizer()
    imdb_x_train = vectorizer.fit_transform(imdb_x_train)
    imdb_x_test = vectorizer.transform(imdb_x_test)
else:
    logger.info('### Preprocessing for word2vec embeddings...')
    imdb_x_train = list(map(" ".join, imdb_x_train))

    max_words = 6000  # vocabulary cap for the Tokenizer; also the Embedding input size in the model cell
    max_len = 130     # every review is padded/truncated to this many token ids

    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(imdb_x_train)
    imdb_x_train = pad_sequences(tokenizer.texts_to_sequences(imdb_x_train), maxlen=max_len)
    imdb_x_test = pad_sequences(tokenizer.texts_to_sequences(imdb_x_test), maxlen=max_len)

# Hold out 20% of the training data for validation, stratified by label. A fixed
# seed makes the split (and therefore the reported metrics) repeatable across runs.
split_seed = 42
imdb_x_train, imdb_x_val, imdb_y_train, imdb_y_val = train_test_split(
    imdb_x_train, imdb_y_train,
    test_size=.2, shuffle=True, stratify=imdb_y_train, random_state=split_seed)
logger.info(f'### imdb_x_train.shape {imdb_x_train.shape}')
logger.info(f'### imdb_x_val.shape {imdb_x_val.shape}')

INFO:__main__:### Creating dictionary...
INFO:__main__:### Preprocessing text...
INFO:__main__:### Preprocessing for word2vec embeddings...
INFO:__main__:### imdb_x_train.shape (20000, 130)
INFO:__main__:### imdb_x_val.shape (5000, 130)


In [34]:
# Build the network that matches the active feature pipeline; compiling and
# fitting are identical for both, so they happen once after the branch.
if tfidf:
    # Feed-forward net over the TF-IDF features: two ReLU hidden layers, each
    # followed by 10% dropout, then a single sigmoid output unit.
    classifier = Sequential([
        Dense(32, activation='relu', kernel_initializer='random_normal', input_dim=imdb_x_train.shape[1]),
        Dropout(rate=0.1),
        Dense(64, activation='relu', kernel_initializer='random_normal'),
        Dropout(rate=0.1),
        Dense(1, activation='sigmoid', kernel_initializer='random_normal'),
    ])
    batch_size, epochs = 64, 4
else:
    # Sequence model over the padded token ids: learned embedding -> BiLSTM
    # (one output per timestep) -> max over time -> small dense head with a
    # sigmoid output unit.
    embed_size = 128
    classifier = Sequential([
        Embedding(max_words, embed_size),
        Bidirectional(LSTM(32, return_sequences=True)),
        GlobalMaxPool1D(),
        Dense(20, activation="relu"),
        Dropout(0.05),
        Dense(1, activation="sigmoid"),
    ])
    batch_size, epochs = 64, 1

# Binary classification: cross-entropy loss, Adam optimizer, accuracy reported
# on the validation split after every epoch.
opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999)
classifier.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
classifier.fit(
    imdb_x_train, imdb_y_train,
    batch_size=batch_size, epochs=epochs,
    validation_data=(imdb_x_val, imdb_y_val),
)

# Score the held-out test set. The network ends in a single sigmoid unit, so
# predict() yields one score per review in a column of shape (n_samples, 1).
imdb_y_prob = classifier.predict(imdb_x_test)
# classification_report documents y_pred as a 1-D array of labels: threshold the
# scores at 0.5, cast to int so the values line up with the 0/1 classes shown in
# the report, and flatten the column.
imdb_y_pred = (imdb_y_prob > 0.5).astype(int).ravel()
print(classification_report(imdb_y_test, imdb_y_pred))

Train on 20000 samples, validate on 5000 samples
Epoch 1/1
              precision    recall  f1-score   support

           0       0.83      0.91      0.87     12500
           1       0.90      0.82      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.87      0.86      0.86     25000
weighted avg       0.87      0.86      0.86     25000

