In [0]:
import pandas as pd
import keras

Using TensorFlow backend.


In [0]:
from google.colab import files
uploaded = files.upload()

Saving imdb-dataset-of-50k-movie-reviews.zip to imdb-dataset-of-50k-movie-reviews.zip


In [0]:
!unzip imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [0]:
df = pd.read_csv("IMDB Dataset.csv")

In [0]:
df["review"] = df["review"].str.replace("<br />", "")

In [0]:
from tqdm import tqdm
import nltk
import numpy as np
import string
from sklearn.model_selection import train_test_split
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
flatten = lambda l: [item for sublist in l for item in sublist]

def ngrams(w,n):
  if len(w) < n:
    return [w]
  else:
    return [w[i:i+n] for i in range(len(w) - n + 1)]

def remove_punct(sent):
  new_sent = []
  for word in sent:
    if word not in string.punctuation:
      new_sent.append(word)
  return new_sent

def prepare_doc(doc):
  prepared_doc = []
  sentences = nltk.sent_tokenize(doc)
  for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    without_stop_words = [word.lower() for word in words]
    prepared_doc.extend(flatten([ngrams(word, 5) for word in remove_punct(without_stop_words)]))
  return prepared_doc

In [0]:
reviews = [prepare_doc(x) for x in tqdm(df["review"].tolist())]

100%|██████████| 50000/50000 [02:07<00:00, 393.21it/s]


In [0]:
reviews_len = []
for review in reviews:
  reviews_len.append(len(review))

In [0]:
reviews_len = np.array(reviews_len)

In [0]:
reviews_len.mean() + reviews_len.std()

724.6684853078232

In [0]:
def sentiment_to_label(x):
  if x == "positive":
    return 1.0
  else:
    return 0.0

In [0]:
labels = [sentiment_to_label(x) for x in df["sentiment"].tolist()]

In [0]:
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.8, random_state=42)

In [0]:
vocab = {}
i = 1
for doc in tqdm(X_train):
  for token in doc:
    if token not in vocab.keys():
      vocab[token] = i
      i += 1

100%|██████████| 10000/10000 [00:00<00:00, 10314.18it/s]


In [0]:
def tokens_to_ids(tokens, vocab, input_len=1000):
  ids = []
  for token in tokens:
    if len(ids) >= input_len:
      break
    if token in vocab.keys():
      ids.append(vocab[token])
    else:
      ids.append(0)
  if len(ids) < input_len:
    ids.extend([0]* (input_len - len(ids)))
  return ids

In [0]:
X_train_ids = [tokens_to_ids(x, vocab) for x in tqdm(X_train)]

100%|██████████| 10000/10000 [00:02<00:00, 4880.49it/s]


In [0]:
X_train_ids = np.array(X_train_ids)

In [0]:
y_train = np.array(y_train).reshape(-1,1)

In [0]:
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.models import Sequential

In [0]:
model = Sequential()
model.add(Embedding(len(vocab)+1, 16, input_length=1000))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(128))
model.add(Dense(1))
model.compile(optimizer='adam', loss='binary_crossentropy')

In [0]:
model.fit(X_train_ids, y_train, batch_size=128)

Epoch 1/1


<keras.callbacks.History at 0x7f468376fa20>

In [0]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1000, 16)          1914336   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 1000, 256)         148480    
_________________________________________________________________
bidirectional_8 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense_7 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 129       
Total params: 2,490,081
Trainable params: 2,490,081
Non-trainable params: 0
_________________________________________________________________


In [0]:
model.predict(X_train_ids[3].reshape(1,-1))

array([[0.9737596]], dtype=float32)

In [0]:
model_emb = Sequential()
model_emb.add(Embedding(len(vocab)+1, 16, input_length=1000))
model_emb.compile(optimizer='adam', loss='binary_crossentropy')

In [0]:
model_emb.layers[0].set_weights(model.layers[0].get_weights())

In [0]:
model_emb.predict(X_train_ids[0].reshape(1,-1))

array([[[-0.02312276,  0.0455244 , -0.00083274, ...,  0.01696844,
          0.05102143,  0.02526811],
        [ 0.00640948, -0.03931662, -0.01058051, ..., -0.02149201,
         -0.01518017,  0.04120579],
        [ 0.00143717,  0.03739033,  0.02504368, ...,  0.01039382,
         -0.03449389,  0.0270608 ],
        ...,
        [-0.00014157,  0.03968099,  0.00241019, ..., -0.02121228,
          0.03874608, -0.00407849],
        [-0.00014157,  0.03968099,  0.00241019, ..., -0.02121228,
          0.03874608, -0.00407849],
        [-0.00014157,  0.03968099,  0.00241019, ..., -0.02121228,
          0.03874608, -0.00407849]]], dtype=float32)