# Parsing the IMDb movie review dataset

In [1]:
# Based on
# https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/6.1-using-word-embeddings.ipynb
# https://machinelearningmastery.com/develop-word-embeddings-python-gensim/

In [2]:
import warnings
# Suppress every Python warning for the rest of the session (presumably to keep
# cell output uncluttered).
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [3]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# NOTE(review): %pylab additionally populates the interactive namespace with
# numpy and matplotlib names (see the message it prints), which can shadow
# other names; explicit imports are easier to trace.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
import tensorflow as tf
# Only let TensorFlow log messages of level ERROR through.
# NOTE(review): tf.logging is the TensorFlow 1.x logging API (this notebook ran
# on 1.8.0, see the printed version); it is not available under this name in 2.x.
tf.logging.set_verbosity(tf.logging.ERROR)
# Record the TensorFlow version the notebook was executed with.
print(tf.__version__)

1.8.0


### Download data:
* Original IMDb dataset (Large Movie Review Dataset) from http://ai.stanford.edu/~amaas/data/sentiment/

In [5]:
# !ls -l C:/Users/olive/Development/data/aclImdb

In [6]:
# !ls -l C:/Users/olive/Development/data/aclImdb/train

### Load Database

In [7]:
import os

# Root of the extracted aclImdb archive. The location can be overridden with the
# IMDB_DIR environment variable; the default is the original local path.
imdb_dir = os.environ.get('IMDB_DIR', 'C:/Users/olive/Development/data/aclImdb')
train_dir = os.path.join(imdb_dir, 'train')

# Parallel lists: labels[i] is the sentiment of texts[i] (0: neg, 1: pos).
labels = []
texts = []

# Enumerating ['neg', 'pos'] yields the numeric label directly: neg -> 0, pos -> 1.
for label, label_type in enumerate(['neg', 'pos']):
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir() returns entries in arbitrary order; sorting keeps the order
    # of texts/labels reproducible between runs and platforms.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):
            continue
        # The context manager closes the file even if reading or decoding fails.
        with open(os.path.join(dir_name, fname), encoding='UTF-8') as f:
            texts.append(f.read())
        labels.append(label)

In [8]:
len(texts)

25000

In [9]:
texts[0]

"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly."

In [10]:
# 0: neg, 1: pos
labels[0]

0

In [11]:
texts[15000]

"Kate Beckinsale steals the show! Bravo! Too bad Knightly ins't as good looking as Jeremy Northam. Mark Strong did a fabulous job. Bernard Hepton was perfect as Emmas father. I love the end scene (which is an addition to the novel-but well written) when the harvest is in and Knightly dines with his workers and high society friends. Emma must show that she accepts this now. She is a changed woman. That is too much too quick, but OK. I'll buy into it. Samantha Bond plays Emma's ex-governess and confidant. She is wonderful. just as I would have imagined her. I believe that when the UK does a Jane Austen its the best. American versions of English literature are done for money and not for quality. See this one!"

In [12]:
labels[15000]

1

### Transform each review into a sequence of exactly 500 word indices, using a vocabulary limited to the 10,000 most frequent words

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Preprocessing hyper-parameters:
#   maxlen    - number of entries every review is brought to by pad_sequences
#   max_words - vocabulary cap handed to the Tokenizer as num_words
#               (per the Keras docs only the num_words-1 most frequent words
#               are kept, since index 0 is reserved)
maxlen, max_words = 500, 10000

# Build the word index from the training reviews.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

Using TensorFlow backend.


In [19]:
# Tokenizer?

In [22]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'he': 26,
 'be': 27,
 'one': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'who': 34,
 'so': 35,
 'from': 36,
 'like': 37,
 'her': 38,
 'or': 39,
 'just': 40,
 'about': 41,
 "it's": 42,
 'out': 43,
 'has': 44,
 'if': 45,
 'some': 46,
 'there': 47,
 'what': 48,
 'good': 49,
 'more': 50,
 'when': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'she': 56,
 'even': 57,
 'my': 58,
 'would': 59,
 'which': 60,
 'only': 61,
 'story': 62,
 'really': 63,
 'see': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'were': 68,
 'me': 69,
 'well': 70,
 'than': 71,
 'we': 72,
 'much': 73,
 'been': 74,
 'bad': 75,
 'get': 76,
 'will': 77,
 'do': 78,
 'also': 79,
 'into': 80,
 'people': 81,
 'other': 82,
 '

In [17]:
# tokenizer.texts_to_matrix?

In [71]:
binary_martix = tokenizer.texts_to_matrix(texts, mode='binary')
binary_martix

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.]])

In [74]:
len(binary_martix)

25000

In [75]:
len(binary_martix[0])

10000

In [72]:
count_matrix = tokenizer.texts_to_matrix(texts, mode='count')
count_matrix

array([[ 0.,  4.,  1., ...,  0.,  0.,  0.],
       [ 0., 53.,  0., ...,  0.,  0.,  0.],
       [ 0., 10.,  3., ...,  0.,  0.,  0.],
       ...,
       [ 0., 23.,  7., ...,  0.,  0.,  0.],
       [ 0.,  9.,  4., ...,  0.,  0.,  0.],
       [ 0.,  9.,  7., ...,  0.,  0.,  0.]])

In [77]:
len(count_matrix)

25000

In [79]:
# Number of columns (vocabulary slots) of the count matrix.
len(count_matrix[0])

10000

In [81]:
tfidf_matrix = tokenizer.texts_to_matrix(texts, mode='tfidf')
tfidf_matrix

array([[0.        , 1.66394589, 0.71027668, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 3.4657488 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 2.30286883, 1.49059538, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 2.88365037, 2.09241129, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 2.2294017 , 1.69492924, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 2.2294017 , 2.09241129, ..., 0.        , 0.        ,
        0.        ]])

In [18]:
# tokenizer.texts_to_sequences?

In [24]:
sequences = tokenizer.texts_to_sequences(texts)

In [25]:
len(sequences)

25000

In [26]:
len(sequences[0])

104

In [27]:
sequences[0]

[62,
 4,
 3,
 129,
 34,
 44,
 7576,
 1414,
 15,
 3,
 4252,
 514,
 43,
 16,
 3,
 633,
 133,
 12,
 6,
 3,
 1301,
 459,
 4,
 1751,
 209,
 3,
 7693,
 308,
 6,
 676,
 80,
 32,
 2137,
 1110,
 3008,
 31,
 1,
 929,
 4,
 42,
 5120,
 469,
 9,
 2665,
 1751,
 1,
 223,
 55,
 16,
 54,
 828,
 1318,
 847,
 228,
 9,
 40,
 96,
 122,
 1484,
 57,
 145,
 36,
 1,
 996,
 141,
 27,
 676,
 122,
 1,
 411,
 59,
 94,
 2278,
 303,
 772,
 5,
 3,
 837,
 20,
 3,
 1755,
 646,
 42,
 125,
 71,
 22,
 235,
 101,
 16,
 46,
 49,
 624,
 31,
 702,
 84,
 702,
 378,
 3493,
 2,
 8422,
 67,
 27,
 107,
 3348]

In [28]:
# Convert the Python list of labels into a NumPy array and bring every
# review sequence to the same length of maxlen entries.
labels = np.asarray(labels)
data = pad_sequences(sequences, maxlen=maxlen)

# Sanity check: one row per review, maxlen columns, one label per review.
print('Shape of data tensor:', np.shape(data))
print('Shape of label tensor:', np.shape(labels))

Shape of data tensor: (25000, 500)
Shape of label tensor: (25000,)


In [29]:
len(data[0])

500

In [30]:
# Index 0 is reserved for padding (the tokenizer's word_index starts at 1), so
# the leading zeros are filler that brings this 104-token review up to 500 entries.
data[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

### Split data into training and test sets, making sure both sets are class-balanced

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
# Plain random 80/20 split with a fixed seed; nothing here forces the class
# ratio of the full data set to be preserved in the two parts.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

In [33]:
x_train.shape

(20000, 500)

In [34]:
y_train.shape

(20000,)

In [35]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([ 9985, 10015], dtype=int64))

In [36]:
np.unique(y_test, return_counts=True)

(array([0, 1]), array([2515, 2485], dtype=int64))

### Use `stratify` to keep the class balance in both splits

In [37]:
# Same 80/20 split and seed as before, but stratified on the labels so that
# training and test set keep the class proportions of the full data set.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [38]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([10000, 10000], dtype=int64))

In [39]:
np.unique(y_test, return_counts=True)

(array([0, 1]), array([2500, 2500], dtype=int64))