# Word Tokenization

In [1]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Toy corpus for the tokenizer demo: near-duplicate sentences, one ending in
# punctuation, plus two single-word entries.
sentences = [
    'I love my cat',
    'I love my dog',
    'I love my dog!',
    'I have too many dogs',
    'I',
    'anticipate',
]

We saw "Teddy Roosevelt was a great President"

We saw "Teddy bears were at a discount"

In [3]:
# num_words = 10 keeps only word indices below 10 when texts are encoded, so the
# rarer words (here 'dogs' = 10 and 'anticipate' = 11) come out as the
# out-of-vocabulary token, which gets index 1.
tokenizer = Tokenizer(num_words = 10, oov_token='<UKW>')

In [4]:
tokenizer.fit_on_texts(sentences)

In [5]:
print(tokenizer.word_index)

{'<UKW>': 1, 'i': 2, 'love': 3, 'my': 4, 'dog': 5, 'cat': 6, 'have': 7, 'too': 8, 'many': 9, 'dogs': 10, 'anticipate': 11}


In [6]:
sequences = tokenizer.texts_to_sequences(sentences)

In [7]:
sequences

[[2, 3, 4, 6], [2, 3, 4, 5], [2, 3, 4, 5], [2, 7, 8, 9, 1], [2], [1]]

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [9]:
# No maxlen is passed, so every row is padded up to the longest sequence;
# padding='post' puts the zeros after the real tokens.
sequences = pad_sequences(sequences, padding='post', truncating='post')

In [10]:
sequences

array([[2, 3, 4, 6, 0],
       [2, 3, 4, 5, 0],
       [2, 3, 4, 5, 0],
       [2, 7, 8, 9, 1],
       [2, 0, 0, 0, 0],
       [1, 0, 0, 0, 0]], dtype=int32)

# Sentiment Analyzer

In [None]:
import pandas as pd
import numpy as np

In [None]:
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [None]:
data = pd.read_csv('IMDB Dataset.csv')

In [None]:
data.head()

In [None]:
data['sentiment'].unique()

In [None]:
from nltk.corpus import stopwords

In [None]:
english_stops = set(stopwords.words('english'))

In [None]:
#print(english_stops)

In [None]:
X_data = data['review']
y_data = data['sentiment']

In [None]:
# Remove HTML tags such as <br />. A space (not an empty string) is substituted
# so that the words on either side of a tag are not glued into one token; the
# extra whitespace is harmless because the reviews are later split on whitespace.
X_data = X_data.replace({r'<.*?>': ' '}, regex=True)  # remove html tag

In [None]:
X_data = X_data.replace({'[^A-Za-z]': ' '}, regex = True) # remove non alphabetical characters

In [None]:
# Tokenise on whitespace and drop stop words. The review is lowercased *before*
# filtering because the NLTK stop-word list is lowercase: without this,
# capitalised stop words ("The", "This", "It", ...) would never match and
# would survive into the vocabulary.
X_data = X_data.apply(
    lambda review: [w for w in review.lower().split() if w not in english_stops]
)  # Stopwords

In [None]:
X_data = X_data.apply(lambda review: [w.lower() for w in review]) # Convert to lowercase

In [None]:
# Encode the labels as integers: positive -> 1, negative -> 0.
# The explicit astype() pins the dtype to int64 instead of relying on
# Series.replace silently downcasting the object column (deprecated pandas
# behaviour), and it raises if any label other than the two expected ones is
# present. Re-running the cell on already-encoded labels is still a no-op.
y_data = y_data.replace({'positive': 1, 'negative': 0}).astype('int64')

In [None]:
#X_data[0:5]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_data, 
                                                    y_data, 
                                                    test_size = 0.2, 
                                                    random_state=12345)

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
def get_max_length(reviews=None):
    """Return the mean review length, rounded up to a whole number of tokens.

    Despite the name, this is the *average* length, not the maximum. The
    result is used as the common length reviews are padded/truncated to.

    Args:
        reviews: Iterable of tokenised reviews (each item must support
            ``len``). Defaults to the module-level ``X_train`` so existing
            zero-argument calls keep working.

    Returns:
        int: ``ceil(mean(len(review)))`` over all reviews.

    Raises:
        ValueError: If there are no reviews to measure.
    """
    if reviews is None:
        reviews = X_train

    lengths = [len(review) for review in reviews]
    if not lengths:
        # np.mean([]) is NaN and int(NaN) fails with an opaque message;
        # raise the same exception type with a readable explanation instead.
        raise ValueError('cannot compute a review length from an empty collection')

    return int(np.ceil(np.mean(lengths)))

In [None]:
# to encode text to int
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# lower=False: the review tokens were already lowercased during preprocessing,
# so the tokenizer is told not to lowercase them again.
token = Tokenizer(lower=False)

In [None]:
token.fit_on_texts(X_train)

In [None]:
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

In [None]:
# get_max_length() reads the global X_train, which at this point holds the
# integer-encoded sequences, so the value is the rounded-up mean sequence length.
max_length = get_max_length()

In [None]:
max_length

In [None]:
X_train = pad_sequences(X_train, 
                        maxlen=max_length, 
                        padding='post', 
                        truncating='post')

In [None]:
X_test = pad_sequences(X_test, 
                       maxlen=max_length, 
                       padding='post', 
                       truncating='post')

In [None]:
total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', X_train, '\n')
print('Encoded X Test\n', X_test, '\n')
print('Maximum review length: ', max_length)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
model = Sequential()

In [None]:
model.add(Embedding(total_words, 
                    32, 
                    input_length = max_length))

In [None]:
model.add(LSTM(256, 
               return_sequences=True))
model.add(LSTM(256,return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.compile(optimizer = 'adam', 
              loss = 'binary_crossentropy', 
              metrics = ['accuracy'])
print(model.summary())

In [None]:
# Train for 5 epochs, reporting loss/accuracy on the held-out split after each epoch.
# NOTE(review): the test split is used as validation data here and is evaluated
# again below, so the reported test metrics do not come from a separate,
# untouched set — consider carving a validation split out of the training data.
model.fit(X_train, 
          y_train,
          validation_data=(X_test,y_test), 
          epochs = 5)

In [None]:
y_test_pred = model.predict(X_test)

In [None]:
# Threshold the sigmoid outputs at 0.5: True means predicted positive (label 1),
# False means predicted negative (label 0).
y_test_pred = y_test_pred >= 0.5

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
confusion_matrix(y_pred = y_test_pred, y_true = y_test)

In [None]:
accuracy_score(y_pred = y_test_pred, y_true = y_test)