<a href="https://colab.research.google.com/github/GabrielJuniorNdlovu/movie-rating-text-classification/blob/main/movie_rating_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation and setup

In [None]:
!pip install tensorflow
!pip install numpy
from tensorflow import keras
import numpy as np

# Data pre-processing

In [3]:
# Alias the IMDB dataset module bundled with Keras; later cells call
# data.load_data() and data.get_word_index() on it.
data = keras.datasets.imdb

In [5]:
# Load the dataset (num_words=10000) and unpack each split into its
# reviews and labels.
train_split, test_split = data.load_data(num_words=10000)
train_data, train_labels = train_split
test_data, test_labels = test_split

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Show the first training review as loaded, i.e. still encoded as numbers
# rather than words.
print(train_data[0])

In [13]:
# Fetch the dataset's word -> number dictionary; the cells below use it to
# translate encoded reviews back into words.
word_index = data.get_word_index()

In [14]:
# Shift every word id up by 3 so that ids 0-3 are free for the special
# tokens assigned below.
# The mapping is rebuilt from data.get_word_index() rather than from the
# current word_index, so re-running this cell cannot shift the ids a second
# time (which would make decoded reviews come out wrong).
word_index = {word: idx + 3 for word, idx in data.get_word_index().items()}
word_index['<PAD>'] = 0
word_index['<START>'] = 1
word_index['<UNK>'] = 2
word_index['<UNUSED>'] = 3



In [15]:
# Invert word_index so that a number can be looked up to recover its word.
reverse_word_index = {token_id: token for token, token_id in word_index.items()}

In [34]:
# Pad both splits with the <PAD> id at the end of each review
# (padding='post') so that all reviews share a common length (maxlen=250).
train_data, test_data = (
    keras.preprocessing.sequence.pad_sequences(
        reviews, value=word_index['<PAD>'], padding='post', maxlen=250
    )
    for reviews in (train_data, test_data)
)

In [21]:
# Turn an encoded review back into readable text.
def decode_review(text):
    """Return the words for the ids in ``text``, joined by single spaces.

    Each id is looked up in ``reverse_word_index``; ids that are not present
    there are rendered as '?'.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)

In [None]:
# Decode the first review of each split, one per line, to check the mapping.
print(decode_review(train_data[0]), decode_review(test_data[0]), sep='\n')

# Model

In [27]:
# Classifier: embedding -> average pooling -> dense ReLU layer -> single
# sigmoid output unit.
model = keras.Sequential([
    keras.layers.Embedding(10000, 16),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Print an overview of the layers that were added to the model above.
model.summary()

In [32]:
# Configure training: binary cross-entropy loss, Adam optimizer, and
# accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Training

In [35]:
# Hold out the first 10000 training examples for validation and train on
# the remainder; reviews and labels are sliced at the same position so they
# stay aligned.
x_val, x_train = train_data[:10000], train_data[10000:]
y_val, y_train = train_labels[:10000], train_labels[10000:]

In [None]:
# Train for 40 epochs in batches of 512, evaluating on the held-out
# validation split; the object returned by fit() is kept in fitModel.
fitModel = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=40,
    batch_size=512,
    verbose=1,
)

In [None]:
# Evaluate the trained model on the test split; the model was compiled with
# a binary cross-entropy loss and an accuracy metric.
results = model.evaluate(test_data, test_labels)

In [None]:
# Display the values returned by model.evaluate() on the test split.
print(results)

In [None]:
# Run the model on a single test review and compare the prediction with the
# review's true label.
test_review = test_data[0]
# predict() expects a batch of samples. Wrapping the 1-D review in a Python
# list does not add a batch axis (a list is interpreted as a collection of
# separate model inputs), so add the leading batch axis explicitly.
predict = model.predict(np.expand_dims(test_review, axis=0))
print('Review: ')
print(decode_review(test_review))
print('Prediction: ' + str(predict[0]))
print('Actual: ' + str(test_labels[0]))