In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
directory = "/home/maverick/MovieSentiment/Main/imdb.csv"
data = pd.read_csv(directory)
stop_words = stopwords.words("english")
wordnet = WordNetLemmatizer()

def text_preproc(x):
	x = x.lower()
	x = " ".join([word for word in x.split(" ") if word not in stop_words])
	x = x.encode("ascii", "ignore").decode()
	x = re.sub(r"https*\S+", " ", x)
	x = re.sub(r"@\S+", " ", x)
	x = re.sub(r"#\S+", " ", x)
	x = re.sub(r"\",\w+", "", x)
	x = re.sub("[%s]" % re.escape(string.punctuation), " ", x)
	x = re.sub(r"\w*\d+\w*", "", x)
	x = re.sub(r"\s{2,}", " ", x)
	return x
	
final_data = []
data_to_list = data["review"].values.tolist()
for i in range(len(data_to_list)):
	final_data.append(text_preproc(data_to_list[i]))
print(list(final_data[:5]))

final_data = np.array(final_data)

labels = np.array(data["sentiment"])
l = []
for i in range(len(labels)):
	if labels[i]=="negative":
		l.append(0)
	elif labels[i]=="positive":
		l.append(1)
l = np.array(l)
labels = tf.keras.utils.to_categorical(l,2,dtype="int32")
del l

print(len(labels))

['one reviewers mentioned watching oz episode hooked right exactly happened me br br the first thing struck oz brutality unflinching scenes violence set right word go trust me show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use word br br it called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em city home many aryans muslims gangstas latinos christians italians irish more so scuffles death stares dodgy dealings shady agreements never far away br br i would say main appeal show due fact goes shows dare forget pretty pictures painted mainstream audiences forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready it watched more developed taste oz got accustomed high levels graphic violence violence injustice crooked guards who ll sold nickel inmates who ll kill order get away it well manner

In [3]:
import pickle
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split

In [4]:
max_words = 20000
max_len = 200

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(final_data)
sequences = tokenizer.texts_to_sequences(final_data)
tweets = pad_sequences(sequences, maxlen=max_len)
with open("tockenizer.pickle","wb") as handle:
	pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)	
print(tweets)
print(labels)

x_train, x_test, y_train, y_test = train_test_split(tweets,labels,random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=42)
print(len(x_train),len(x_val),len(x_test),len(y_train),len(y_val),len(y_test))

model = Sequential([
	layers.Embedding(max_words,128,input_length=max_len),
	layers.Bidirectional(layers.LSTM(64,return_sequences=True)),
	layers.Bidirectional(layers.LSTM(64)),
	layers.Dense(2,activation="softmax"),
])
model.summary()
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
checkpoint = ModelCheckpoint("model_best.hdf5", save_best_only=True, save_weights_only=False)
history = model.fit(x_train, y_train, epochs=3, validation_data=(x_val,y_val), callbacks=[checkpoint])

[[   0    0    0 ... 1136 4035  404]
 [   0    0    0 ... 1906   17  131]
 [   0    0    0 ...   61   16  267]
 ...
 [   0    0    0 ... 3923    2 5882]
 [   0    0    0 ... 4121  655  615]
 [   0    0    0 ... 4383  712    3]]
[[0 1]
 [0 1]
 [0 1]
 ...
 [1 0]
 [1 0]
 [1 0]]
28125 9375 12500 28125 9375 12500
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 128)          2560000   
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 128)          98816     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 2)                 258       
Total params: 2,757,890
Trainable params: 2,757,890
Non-trainable params: 0


In [5]:
model_best = tf.keras.models.load_model("model_best.hdf5")
test_loss, test_acc, = model_best.evaluate(x_test, y_test, verbose=2)
print("Model accuracy: {:.2f} %".format(100*test_acc))
predictions = model_best.predict(x_test)

391/391 - 6s - loss: 0.2732 - accuracy: 0.8847
Model accuracy: 88.47 %
