In [1]:
import pandas as pd

In [2]:
# Load the IMDB reviews CSV; later cells read its 'review' and 'sentiment' columns.
DATASET_PATH = 'IMDB Dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [3]:
# Variant A: bag-of-words features built with scikit-learn's CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# binary=True stores word presence (0/1) instead of occurrence counts, as uint8.
cv = CountVectorizer(dtype='uint8', binary=True)

# fit_transform learns the vocabulary and encodes the reviews in a single call.
# todense() expands the sparse result into a dense np.matrix, so indexing a single
# row (dataset_x[0]) still yields a 2-D (1 x n_words) object.
dataset_x = cv.fit_transform(df['review']).todense()

# Labels: 1 where the sentiment column equals 'positive', 0 otherwise.
is_positive = df['sentiment'] == 'positive'
dataset_y = is_positive.to_numpy(dtype='uint8')


In [3]:
# Variant B: the same binary bag-of-words encoding, built by hand without CountVectorizer.
import re

import numpy as np


def tokenize(text):
    """Return the lower-cased runs of ASCII letters/digits found in `text`, in order."""
    return re.findall('[a-zA-Z0-9]+', text.lower())


# Pass 1: collect every distinct token that occurs in any review.
vocab = set()
for text in df['review']:
    vocab.update(tokenize(text))

# Sort before numbering. Iteration order of a set of strings changes between interpreter
# runs (string hashing is randomized), so enumerating the raw set would assign each word
# a different column after every kernel restart.
vocab_dict = {word: index for index, word in enumerate(sorted(vocab))}

# Pass 2: one row per review, one column per vocabulary word; 1 marks "word occurs".
dataset_x = np.zeros((len(df), len(vocab)), dtype='uint8')
for row, text in enumerate(df['review']):
    words = tokenize(text)
    word_numbers = [vocab_dict[word] for word in words]
    dataset_x[row, word_numbers] = 1

# Labels: 1 where the sentiment column equals 'positive', 0 otherwise.
dataset_y = (df['sentiment'] == 'positive').to_numpy(dtype='uint8')

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% of the samples as the test set. A fixed random_state makes the shuffle,
# and therefore the train/test membership, identical on every run.
training_dataset_x, test_dataset_x, training_dataset_y, test_dataset_y = train_test_split(
    dataset_x, dataset_y, test_size=0.2, random_state=42
)

In [15]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense

In [19]:
# Create an empty Keras Sequential model named 'IMDB'.
model = Sequential(name='IMDB')

In [None]:
# Build the whole network in this cell so that re-running it starts from a fresh model.
# Calling model.add() again on an already-populated model would try to append the same
# layers a second time, and Keras rejects duplicate layer names within one Sequential.
model = Sequential(
    [
        Input((training_dataset_x.shape[1],)),           # one input per feature column
        Dense(128, activation='relu', name='Hidden-1'),
        Dense(128, activation='relu', name='Hidden-2'),
        Dense(1, activation='sigmoid', name='Output'),   # single score in (0, 1); label 1 = positive
    ],
    name='IMDB',
)
model.summary()

In [None]:
# Configure training: RMSprop optimizer, binary cross-entropy loss, binary accuracy metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

# Train for 5 epochs in mini-batches of 32. validation_split=0.2 sets aside a fifth of the
# training data for validation, which produces the val_* series plotted later.
hist = model.fit(
    x=training_dataset_x,
    y=training_dataset_y,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
)

In [22]:
import matplotlib.pyplot as plt

In [None]:
# Training vs. validation loss per epoch.
plt.figure(figsize=(14, 6))
plt.title('Epoch - Loss Graph', pad=10, fontsize=14)
plt.xlabel('Epoch')
plt.ylabel('Loss')
# Derive the ticks from the epochs that were actually run. A hard-coded range(0, 300, 10)
# assumes roughly 300 epochs; with only a handful of epochs just tick 0 lies inside the
# data range. Thinning to about 30 ticks keeps long runs readable (300 epochs -> step 10).
tick_step = max(1, len(hist.epoch) // 30)
plt.xticks(hist.epoch[::tick_step])
plt.plot(hist.epoch, hist.history['loss'])
plt.plot(hist.epoch, hist.history['val_loss'])
plt.legend(['Loss', 'Validation Loss'])
plt.show()

In [None]:
# Training vs. validation binary accuracy per epoch.
plt.figure(figsize=(14, 6))
plt.title('Epoch - Binary Accuracy Graph', pad=10, fontsize=14)
plt.xlabel('Epoch')
plt.ylabel('Binary accuracy')
# Derive the ticks from the epochs that were actually run. A hard-coded range(0, 300, 10)
# assumes roughly 300 epochs; with only a handful of epochs just tick 0 lies inside the
# data range. Thinning to about 30 ticks keeps long runs readable (300 epochs -> step 10).
tick_step = max(1, len(hist.epoch) // 30)
plt.xticks(hist.epoch[::tick_step])
plt.plot(hist.epoch, hist.history['binary_accuracy'])
plt.plot(hist.epoch, hist.history['val_binary_accuracy'])
plt.legend(['Accuracy', 'Validation Accuracy'])
plt.show()

In [None]:
# Score the trained model on the held-out test split, in batches of 32.
eval_result = model.evaluate(
    x=test_dataset_x,
    y=test_dataset_y,
    batch_size=32,
)

In [None]:
# Print each evaluation value next to the metric name the model reports at that position.
for position, value in enumerate(eval_result):
    print(f'{model.metrics_names[position]}: {value}')

In [29]:
# Prediction: load the reviews to classify; later cells read the 'review' column.
PREDICT_PATH = 'predict-imdb.csv'
predict_df = pd.read_csv(PREDICT_PATH)

In [None]:
# Encode the prediction reviews with the hand-built vocabulary (variant without CountVectorizer).
# uint8 matches the dtype of the training matrix built from the same vocabulary.
predict_dataset_x = np.zeros((len(predict_df), len(vocab)), dtype='uint8')
for row, text in enumerate(predict_df['review']):
    words = re.findall('[a-zA-Z0-9]+', text.lower())
    # Words that never appeared in the training reviews have no column in vocab_dict;
    # skip them instead of failing with KeyError.
    word_numbers = [vocab_dict[word] for word in words if word in vocab_dict]
    predict_dataset_x[row, word_numbers] = 1

In [31]:
# For CountVectorizer: encode the prediction reviews with the already-fitted vectorizer.
# toarray() yields a plain ndarray; todense() would yield np.matrix, a class NumPy no
# longer recommends using.
predict_dataset_x = cv.transform(predict_df['review']).toarray()

In [None]:
# Run the trained model on the encoded prediction reviews.
predict_result = model.predict(x=predict_dataset_x)

In [None]:
# Column 0 holds the model's single output per review; scores above 0.5 are reported as
# positive, everything else as negative.
for score in predict_result[:, 0]:
    print('Positive' if score > 0.5 else 'Negative')

In [None]:
# Turn the first review in dataset_x back into text (hand-built vocabulary variant).
# The encoding stores only which words occur, so the words are printed in column-index
# order, each at most once, rather than in their original order.
rev_vocab_dict = dict(zip(vocab_dict.values(), vocab_dict.keys()))

first_review_mask = dataset_x[0] == 1
word_indices = np.argwhere(first_review_mask).flatten()
words = []
for index in word_indices:
    words.append(rev_vocab_dict[index])
text = ' '.join(words)
print(text)

In [None]:
# Turn the first review in dataset_x back into text (CountVectorizer variant).
# The encoding stores only which words occur, so the words are printed in column-index
# order, each at most once, rather than in their original order.
import numpy as np

# cv.vocabulary_ maps word -> column index; invert it to look words up by column.
rev_vocab_dict = {index: word for word, index in cv.vocabulary_.items()}

# dataset_x[0] is 2-D (1 x n) when dataset_x is an np.matrix and 1-D when it is a plain
# ndarray. Flattening first handles both; taking column 1 of np.argwhere's result only
# works for the 2-D matrix form and raises IndexError for a 1-D row.
first_row = np.asarray(dataset_x[0]).ravel()
word_indices = np.flatnonzero(first_row == 1)
words = [rev_vocab_dict[index] for index in word_indices]
text = ' '.join(words)
print(text)