In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding
import pickle

In [3]:
from google.colab import files

uploaded = files.upload()

Saving IMDB Dataset.csv to IMDB Dataset.csv


In [4]:
df = pd.read_csv(r"/content/IMDB Dataset.csv")

print(df.head(10))

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
5  Probably my all-time favorite movie, a story o...  positive
6  I sure would like to see a resurrection of a u...  positive
7  This show was an amazing, fresh & innovative i...  negative
8  Encouraged by the positive comments about this...  negative
9  If you like original gut wrenching laughter yo...  positive


In [5]:
data = {
    "review": ["Great movie!", "Terrible film.", "Loved it!", "Not good."],
    "sentiment": ["positive", "negative", "positive", "negative"]
}

df = pd.DataFrame(data)
print("Before replacement:")
print(df)


df["sentiment"].replace({"positive": 1, "negative": 0}, inplace=True)
print("\nAfter replacement:")
print(df)

Before replacement:
           review sentiment
0    Great movie!  positive
1  Terrible film.  negative
2       Loved it!  positive
3       Not good.  negative

After replacement:
           review  sentiment
0    Great movie!          1
1  Terrible film.          0
2       Loved it!          1
3       Not good.          0


In [6]:
x = np.array(df["review"].values)
y = np.array(df["sentiment"].values)

In [7]:
x_filtered = []
punc = '''!()-[]{}:;'"\,<>.@#$%&^*_'''

trans_table = str.maketrans(punc, ' ' * len(punc))

for review in x:
  review = review.lower()
  review = review.translate(trans_table)
  x_filtered.append(review)

print(x_filtered)


['great movie ', 'terrible film ', 'loved it ', 'not good ']


In [8]:
vocabulary_size = 5000
one_hot_encoded = [one_hot(review, vocabulary_size)for review in x_filtered]
print(one_hot_encoded)

[[3804, 1296], [3943, 1342], [2142, 2617], [260, 2715]]


In [9]:
max_length = 500
x_padded = pad_sequences(one_hot_encoded,max_length,padding = "post")

In [10]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_padded,y, test_size = 0.3)

In [11]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
vocabulary_size = 5000
max_length = 500
embedded_vector_size = 35
model = Sequential()
model.add(Embedding(vocabulary_size, embedded_vector_size, input_length=max_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 35)           175000    
                                                                 
 lstm (LSTM)                 (None, 100)               54400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 229501 (896.49 KB)
Trainable params: 229501 (896.49 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [12]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(x_train, y_train, epochs=10, batch_size=64, validation_data=(x_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
score = model.evaluate(x_test, y_test)
print("Loss:", score[0])
print("Accuracy:", score[1])

Loss: 0.6931514739990234
Accuracy: 0.5


In [14]:
model.save("IMBD_sentiment_analysis")

In [15]:
from keras.models import load_model

trained_model = load_model('/content/IMBD_sentiment_analysis')

print(trained_model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 35)           175000    
                                                                 
 lstm (LSTM)                 (None, 100)               54400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 229501 (896.49 KB)
Trainable params: 229501 (896.49 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(x_filtered)
with open('/content/tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
import pickle
from keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

def get_sentiment(sentence: str):
    if isinstance(sentence, str):
        sentence = sentence.lower()

        punc = '''!()-[]{}:;'"\,<>.@#$%&^*_'''
        for char in sentence:
            if char in punc:
                sentence = sentence.replace(char, " ")


        trained_model = load_model("/content/IMBD_sentiment_analysis")


        with open('/content/tokenizer.pkl', 'rb') as handle:
            tokenizer = pickle.load(handle)


        sequences = tokenizer.texts_to_sequences([sentence])
        padded_sequence = pad_sequences(sequences, maxlen=500)
        predicted = trained_model.predict(padded_sequence)
        sentiment = 1 if predicted > 0.5 else 0


        if sentiment == 1:
            print("Positive")
        else:
            print("Negative")

        return sentiment
    else:
        raise Exception("Input needs to be of type 'str'")


In [20]:
result = get_sentiment("That movie was good")
print(result)


Positive
1
