<a href="https://colab.research.google.com/github/CSSamarasinghe/SE4050_Assignment/blob/main/CNN_imdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Load the IMDb dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)




In [6]:
# Pad sequences to ensure all input data is of the same length
max_length = 200
X_train_padded = pad_sequences(X_train, maxlen=max_length)
X_test_padded = pad_sequences(X_test, maxlen=max_length)


In [7]:
# Build the CNN model
model = Sequential()

# Embedding layer
model.add(Embedding(input_dim=10000, output_dim=128))  # Removed input_length as per warning

# 1D Convolutional Layer
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))

# Global Max Pooling
model.add(GlobalMaxPooling1D())

# Fully connected layer
model.add(Dense(128, activation='relu'))

# Dropout to prevent overfitting
model.add(Dropout(0.5))

# Output layer for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Build the model by calling it with an input shape
model.build(input_shape=(None, max_length))

# Print the model summary
model.summary()

In [8]:
history = model.fit(X_train_padded, y_train, epochs=10, batch_size=64, validation_split=0.2)


Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 169ms/step - accuracy: 0.6389 - loss: 0.6013 - val_accuracy: 0.8662 - val_loss: 0.3173
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 166ms/step - accuracy: 0.9109 - loss: 0.2265 - val_accuracy: 0.8888 - val_loss: 0.2714
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 165ms/step - accuracy: 0.9779 - loss: 0.0803 - val_accuracy: 0.8920 - val_loss: 0.3138
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 167ms/step - accuracy: 0.9953 - loss: 0.0214 - val_accuracy: 0.8928 - val_loss: 0.4024
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 168ms/step - accuracy: 0.9995 - loss: 0.0050 - val_accuracy: 0.8942 - val_loss: 0.4254
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 169ms/step - accuracy: 1.0000 - loss: 0.0012 - val_accuracy: 0.8942 - val_loss: 0.4654
Epoch 7/10

In [9]:
test_loss, test_acc = model.evaluate(X_test_padded, y_test)
print(f"Test Accuracy: {test_acc:.4f}")


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 24ms/step - accuracy: 0.8840 - loss: 0.5816
Test Accuracy: 0.8846


In [19]:
# Example to predict sentiment on a new review
def predict_sentiment(review):
    # Convert the review to a sequence of integers and pad
    review_seq = imdb.get_word_index()
    tokens = [review_seq.get(word, 2) for word in review.lower().split()]  # 2 is the default for unknown words
    padded_tokens = pad_sequences([tokens], maxlen=max_length)

    # Predict the sentiment (1 = positive, 0 = negative)
    prediction = model.predict(padded_tokens)[0][0]
    return 'Positive' if prediction > 0.5 else 'Negative'





In [20]:
# Example usage
print(predict_sentiment("i do not like this, agly!"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Negative
