<a href="https://colab.research.google.com/github/FaizaJaseema/Sentimental-Analysis-of-IMDB-Dataset-using-LSTM/blob/main/sentiment_analysis_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import pandas as pd


# Loading and tokenizing the dataset

In [None]:
# Read the IMDB reviews CSV (adjust the path as needed)
df = pd.read_csv('a1_IMDB_Dataset.csv')  # Replace with the correct path

# Preview the first rows; the rest of the notebook reads the 'review' and 'sentiment' columns
print(df.head(10))

# Vocabulary cap handed to the tokenizer (num_words)
max_features = 2000
review_texts = df['review'].values

# Fit the word index on every review
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(review_texts)

# Turn each review into a list of word indices, then pad them into one 2-D array
X = pad_sequences(tokenizer.texts_to_sequences(review_texts))


# Model parameters and baseline architecture

In [None]:
# Import from tensorflow.keras, like the first cell, so that layer classes from
# two different Keras distributions are never mixed in one model.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input

# Hyperparameters of the baseline network
embed_dim = 64  # size of each word-embedding vector
lstm_out = 16   # number of LSTM units

# The vocabulary size is `max_features`, defined once in the tokenizer cell;
# it is deliberately not redefined here so the two values cannot drift apart.

# Sequence length = number of columns of the padded matrix X
input_length = X.shape[1]

# Build the model, declaring the input shape with an explicit Input() layer
model = Sequential()
model.add(Input(shape=(input_length,)))

# Word indices -> dense vectors
model.add(Embedding(max_features, embed_dim))

# Recurrent layer that reads the embedded sequence
model.add(LSTM(lstm_out))

# Single sigmoid unit: probability of the positive class
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()


# Target labels

In [None]:
# The 'sentiment' column holds the strings 'positive' / 'negative'. The sigmoid +
# binary_crossentropy model and the later `y_true == 1` checks need 0/1 integers,
# so encode the labels explicitly.
label_to_id = {'negative': 0, 'positive': 1}
Y = df['sentiment'].map(label_to_id)

# Any label outside the mapping becomes NaN; fail loudly instead of training on it.
assert Y.notna().all(), "Unexpected value in the 'sentiment' column"
Y = Y.astype(int).values

# Train and Test Shape

In [None]:
# Hold out 10% of the reviews for testing. A fixed random_state makes the split
# (and every metric computed from it) reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

# Model Training

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input

# Define the model
lstm_model = Sequential()

# Input length is taken from the padded training matrix.
lstm_model.add(Input(shape=(X_train.shape[1],)))

# The notebook never builds a pretrained embedding matrix, so the embedding is
# learned from scratch. The tokenizer was created with num_words=max_features,
# so max_features is used as the embedding's input_dim.
embedding_layer = Embedding(input_dim=max_features,
                            output_dim=100)

# Add embedding layer to the model
lstm_model.add(embedding_layer)

# Add LSTM layer
lstm_model.add(LSTM(128))

# Add Dense output layer: one sigmoid unit for the binary sentiment label
lstm_model.add(Dense(1, activation='sigmoid'))

# Compile the model
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None, so it is not wrapped in print()
lstm_model.summary()

# Train on the arrays produced by train_test_split; 20% of the training rows
# are held back by Keras as a validation set.
lstm_model_history = lstm_model.fit(X_train, Y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

# evaluate() returns [loss, accuracy] because one metric ('acc') was compiled in
score = lstm_model.evaluate(X_test, Y_test, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

# Accuracy per epoch. The second curve is the validation split, not the test set.
plt.plot(lstm_model_history.history['acc'])
plt.plot(lstm_model_history.history['val_acc'])

plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

# Loss per epoch
plt.plot(lstm_model_history.history['loss'])
plt.plot(lstm_model_history.history['val_loss'])

plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()


In [None]:
# Predict on the same arrays that train_test_split produced.
predictions_nn_train = lstm_model.predict(X_train)
predictions_nn_test = lstm_model.predict(X_test)

# Threshold the sigmoid outputs at 0.5 and flatten to 1-D so the predictions
# line up element-wise with the 1-D label arrays.
predictions_nn_train = (predictions_nn_train > 0.5).astype(int).ravel()
predictions_nn_test = (predictions_nn_test > 0.5).astype(int).ravel()

# Calculate accuracy
train_accuracy = accuracy_score(Y_train, predictions_nn_train)
test_accuracy = accuracy_score(Y_test, predictions_nn_test)

print('Train accuracy:', train_accuracy)
print('Test accuracy:', test_accuracy)


# Confusion Matrix

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


def _show_confusion_matrix(matrix, title):
    """Draw one confusion matrix as an annotated heatmap and display it."""
    class_names = ['Negative', 'Positive']
    plt.figure(figsize=(6, 5))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()


# Training split: true labels vs. thresholded predictions
cm_train = confusion_matrix(Y_train, predictions_nn_train)
_show_confusion_matrix(cm_train, 'Confusion Matrix for LSTM - Train Set')

# Test split
cm_test = confusion_matrix(Y_test, predictions_nn_test)
_show_confusion_matrix(cm_test, 'Confusion Matrix for LSTM - Test Set')


In [None]:
import numpy as np
import pandas as pd

# Reverse the fitted tokenizer's word index so token ids map back to words.
reverse_dictionary = {index: word for word, index in tokenizer.word_index.items()}

# Rebuild each test review from its token ids. Zeros are skipped, and ids
# missing from the dictionary become empty strings.
sentences = [
    ' '.join(reverse_dictionary.get(token, '') for token in row if token != 0)
    for row in X_test
]

# Ensure predictions are a numpy array
predictions_nn_test = np.asarray(predictions_nn_test)

# Print the shape of predictions to check for any mismatches
print("Shape of predictions_nn_test:", predictions_nn_test.shape)

# Flatten to one prediction per test row, but only when the row counts agree
if predictions_nn_test.shape[0] == len(Y_test):
    predictions_nn_test = predictions_nn_test.reshape(len(Y_test),)
else:
    print(f"Cannot reshape predictions_nn_test to ({len(Y_test)},). Current shape: {predictions_nn_test.shape}")

# One row per test review: reconstructed text, true label, predicted label
err_analysis = pd.DataFrame({
    'sentences': sentences,
    'y_true': Y_test,
    'y_pred': predictions_nn_test
})

# Display the first 20 rows of the DataFrame for error analysis
print(err_analysis.head(20))


# Error Analysis

In [None]:
# Boolean mask marking the rows where the prediction differs from the true label
is_misclassified = err_analysis['y_pred'] != err_analysis['y_true']
errors = err_analysis.loc[is_misclassified]
errors.head(8)

# Final output

In [None]:
# `df` is already loaded at the top of the notebook, so it is reused here
# instead of reading the CSV a second time.

# Class balance of the full dataset (the two labels were previously swapped).
df_pos = df[df['sentiment'] == 'positive']
df_neg = df[df['sentiment'] == 'negative']

all_count_pos = len(df_pos)
all_count_neg = len(df_neg)
print('Count positives: ', all_count_pos)
print('Count negatives: ', all_count_neg)

# Misclassified test rows, grouped by their true label (1 = positive, 0 = negative)
err_count_pos = len(errors[errors['y_true'] == 1])
err_count_neg = len(errors[errors['y_true'] == 0])
print('Errors in true positive: ', err_count_pos)
print('Errors in true negative: ', err_count_neg)

# `errors` only covers the test split, so the per-class error rate is normalised
# by the number of test rows of that class, not by the whole dataset.
test_count_pos = int((err_analysis['y_true'] == 1).sum())
test_count_neg = int((err_analysis['y_true'] == 0).sum())

# Guard against an empty class so the cell never raises ZeroDivisionError
frac_pos = round(err_count_pos / test_count_pos, 4) if test_count_pos else float('nan')
frac_neg = round(err_count_neg / test_count_neg, 4) if test_count_neg else float('nan')
print('Fraction of the errors with true positive:', frac_pos)
print('Fraction of the errors with true negative:', frac_neg)
