<a href="https://colab.research.google.com/github/ahmed-sala/NLP-Assignment/blob/main/20210064_Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Using RNN**

In [26]:
import pandas as pd
import re
import nltk
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional, SimpleRNN
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords

In [27]:
imdb_df = pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv')
reviews = imdb_df['review'].astype(str)
sentiments = imdb_df['sentiment']
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(sentiments)
vocab_size = 10000
sequence_length = 200
tknzr = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tknzr.fit_on_texts(reviews)
review_sequences = tknzr.texts_to_sequences(reviews)
X_padded = pad_sequences(review_sequences, maxlen=sequence_length, padding='post', truncating='post')
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y_encoded, test_size=0.2, random_state=42
)

In [28]:
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=sequence_length))
rnn_model.add(Bidirectional(LSTM(128)))
rnn_model.add(Dense(32, activation='relu'))
rnn_model.add(Dense(1, activation='sigmoid'))
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
rnn_history = rnn_model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.1
)
rnn_loss, rnn_accuracy = rnn_model.evaluate(X_test, y_test)
print("\nBidirectional LSTM Model")
print(f"Test Accuracy: {rnn_accuracy * 100:.2f}%")

Epoch 1/5




[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 24ms/step - accuracy: 0.6964 - loss: 0.5584 - val_accuracy: 0.8480 - val_loss: 0.3775
Epoch 2/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 24ms/step - accuracy: 0.8709 - loss: 0.3210 - val_accuracy: 0.8518 - val_loss: 0.3315
Epoch 3/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 24ms/step - accuracy: 0.9115 - loss: 0.2373 - val_accuracy: 0.8675 - val_loss: 0.3260
Epoch 4/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 24ms/step - accuracy: 0.9400 - loss: 0.1641 - val_accuracy: 0.8388 - val_loss: 0.4026
Epoch 5/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 24ms/step - accuracy: 0.9579 - loss: 0.1186 - val_accuracy: 0.8595 - val_loss: 0.4172
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.8609 - loss: 0.4119

Bidirectional LSTM Model
Test Accuracy: 86.27%


**Without RNN**

In [29]:
import pandas as pd
import re
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords


In [30]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
data = pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv')
def preprocess_text(text):
    text = text.lower()
    text = re.sub("<.*?>", "", text)
    text = re.sub("[^a-zA-Z]", " ", text)
    words = [word for word in text.split() if word not in stop_words]
    return " ".join(words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
data['cleaned'] = data['review'].astype(str).apply(preprocess_text)
texts = data['cleaned']
labels = data['sentiment']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

In [32]:
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(data['cleaned'])

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
y_pred_lr = logistic_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression")
print(f"Accuracy: {accuracy_lr:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_lr, target_names=label_encoder.classes_))

Logistic Regression
Accuracy: 0.87
Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.86      0.86      4961
    positive       0.86      0.88      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

