In [27]:
import pandas as pd
import random
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [28]:
def read_data(filename):
    with open(filename, 'r') as f:
        return f.readlines()

In [29]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\Ravi
[nltk_data]     Mishra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Ravi
[nltk_data]     Mishra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
posi, negi = read_data('./rt-polarity.pos'), read_data('./rt-polarity.neg')

In [31]:
def preprocess_data(text):
    tokens = text.split()
    tokens = [token.lower() for token in tokens]
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and token.isalpha()]
    return ' '.join(tokens)

In [32]:
posi = [preprocess_data(sentence) for sentence in posi]
negi = [preprocess_data(sentence) for sentence in negi]

In [33]:
train_pos = posi[:4000]
train_neg = negi[:4000]

val_pos = posi[4000:4500]
val_neg = negi[4000:4500]

test_pos = posi[4500:]
test_neg = negi[4500:]

In [34]:
train_data = train_pos + train_neg
train_labels = [1] * 4000 + [0] * 4000

val_data = val_pos + val_neg
val_labels = [1] * 500 + [0] * 500

test_data = test_pos + test_neg
test_labels = [1] * 831 + [0] * 831

In [35]:
combined_train = list(zip(train_data, train_labels))
combined_val = list(zip(val_data, val_labels))
combined_test = list(zip(test_data, test_labels))

In [36]:
random.shuffle(train_data)
random.shuffle(val_data)
random.shuffle(test_data)

In [37]:
train_data, train_labels = zip(*combined_train)
val_data, val_labels = zip(*combined_val)
test_data, test_labels = zip(*combined_test)

In [38]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data)
X_val = vectorizer.transform(val_data)
X_test = vectorizer.transform(test_data)

In [39]:
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, train_labels)

log_reg_preds = log_reg_model.predict(X_test)

In [40]:
print("Logistic Regression Report:")
print(classification_report(test_labels, log_reg_preds))
print("Confusion Matrix (Logistic Regression):")
print(confusion_matrix(test_labels, log_reg_preds))
print("Accuracy Score (Logistic Regression)")
print(accuracy_score(test_labels, log_reg_preds)*100)

Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.75      0.76      0.75       831
           1       0.76      0.74      0.75       831

    accuracy                           0.75      1662
   macro avg       0.75      0.75      0.75      1662
weighted avg       0.75      0.75      0.75      1662

Confusion Matrix (Logistic Regression):
[[633 198]
 [214 617]]
Accuracy Score (Logistic Regression)
75.21058965102286


In [41]:
svm_model = SVC()
svm_model.fit(X_train, train_labels)

svm_preds = svm_model.predict(X_test)

In [42]:
print("SVM Report:")
print(classification_report(test_labels, svm_preds))
print("Confusion Matrix (SVM):")
print(confusion_matrix(test_labels, svm_preds))
print("Accuracy Score (SVM)")
print(accuracy_score(test_labels, svm_preds)*100)

SVM Report:
              precision    recall  f1-score   support

           0       0.74      0.78      0.76       831
           1       0.77      0.73      0.75       831

    accuracy                           0.75      1662
   macro avg       0.75      0.75      0.75      1662
weighted avg       0.75      0.75      0.75      1662

Confusion Matrix (SVM):
[[646 185]
 [227 604]]
Accuracy Score (SVM)
75.21058965102286


In [43]:
X_train_dense = X_train.toarray()
X_val_dense = X_val.toarray()
X_test_dense = X_test.toarray()

In [44]:
X_train_dense = X_train.toarray()
X_val_dense = X_val.toarray()
X_test_dense = X_test.toarray()

model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train_dense.shape[1],)))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [45]:
model.summary()

In [46]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(X_train_dense, np.array(train_labels), epochs=5, batch_size=32, validation_data=(X_val_dense, np.array(val_labels)))

test_loss, test_acc = model.evaluate(X_test_dense, np.array(test_labels))

rnn_preds = model.predict(X_test_dense).round()

Epoch 1/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.5781 - loss: 0.6640 - val_accuracy: 0.7600 - val_loss: 0.5032
Epoch 2/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.8797 - loss: 0.2967 - val_accuracy: 0.7460 - val_loss: 0.5470
Epoch 3/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9668 - loss: 0.1111 - val_accuracy: 0.7470 - val_loss: 0.7179
Epoch 4/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9880 - loss: 0.0412 - val_accuracy: 0.7510 - val_loss: 0.8844
Epoch 5/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9958 - loss: 0.0210 - val_accuracy: 0.7470 - val_loss: 1.0350
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7484 - loss: 1.1167
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [47]:
print("RNN Report: ")
print(classification_report(test_labels, rnn_preds))
print("Confusion Matrix (RNN): ")
print(confusion_matrix(test_labels, rnn_preds))
print("Accuracy Score (RNN): ")
print(test_acc*100)

RNN Report: 
              precision    recall  f1-score   support

           0       0.76      0.74      0.75       831
           1       0.75      0.76      0.75       831

    accuracy                           0.75      1662
   macro avg       0.75      0.75      0.75      1662
weighted avg       0.75      0.75      0.75      1662

Confusion Matrix (RNN): 
[[619 212]
 [199 632]]
Accuracy Score (RNN): 
75.27076005935669
