<H1>OPENING AND EXTRACTING DATA</H1>

In [1]:

positive_file = "rt-polarity.pos"
negative_file = "rt-polarity.neg"


with open(positive_file, 'r', encoding='latin-1') as f:
    positive_sentences = f.readlines()


with open(negative_file, 'r', encoding='latin-1') as f:
    negative_sentences = f.readlines()


print(f"Number of positive sentences: {len(positive_sentences)}")
print(f"Number of negative sentences: {len(negative_sentences)}")


print("First positive sentence:", positive_sentences[0])
print("First negative sentence:", negative_sentences[0])


Number of positive sentences: 5331
Number of negative sentences: 5331
First positive sentence: the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 

First negative sentence: simplistic , silly and tedious . 



<H1>SPLITTING THE DATA</H1>


In [2]:
import random


train_pos = positive_sentences[:4000]
train_neg = negative_sentences[:4000]

val_pos = positive_sentences[4000:4500]
val_neg = negative_sentences[4000:4500]

test_pos = positive_sentences[4500:]
test_neg = negative_sentences[4500:]


x_train = train_pos + train_neg
y_train = [1] * len(train_pos) + [0] * len(train_neg)

x_val = val_pos + val_neg
y_val = [1] * len(val_pos) + [0] * len(val_neg)

x_test = test_pos + test_neg
y_test = [1] * len(test_pos) + [0] * len(test_neg)


train_data = list(zip(x_train, y_train))
random.shuffle(train_data)
x_train, y_train = zip(*train_data)

val_data = list(zip(x_val, y_val))
random.shuffle(val_data)
x_val, y_val = zip(*val_data)

test_data = list(zip(x_test, y_test))
random.shuffle(test_data)
x_test, y_test = zip(*test_data)

print(f"Training set size: {len(x_train)}")
print(f"Validation set size: {len(x_val)}")
print(f"Test set size: {len(x_test)}")


Training set size: 8000
Validation set size: 1000
Test set size: 1662


<H1>USING COUNT VECTORIZER</H1>

In [3]:
from sklearn.feature_extraction.text import CountVectorizer


vectorizer = CountVectorizer(stop_words='english', max_features=8000)


x_train_vec = vectorizer.fit_transform(x_train)
x_val_vec = vectorizer.transform(x_val)
x_test_vec = vectorizer.transform(x_test)


print(f"Training set vectorized shape: {x_train_vec.shape}")
print(f"Validation set vectorized shape: {x_val_vec.shape}")
print(f"Test set vectorized shape: {x_test_vec.shape}")


Training set vectorized shape: (8000, 8000)
Validation set vectorized shape: (1000, 8000)
Test set vectorized shape: (1662, 8000)


<H1> TESTING </H1>

In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping


x_train_dense = x_train_vec.toarray()
x_val_dense = x_val_vec.toarray()
x_test_dense = x_test_vec.toarray()


y_train = np.array(y_train)
y_val = np.array(y_val)
y_test = np.array(y_test)

def build_model(input_dim):
    model = Sequential()
    model.add(Dense(256, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.4))  
    model.add(Dense(256, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(1, activation='sigmoid'))  

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


input_dim = x_train_dense.shape[1]  
model = build_model(input_dim)
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

history = model.fit(x_train_dense, y_train, epochs=15, batch_size=32, validation_data=(x_val_dense, y_val),callbacks=[early_stopping])


val_loss, val_accuracy = model.evaluate(x_val_dense, y_val)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

test_loss, test_accuracy = model.evaluate(x_test_dense, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/15
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.5808 - loss: 0.6577 - val_accuracy: 0.7380 - val_loss: 0.5297
Epoch 2/15
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8749 - loss: 0.3088 - val_accuracy: 0.7580 - val_loss: 0.5297
Epoch 3/15
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9641 - loss: 0.1167 - val_accuracy: 0.7670 - val_loss: 0.7470
Epoch 4/15
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9897 - loss: 0.0350 - val_accuracy: 0.7600 - val_loss: 1.0752
Epoch 5/15
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9952 - loss: 0.0154 - val_accuracy: 0.7540 - val_loss: 1.3289
Epoch 6/15
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9979 - loss: 0.0078 - val_accuracy: 0.7520 - val_loss: 1.5547
[1m32/32[0m [32m━━━━━━━━━

<H1> EVALUATION </H1>

In [5]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Make predictions using the TensorFlow model
y_test_pred_probs = model.predict(x_test_dense)
y_test_pred = (y_test_pred_probs > 0.5).astype(int).flatten()  # Convert probabilities to binary labels

# Compute confusion matrix
cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()  # Get TP, TN, FP, FN from the confusion matrix

# Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

# Print results
print("Confusion Matrix:")
print(f"TP (True Positives): {tp}")
print(f"TN (True Negatives): {tn}")
print(f"FP (False Positives): {fp}")
print(f"FN (False Negatives): {fn}")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")


[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Confusion Matrix:
TP (True Positives): 614
TN (True Negatives): 616
FP (False Positives): 215
FN (False Negatives): 217
Precision: 74.07%
Recall: 73.89%
F1-Score: 73.98%
