# CM4107 Advanced Artificial Intelligence - Coursework
## Comparative Evaluation of Two NLP Algorithms on Sentiment Classification Task
Author: [Your Name]
GitHub Repository: https://github.com/[your-repo-link]

### Section 1: Dataset Description and Preprocessing
The IMDB Dataset of Movie Reviews consists of 50,000 reviews with binary (positive/negative) sentiment labels.
Reviews are divided into 25,000 for training and 25,000 for testing. Preprocessing consists of TF-IDF
vectorization (for Naive Bayes) and integer tokenization with padding to a fixed length (for the LSTM).
The data is loaded with the HuggingFace `datasets` library for simplicity.

In [None]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Load the IMDB corpus through HuggingFace `datasets` and pick out both splits.
dataset = load_dataset("imdb")
train_data, test_data = dataset["train"], dataset["test"]

# Separate the raw review strings from their sentiment labels, split by split.
train_texts = train_data["text"]
train_labels = train_data["label"]
test_texts = test_data["text"]
test_labels = test_data["label"]

# Bag-of-words features for the Naive Bayes model: keep the 5000 highest-ranked
# terms and drop English stop words. The vocabulary and IDF weights are fitted
# on the training reviews only, then reused unchanged for the test reviews so
# no test-set statistics leak into the features.
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

print("TF-IDF Vectorization Completed!")
print(f"Training Feature Shape: {X_train_tfidf.shape}, Test Feature Shape: {X_test_tfidf.shape}")

### Section 2: Representation Learning
Representation learning will use two forms:
- TF-IDF Vectorization for traditional algorithms (Naive Bayes)
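- Token embeddings for neural methods: the LSTM uses an `Embedding` layer whose vectors are learned from scratch during training (pretrained vectors such as GloVe are a possible extension but are not loaded in this notebook).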

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence length fed to the network: longer reviews are cut off,
# shorter ones are zero-padded.
max_length = 200

# Word-to-index vocabulary built from the training reviews only; words outside
# the kept vocabulary map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Convert every review into a list of integer word indices.
X_train_seq = tokenizer.texts_to_sequences(train_texts)
X_test_seq = tokenizer.texts_to_sequences(test_texts)

# Both splits share the same padding policy: pad and truncate at the END of
# the sequence, so the start of each review is always kept.
pad_options = {"maxlen": max_length, "padding": "post", "truncating": "post"}
X_train_padded = pad_sequences(X_train_seq, **pad_options)
X_test_padded = pad_sequences(X_test_seq, **pad_options)

print(f"Padded Sequence Shape: {X_train_padded.shape}, {X_test_padded.shape}")

### Section 3: Algorithms
#### Algorithm 1: Naive Bayes Classification

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fit a multinomial Naive Bayes classifier on the TF-IDF training features
# (`fit` returns the estimator itself, so construction and training chain).
nb_model = MultinomialNB().fit(X_train_tfidf, train_labels)

# Predict the held-out test reviews and score them with each metric in turn.
nb_predictions = nb_model.predict(X_test_tfidf)
nb_accuracy, nb_precision, nb_recall, nb_f1 = (
    score_fn(test_labels, nb_predictions)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Naive Bayes Performance:")
print(f"Accuracy: {nb_accuracy}, Precision: {nb_precision}, Recall: {nb_recall}, F1-Score: {nb_f1}")

#### Algorithm 2: Long Short-Term Memory (LSTM) Neural Network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

import numpy as np

# LSTM Model Architecture
vocab_size = 10000   # matches Tokenizer(num_words=10000): indices stay below 10000
embedding_dim = 100  # size of each learned word vector
lstm_units = 64      # dimensionality of the LSTM hidden state

# Embedding -> single LSTM layer (final state only) -> sigmoid unit that
# outputs the probability of the positive class.
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    LSTM(lstm_units, return_sequences=False),
    Dense(1, activation="sigmoid")
])

lstm_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# The labels come out of the dataset as plain Python sequences. Keras cannot
# pair a NumPy feature matrix with a list of ints as targets, so convert them
# to arrays before calling fit/evaluate.
y_train = np.asarray(train_labels)
y_test = np.asarray(test_labels)

# `validation_split` carves the validation set from the LAST samples of the
# arrays, before any shuffling. If the training split is stored grouped by
# label (the HuggingFace IMDB train split appears to be -- verify), that slice
# would contain a single class and the remaining training data would be
# imbalanced. Shuffle once with a fixed seed so the split is representative
# and reproducible.
shuffle_idx = np.random.default_rng(42).permutation(len(y_train))
X_train_shuffled = X_train_padded[shuffle_idx]
y_train_shuffled = y_train[shuffle_idx]

# Training LSTM Model (last 20% of the shuffled data is held out for validation)
lstm_model.fit(X_train_shuffled, y_train_shuffled, epochs=3, batch_size=64, validation_split=0.2)

# Evaluate LSTM on the untouched test split
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test_padded, y_test)
print(f"LSTM Loss: {lstm_loss}, LSTM Accuracy: {lstm_accuracy}")

### Section 4: Evaluation and Comparison
Performance Evaluation Metrics: Accuracy, Precision, Recall, F1-Score

In [None]:
# LSTM Metrics
from sklearn.metrics import precision_score, recall_score, f1_score

# The network ends in a sigmoid unit, so `predict` yields one probability per
# review; threshold at 0.5 and flatten to a 1-D vector of 0/1 class labels.
lstm_probabilities = lstm_model.predict(X_test_padded)
lstm_predictions = (lstm_probabilities > 0.5).astype("int32").flatten()

# Same metric functions as used for Naive Bayes, so the two models are
# scored on an identical basis.
lstm_precision, lstm_recall, lstm_f1 = (
    score_fn(test_labels, lstm_predictions)
    for score_fn in (precision_score, recall_score, f1_score)
)

print(f"LSTM Results:\nAccuracy: {lstm_accuracy}\nPrecision: {lstm_precision}\nRecall: {lstm_recall}\nF1: {lstm_f1}")

In [None]:
# Visualization of Results
import matplotlib.pyplot as plt

# Grouped bar chart: one pair of bars (Naive Bayes, LSTM) per metric.
metrics = ["Accuracy", "Precision", "Recall", "F1-Score"]
nb_scores = [nb_accuracy, nb_precision, nb_recall, nb_f1]
lstm_scores = [lstm_accuracy, lstm_precision, lstm_recall, lstm_f1]

x = range(len(metrics))
bar_width = 0.4
# LSTM bars sit one bar-width to the right of the Naive Bayes bars; the tick
# label is centred on the boundary between the two bars of each pair.
lstm_positions = [i + 0.4 for i in x]
tick_positions = [i + 0.2 for i in x]

plt.bar(x, nb_scores, width=bar_width, label="Naive Bayes")
plt.bar(lstm_positions, lstm_scores, width=bar_width, label="LSTM")
plt.xticks(tick_positions, metrics)
plt.ylabel("Score")
plt.title("Algorithm Performance Comparison")
plt.legend()
plt.show()