# Importing necessary libraries

In [1]:
# Importing necessary libraries
import nltk
from nltk.corpus import movie_reviews, stopwords  # For loading movie reviews and English stopwords
from nltk.classify import NaiveBayesClassifier     # Native NLTK Naive Bayes classifier
from nltk.classify import SklearnClassifier        # Wrapper for using scikit-learn classifiers in NLTK
from nltk.classify.util import accuracy as nltk_accuracy  # Utility to calculate accuracy of classifiers
from sklearn.ensemble import RandomForestClassifier       # Random Forest classifier from sklearn
from sklearn.linear_model import LogisticRegression       # Logistic Regression classifier
from sklearn.svm import LinearSVC                         # Linear Support Vector Classifier
import random  # For shuffling data

In [2]:
# Fetch the NLTK resources used below: the labelled movie-review corpus and
# the English stopword list.
for resource_name in ('movie_reviews', 'stopwords'):
    nltk.download(resource_name)

def extract_features(words):
    """Turn a sequence of tokens into a presence-only feature dictionary.

    Args:
        words: Iterable of hashable tokens; duplicates are allowed.

    Returns:
        A dict mapping every distinct token to ``True``.
    """
    # fromkeys collapses duplicates and keeps first-occurrence order.
    return dict.fromkeys(words, True)

# Loading the movie reviews dataset from NLTK
# Each document is a (list of word tokens, category) pair, one per corpus file,
# with the category taken from movie_reviews.categories()
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle so both categories are mixed across the split. A dedicated, seeded
# generator keeps the split (and therefore every accuracy printed later)
# identical across kernel restarts without touching the global random state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(documents)

# Creating the feature sets by applying our feature extractor on each document
featuresets = [(extract_features(d), c) for (d, c) in documents]

# Splitting data into training and test sets:
# the first TRAIN_SIZE feature sets train the models, the remainder evaluates them
TRAIN_SIZE = 1600
train_set, test_set = featuresets[:TRAIN_SIZE], featuresets[TRAIN_SIZE:]

# Fail early with a clear message instead of a division error inside the
# accuracy computation if the corpus is smaller than expected.
assert train_set and test_set, "train/test split produced an empty set"


# Training and Evaluating Models


# Training the Naive Bayes Classifier using NLTK's native implementation
# (train() is a classmethod that returns the fitted classifier)
nb_classifier = NaiveBayesClassifier.train(train_set)
# Printing accuracy of Naive Bayes on test data
print(f"Naive Bayes Accuracy: {nltk_accuracy(nb_classifier, test_set) * 100:.2f}%")

# scikit-learn estimators wrapped in NLTK's SklearnClassifier so they accept the
# same feature dictionaries. Each estimator gets a fixed random_state so
# repeated runs give the same model for the same training data.
rf_classifier = SklearnClassifier(
    RandomForestClassifier(n_estimators=100, random_state=42))
logistic_classifier = SklearnClassifier(
    LogisticRegression(max_iter=1000, solver='liblinear', random_state=42))
svm_classifier = SklearnClassifier(LinearSVC(random_state=42))

# Train each wrapped model on the training set and print its test accuracy,
# in the same order as before: Random Forest, Logistic Regression, SVM.
for model_name, wrapped_classifier in [
    ("Random Forest", rf_classifier),
    ("Logistic Regression", logistic_classifier),
    ("SVM", svm_classifier),
]:
    wrapped_classifier.train(train_set)
    print(f"{model_name} Accuracy: {nltk_accuracy(wrapped_classifier, test_set) * 100:.2f}%")

# Showing the 10 most informative features for Naive Bayes
# (Words that best distinguish between positive and negative reviews)
nb_classifier.show_most_informative_features(10)

# Sentiment Prediction Function


def analyze_sentiment(text):
    """Predict the sentiment of a raw sentence with all four trained classifiers.

    The sentence is lower-cased and split into word and punctuation tokens
    before feature extraction. Splitting on whitespace alone would leave
    tokens such as "fantastic!" or "This", which cannot match the lower-case
    word features the classifiers were trained on.

    Args:
        text: Raw input sentence (str).

    Returns:
        dict mapping each model's display name ("Naive Bayes",
        "Random Forest", "Logistic Regression", "SVM") to the label returned
        by that classifier's ``classify`` call.
    """
    stop_words = set(stopwords.words('english'))  # Load English stopwords
    # NOTE(review): wordpunct_tokenize is NLTK's WordPunctTokenizer, which per
    # the NLTK docs is the default word tokenizer of plaintext corpus readers;
    # confirm it matches how movie_reviews.words() tokenized the training data.
    tokens = nltk.wordpunct_tokenize(text.lower())
    # Stopwords are dropped; punctuation tokens are kept as ordinary features.
    words = [token for token in tokens if token not in stop_words]
    features = extract_features(words)  # Convert cleaned tokens to features
    return {
        "Naive Bayes": nb_classifier.classify(features),
        "Random Forest": rf_classifier.classify(features),
        "Logistic Regression": logistic_classifier.classify(features),
        "SVM": svm_classifier.classify(features)
    }

# Testing on Example Sentences


# Example review sentences used to exercise analyze_sentiment
test_sentences = [
    "This movie is absolutely fantastic! The acting, the story, everything was amazing!",
    "I hated this movie. It was a waste of time and money.",
    "The plot was a bit dull, but the performances were great.",
    "I have mixed feelings about this film. It was okay, not great but not terrible either."
]

# Classify every sentence with all four models, then print a blank line, the
# sentence, and one line per model prediction.
for sentence in test_sentences:
    predictions = analyze_sentiment(sentence)
    report_lines = [f"\nSentence: {sentence}"]
    for model, pred in predictions.items():
        report_lines.append(f"{model} Predicted Sentiment: {pred}")
    print("\n".join(report_lines))


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\FAUZAN\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\FAUZAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Naive Bayes Accuracy: 76.50%
Random Forest Accuracy: 82.00%
Logistic Regression Accuracy: 85.50%
SVM Accuracy: 83.50%
Most Informative Features
                   sucks = True              neg : pos    =     14.5 : 1.0
               ludicrous = True              neg : pos    =     12.0 : 1.0
              astounding = True              pos : neg    =     11.5 : 1.0
              unbearable = True              neg : pos    =     11.2 : 1.0
                 insipid = True              neg : pos    =     10.5 : 1.0
                 idiotic = True              neg : pos    =     10.4 : 1.0
                  avoids = True              pos : neg    =     10.2 : 1.0
               stupidity = True              neg : pos    =      9.9 : 1.0
                    3000 = True              neg : pos    =      9.8 : 1.0
             fascination = True              pos : neg    =      9.5 : 1.0

Sentence: This movie is absolutely fantastic! The acting, the story, everything was amazing!
Naive Bayes 