This notebook contains the code for the Multinomial Naive Bayes classification model.

In [22]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("IMDB Dataset Processed Lemma test.csv")

###Representing the textual data in a suitable model (i.e. Bag of Words, TF-IDF Vectors)

#Represent the text data using Bag of Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df['cleaned_review'])

#Alternatively, represent the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_review'])


###Splitting the data into the training and test sets. Ensure that the train and test datasets are balanced by using stratify on the sentiments data

#Labels (i.e. Sentiment)
y = df['sentiment']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

#Get the mapping of the numeric labels to the original labels
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print("Label encoding mapping:")
print(label_mapping)

X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(
    X_tfidf, y_encoded, df.index, test_size=0.2, random_state=42, stratify=y
)


Label encoding mapping:
{'negative': 0, 'positive': 1}


We implement the Multinomial Naive Bayes model here. We start off by conducting hyperparameter optimization using gridsearch to find the ideal hyperparameter. The ideal hyperparameter is one that allows the model to have the highest f1 score.
Once we have found the ideal hyperparameter, we train the model using that specific hyperparameter.

In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, make_scorer, f1_score, roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Define the parameter grid for alpha
param_grid = {'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]} # Smoothing parameter

# Initialize the MultinomialNB classifier
nb_classifier = MultinomialNB()

# Define the scoring metric with macro F1 score
scoring = make_scorer(f1_score, average='micro')
grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, scoring=scoring, cv=5) # Cross validation with 5 folds

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_alpha = grid_search.best_params_['alpha']
best_score = grid_search.best_score_

print(f"Best alpha: {best_alpha}")
print(f"Best F1 score: {best_score}")

Best alpha: 2.0
Best F1 score: 0.8656498172192109


In [24]:
# Train the MultinomialNB classifier with the best alpha (Lidstone Smoothing)
mnb_classifier = MultinomialNB(alpha=best_alpha)
mnb_classifier.fit(X_train, y_train)

# Predict the classes and the probabilities using the best model
predicted_class = mnb_classifier.predict(X_test)
predicted_class_train = mnb_classifier.predict(X_train)
test_probs = mnb_classifier.predict_proba(X_test)
train_probs = mnb_classifier.predict_proba(X_train)

# Calculate and print the performance metrics
print('Train confusion matrix is:')
print(confusion_matrix(y_train, predicted_class_train))
print('Test confusion matrix is:')
print(confusion_matrix(y_test, predicted_class))
print(classification_report(y_test, predicted_class))

# Calculate train and test accuracy
train_accuracy = accuracy_score(y_train, predicted_class_train)
test_accuracy = accuracy_score(y_test, predicted_class)
print("Train accuracy score: ", train_accuracy)
print("Test accuracy score: ", test_accuracy)

# Calculate and print the AUC-ROC score
train_auc = roc_auc_score(y_train, train_probs[:, 1], multi_class = 'ovr')
test_auc = roc_auc_score(y_test, test_probs[:, 1], multi_class='ovr')
print("Train ROC-AUC score:", train_auc)
print("Test ROC-AUC score:", test_auc)

Train confusion matrix is:
[[17690  2068]
 [ 1996 17911]]
Test confusion matrix is:
[[4256  684]
 [ 661 4316]]
              precision    recall  f1-score   support

           0       0.87      0.86      0.86      4940
           1       0.86      0.87      0.87      4977

    accuracy                           0.86      9917
   macro avg       0.86      0.86      0.86      9917
weighted avg       0.86      0.86      0.86      9917

Train accuracy score:  0.8975419135257784
Test accuracy score:  0.8643743067459917
Train ROC-AUC score: 0.9616374100901312
Test ROC-AUC score: 0.9390849730623215


In [30]:
import pandas as pd

def perform_error_analysis_mnb(y_true, y_pred, test_indices, df, label_mapping):
    # Set Pandas options to display full strings
    pd.set_option('display.max_colwidth', None)
    
    # Generate confusion matrix and classification report
    cm = confusion_matrix(y_true, y_pred)
    cr = classification_report(y_true, y_pred, target_names=label_mapping.keys())
    print("Confusion Matrix:")
    print(cm)
    print("\nClassification Report:")
    print(cr)

    # Map `test_indices` to extract the original reviews
    X_test_reviews = df.loc[test_indices, 'original_review']

    # Identify False Negatives (FN) and False Positives (FP)
    FN_indices = (y_true == 0) & (y_pred != 0)  # Replace 0 with the specific class of interest
    FP_indices = (y_true != 0) & (y_pred == 0)  # Replace 0 with the specific class of interest

    false_negatives = X_test_reviews.iloc[FN_indices]
    false_positives = X_test_reviews.iloc[FP_indices]
    sarcasm_candidates = X_test_reviews.iloc[y_pred != y_true]  # Misclassified texts
    sarcasm_examples = sarcasm_candidates[
        sarcasm_candidates.str.contains("great|fantastic|amazing", case=False, na=False)
    ]

    # Print first 5 examples for each category
    print("\nFalse Negatives (FN) examples:")
    print(false_negatives.head(5))
    print("\nFalse Positives (FP) examples:")
    print(false_positives.head(5))
    print("\nPotential Sarcasm Examples:")
    print(sarcasm_examples.head(5))

    # Save all errors to files
    false_negatives.to_csv("false_negatives.csv", index=False)
    false_positives.to_csv("false_positives.csv", index=False)
    sarcasm_examples.to_csv("potential_sarcasm.csv", index=False)

    # Reset Pandas options to default after displaying the output
    pd.reset_option('display.max_colwidth')

# Perform error analysis and save errors
perform_error_analysis_mnb(
    y_true=y_test,
    y_pred=predicted_class,
    test_indices=test_indices,
    df=df,
    label_mapping=label_mapping
)






Confusion Matrix:
[[4256  684]
 [ 661 4316]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.86      0.86      4940
    positive       0.86      0.87      0.87      4977

    accuracy                           0.86      9917
   macro avg       0.86      0.86      0.86      9917
weighted avg       0.86      0.86      0.86      9917


False Negatives (FN) examples:
34773    "An album of songs so old everyone thinks they're new." This film has the elusive combination of pace and mood that set some films apart from the opening moments. And why not? Towering talent from Dame Judith Dench as a widow who plays saxaphone with a street musician to help him get the songs right, to Olympia Dukakis as the merry widow living in a Scottish castle on the alimony of her many marriages, to Ian Holm as the drummer who loved all the members of a World War II all girl (more or less) swing band. But wait, there's more. Add in Leslie Caron on bas