This notebook builds a random-guess baseline classifier using scikit-learn's `DummyClassifier`. Its performance serves as the reference point against which the other models are compared.

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the pre-processed IMDB review data
df = pd.read_csv("IMDB Dataset Processed Lemma test.csv")

# Review text used as model input. read_csv parses empty fields as NaN, which the
# vectorizers cannot tokenise, so a missing review is treated as an empty document.
reviews = df['cleaned_review'].fillna('')


###Encoding the labels (i.e. Sentiment) as integers

y = df['sentiment']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

#Get the mapping of the original labels to their numeric codes
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print("Label encoding mapping:")
print(label_mapping)


###Splitting the data into the training and test sets. The split is done on the raw text,
###BEFORE any vectorizer is fitted, so that no vocabulary or IDF statistics from the test
###reviews leak into the training features. Stratifying on the sentiment labels keeps the
###class proportions the same in both sets.

reviews_train, reviews_test, y_train, y_test = train_test_split(
    reviews, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)


###Representing the textual data in a suitable model (i.e. Bag of Words, TF-IDF Vectors).
###Each vectorizer is fitted on the training reviews only.

#Represent the text data using Bag of Words
vectorizer = CountVectorizer()
vectorizer.fit(reviews_train)

#Alternatively, represent the text data using TF-IDF (this is what the models are trained on)
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(reviews_train)
X_test = tfidf_vectorizer.transform(reviews_test)

#Full-corpus matrices, kept under their original names for any code that still refers to
#them. They are produced with transform() only, i.e. using the train-fitted vocabulary/IDF.
X_bow = vectorizer.transform(reviews)
X_tfidf = tfidf_vectorizer.transform(reviews)

Label encoding mapping:
{'negative': 0, 'positive': 1}


In [4]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, make_scorer, f1_score, roc_curve, auc
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Baseline model: with strategy='uniform' the DummyClassifier ignores the features and
# draws every predicted label uniformly at random from the classes seen during fit.
# random_state fixes those draws so the reported numbers are reproducible.
dummy_clf = DummyClassifier(strategy='uniform', random_state=42)

# Fit the baseline classifier on the training data
dummy_clf.fit(X_train, y_train)

# Predict the classes and the class probabilities with the baseline classifier
predicted_class = dummy_clf.predict(X_test)
predicted_class_train = dummy_clf.predict(X_train)
test_probs = dummy_clf.predict_proba(X_test)
train_probs = dummy_clf.predict_proba(X_train)

# Calculate and print the performance metrics
# (confusion matrices: rows are true labels, columns are predicted labels)
print('Train confusion matrix is:')
print(confusion_matrix(y_train, predicted_class_train))
print('Test confusion matrix is:')
print(confusion_matrix(y_test, predicted_class))
print(classification_report(y_test, predicted_class))

# Calculate train and test accuracy
train_accuracy = accuracy_score(y_train, predicted_class_train)
test_accuracy = accuracy_score(y_test, predicted_class)
print("Train accuracy score: ", train_accuracy)
print("Test accuracy score: ", test_accuracy)

# Calculate and print the AUC-ROC score.
# The target is binary, so the score is computed from the probability of the positive
# class (column 1); no multi_class setting is needed for a binary target.
# NOTE: the uniform strategy assigns the same probability to every sample, so there is
# no ranking information and the ROC-AUC is 0.5 by construction.
train_auc = roc_auc_score(y_train, train_probs[:, 1])
test_auc = roc_auc_score(y_test, test_probs[:, 1])
print("Train ROC-AUC score:", train_auc)
print("Test ROC-AUC score:", test_auc)

Train confusion matrix is:
[[ 9940  9818]
 [ 9845 10062]]
Test confusion matrix is:
[[2455 2485]
 [2529 2448]]
              precision    recall  f1-score   support

           0       0.49      0.50      0.49      4940
           1       0.50      0.49      0.49      4977

    accuracy                           0.49      9917
   macro avg       0.49      0.49      0.49      9917
weighted avg       0.49      0.49      0.49      9917

Train accuracy score:  0.5042732887936467
Test accuracy score:  0.49440354946052234
Train ROC-AUC score: 0.5
Test ROC-AUC score: 0.5
