In [29]:
#import necessary packages
#from string import punctuation
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from nltk.classify import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.metrics import roc_auc_score

# Load the Yelp reviews from bonefish_grill.csv into a DataFrame.
# NOTE(review): this comment previously said "midland brew house", but the
# file name indicates Bonefish Grill -- confirm which restaurant the data covers.
data = pd.read_csv('/content/sample_data/bonefish_grill.csv')

In [30]:
# Collapse the numeric rating into a binary label stored in a new 'sentiment'
# column: a rating of 3 or below is 'Negative', anything higher is 'Positive'.
def _rating_to_sentiment(rating):
    """Return 'Negative' for a rating <= 3, otherwise 'Positive'."""
    if rating <= 3:
        return 'Negative'
    return 'Positive'

data['sentiment'] = data['rating'].apply(_rating_to_sentiment)

In [7]:
# Fetch the NLTK resources the cleaning step depends on.
# 'punkt' backs word_tokenize on older NLTK releases; newer releases look up
# 'punkt_tab' instead, so both are requested to keep the notebook runnable
# on a fresh kernel with either version.
nltk.download('punkt')
nltk.download('punkt_tab')
# 'stopwords' provides the English stop-word list that clean_text removes.
nltk.download('stopwords')
# Filled on first use so the stop-word list is loaded once, not once per review.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text):
    """Normalize one review into a space-separated string of content words.

    The text is split with ``word_tokenize``; tokens containing any
    non-alphanumeric character (e.g. punctuation) are dropped, the remaining
    tokens are lower-cased, and English stop words are removed.

    Args:
        text: Raw review text. Non-string values (such as the NaN pandas
            produces for an empty CSV cell) are treated as an empty review
            instead of raising inside the tokenizer.

    Returns:
        str: The surviving tokens joined by single spaces; '' when nothing
        survives or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    stop_words = _english_stop_words()
    # Lower-case before the stop-word test: the stop-word list is lower-case.
    lowered = (token.lower() for token in word_tokenize(text) if token.isalnum())
    return ' '.join(token for token in lowered if token not in stop_words)

# Run clean_text over every raw review and keep the result in a new
# 'cleaned_review' column next to the original text.
cleaned_reviews = data['review'].apply(clean_text)
data['cleaned_review'] = cleaned_reviews



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
# Hold out 30% of the reviews for testing and train on the remaining 70%;
# the fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_review'],
    data['sentiment'],
    test_size=0.3,
    random_state=42,
)

# Bag-of-words featurizer for the NLTK classifiers: each word is its own feature.
def extract_features(text):
    """Turn whitespace-separated text into a ``{word: True}`` feature dict.

    Args:
        text: A string of words separated by whitespace.

    Returns:
        dict: One key per distinct word in ``text``, each mapped to True.
        An empty or whitespace-only string gives an empty dict.
    """
    return {token: True for token in text.split()}

# Pair each review's feature dict with its sentiment label, giving the
# (features, label) tuples that the classifiers below are trained and scored on.
X_train_features = list(zip(map(extract_features, X_train), y_train))
X_test_features = list(zip(map(extract_features, X_test), y_test))

# Fit the Naïve Bayes model on the training pairs.
nb_classifier = nltk.NaiveBayesClassifier.train(X_train_features)

In [17]:
# Train the decision tree classifier on the same labelled feature sets
# (X_train_features) that the Naïve Bayes and Logistic Regression models use.
dt_classifier = DecisionTreeClassifier.train(X_train_features)

In [10]:
# Train Logistic Regression classifier.
# The scikit-learn estimator is wrapped in SklearnClassifier so it can be
# trained on the same (feature dict, label) pairs as the NLTK classifiers.
lr_classifier = SklearnClassifier(LogisticRegression())
lr_classifier.train(X_train_features)


<SklearnClassifier(LogisticRegression())>

In [16]:
# Predict a label for every held-out review with each trained model.
# The feature dicts are built once and shared by all three models.
test_featuresets = [extract_features(review) for review in X_test]
y_pred_nb = [nb_classifier.classify(features) for features in test_featuresets]
y_pred_lr = [lr_classifier.classify(features) for features in test_featuresets]
y_pred_dt = [dt_classifier.classify(features) for features in test_featuresets]

# Per-class precision / recall / F1 for each model against the true labels.
for heading, predictions in [
    ("Naïve Bayes Classifier:", y_pred_nb),
    ("\nLogistic Regression Classifier:", y_pred_lr),
    ("\nDecision Tree Classifier:", y_pred_dt),
]:
    print(heading)
    print(classification_report(y_test, predictions))

Naïve Bayes Classifier:
              precision    recall  f1-score   support

    Negative       0.47      0.92      0.62        37
    Positive       0.93      0.51      0.66        78

    accuracy                           0.64       115
   macro avg       0.70      0.72      0.64       115
weighted avg       0.78      0.64      0.65       115


Logistic Regression Classifier:
              precision    recall  f1-score   support

    Negative       0.57      0.57      0.57        37
    Positive       0.79      0.79      0.79        78

    accuracy                           0.72       115
   macro avg       0.68      0.68      0.68       115
weighted avg       0.72      0.72      0.72       115


Decision Tree Classifier:
              precision    recall  f1-score   support

    Negative       0.47      0.51      0.49        37
    Positive       0.76      0.73      0.75        78

    accuracy                           0.66       115
   macro avg       0.62      0.62      0.62 

In [21]:
# Overall accuracy of each model on the held-out (features, label) pairs.
print("\nAccuracy:")
for model_name, model in [
    ("Naïve Bayes Classifier:", nb_classifier),
    ("Logistic Regression Classifier:", lr_classifier),
    ("Decision Tree Classifier:", dt_classifier),
]:
    print(model_name, nltk.classify.accuracy(model, X_test_features))



Accuracy:
Naïve Bayes Classifier: 0.6434782608695652
Logistic Regression Classifier: 0.7217391304347827
Decision Tree Classifier: 0.6608695652173913


In [31]:
# AUC for the two models that expose class probabilities via prob_classify.
def _positive_probabilities(classifier):
    """Return the classifier's probability of 'Positive' for each review in X_test."""
    return [
        classifier.prob_classify(extract_features(review)).prob('Positive')
        for review in X_test
    ]

auc_nb = roc_auc_score(y_test, _positive_probabilities(nb_classifier))
auc_lr = roc_auc_score(y_test, _positive_probabilities(lr_classifier))

# Report the AUC of each model.
print("\nAUC:")
print("Naïve Bayes Classifier:", auc_nb)
print("Logistic Regression Classifier:", auc_lr)


AUC:
Naïve Bayes Classifier: 0.7321552321552321
Logistic Regression Classifier: 0.7668052668052667


In [33]:
# Provenance: this cell comes directly from ChatGPT; the author could not
# otherwise obtain an AUC score for the decision tree.
# NOTE(review): this AUC is computed from hard 0/1 predictions, whereas the
# Naïve Bayes and Logistic Regression AUCs use predicted probabilities, so the
# numbers are not directly comparable. Presumably the NLTK decision tree does
# not offer prob_classify -- confirm.

# Predicted sentiment label for every held-out review.
y_pred_dt = [dt_classifier.classify(extract_features(review)) for review in X_test]

# Encode predictions and true labels as 1 ('Positive') / 0 (anything else).
y_pred_dt_binary = [int(predicted == 'Positive') for predicted in y_pred_dt]
y_test_binary = [int(actual == 'Positive') for actual in y_test]

# AUC of the decision tree from the binary-encoded labels.
auc_dt = roc_auc_score(y_test_binary, y_pred_dt_binary)

# Report the result.
print("\nAUC:")
print("Decision Tree Classifier:", auc_dt)



AUC:
Decision Tree Classifier: 0.6221413721413721


These reviews are class-imbalanced. Our goal is to find the model that best predicts whether a review is negative or positive. The best metric for comparing the models is therefore the F1 score, which takes both recall and precision into account and so copes with class imbalance. In our case the weighted-average F1 scores are:
Naïve Bayes Classifier: 0.65; Logistic Regression Classifier: 0.72;
Decision Tree Classifier: 0.66. Therefore the Logistic Regression classifier is the best model.

