In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from functools import reduce
import re
import numpy as np

In [2]:
reviews_df = pd.read_csv("tripadvisor_hotel_reviews.csv")

In [3]:
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def clean_review_text(row):
    conjoined_regex = r'(\w{2,}[\.\*\'/]\w{2,})\b'
    trailing_dash_comma_regex = r'(\w+[-,])\s'
    trailing_period_regex = r'(\w+\.)\s'
    review = row['Review']
    
    # 1. Split conjoined words
    conjoined_match = re.search(conjoined_regex, review)
    if conjoined_match:
       for word in conjoined_match.groups():
           review = review.replace(word, word.replace('.', ' '))
           review = review.replace(word, word.replace('*', ' '))
           review = review.replace(word, word.replace('/', ' '))
           review = review.replace(word, word.replace('\'', ' '))
    
    # 2a. Remove n't and not
    review = review.replace('n\'t', '').replace('not', '')

    # 2. Remove trailing dashes, commas
    trailing_dash_comma_match = re.search(trailing_dash_comma_regex, review)
    if trailing_dash_comma_match:
       for word in trailing_dash_comma_match.groups():
           review = review.replace(word, word.rstrip('-,'))
            
    # 3. Remove numbers
    review = ' '.join([word for word in review.split() if not word.isdigit()])

    
    
    # 4a. Lemmatization using WordNetLemmatizer
    review = ' '.join([lemmatizer.lemmatize(w, 'v') for w in review.split()])
    
    # 4b. Stemming
    # review = ' '.join([stemmer.stem(w) for w in review.split()])
    
    return review
    
reviews_df["Review"] = reviews_df.apply(clean_review_text, axis=1)

In [9]:
def map_rating_to_sentiment(row):
    rating = row['Rating']
    if rating in [1,2]:
        return "Negative"
    elif rating in [3]:
        return "Neutral"
    elif rating in [4,5]:
        return "Positive"
    return "Unknown"
def map_rating_to_sentiment_score(row):
    rating = row['Rating']
    if rating in [1,2]:
        return 1
    elif rating in [3]:
        return 2
    elif rating in [4,5]:
        return 3
    return -1
reviews_df['Sentiment'] = reviews_df.apply(map_rating_to_sentiment, axis=1)
reviews_df['Sentiment Score'] = reviews_df.apply(map_rating_to_sentiment_score, axis=1)

In [10]:
vectorizer = TfidfVectorizer(
    min_df = 5,          # Minimum document frequency (i.e. ignore all words with less than 5 occurrences)
    max_df = 0.8,        # Maximum document frequency (i.e. ignore all words that account for 80% of the corpus size)
    sublinear_tf = True, # Apply sublinear term frequency scaling
    ngram_range=(1,3)    
)

In [11]:
vectorizer = vectorizer.fit(reviews_df["Review"])

In [12]:
no_of_reviews = len(reviews_df)
sections = [int(0.8 * no_of_reviews), int(0.9 * no_of_reviews)]

reviews_train, reviews_test, reviews_val = np.split(
    ary = reviews_df["Review"],             # Array to split (i.e. our DataFrame of reviews)
    indices_or_sections = sections          # Sections to split (i.e. split at 80% and 90% mark)
)
X_train, X_test, X_val = (
    vectorizer.transform(reviews_train),
    vectorizer.transform(reviews_test),
    vectorizer.transform(reviews_val),
)
y_rating_train, y_rating_test, y_rating_val = np.split(
    ary = reviews_df["Rating"],             # Array to split (i.e. our DataFrame of reviews)
    indices_or_sections = sections          # Sections to split (i.e. split at 80% and 90% mark)
)
y_sentiment_train, y_sentiment_test, y_sentiment_val = np.split(
    ary = reviews_df["Sentiment Score"],             # Array to split (i.e. our DataFrame of reviews)
    indices_or_sections = sections          # Sections to split (i.e. split at 80% and 90% mark)
)

In [16]:
from sklearn.naive_bayes import MultinomialNB
import time

In [19]:
print(f'Training Naive Bayes (Rating)...')
rating_train_start = time.perf_counter()
rating_model = MultinomialNB(force_alpha=True)
rating_model.fit(X_train, y_rating_train)
rating_train_end = time.perf_counter()
print(f'- Training Time: {rating_train_end - rating_train_start:.2f}s\n')

Training Naive Bayes (Rating)...
- Training Time: 0.03s



In [20]:
print(f'Training Naive Bayes (Sentiment)...')
sentiment_train_start = time.perf_counter()
sentiment_model = MultinomialNB(force_alpha=True)
sentiment_model.fit(X_train, y_sentiment_train)
sentiment_train_end = time.perf_counter()
print(f'- Training Time: {rating_train_end - rating_train_start:.2f}s\n')

Training Naive Bayes (Sentiment)...
- Training Time: 0.03s



In [22]:
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [25]:
test_start = time.perf_counter()
print(f'\nTesting Naive Bayes Model (Rating) on Test Data...')
test_predictions = rating_model.predict(X_test)
test_end = time.perf_counter()
test_accuracy = accuracy_score(y_rating_test, test_predictions)
test_f1 = f1_score(y_rating_test, test_predictions, average='weighted', zero_division=0)
print(f'Performance:')
print(f'- Accuracy: {test_accuracy*100:.2f}')
print(f'- F1: {test_f1*100:.2f}')
print(f'- Test Prediction Time: {test_end - test_start:.2f}s')
print(f'- Classification Report:')
print(classification_report(y_rating_test, test_predictions, zero_division=0))
print()


Testing Naive Bayes Model (Rating) on Test Data...
Performance:
- Accuracy: 53.64
- F1: 41.79
- Test Prediction Time: 0.00s
- Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       103
           2       0.00      0.00      0.00       143
           3       0.00      0.00      0.00       207
           4       0.36      0.15      0.22       569
           5       0.56      0.98      0.71      1027

    accuracy                           0.54      2049
   macro avg       0.18      0.23      0.19      2049
weighted avg       0.38      0.54      0.42      2049




In [29]:
test_start = time.perf_counter()
print(f'\nTesting Naive Bayes Model (Sentiment) on Test Data...')
test_predictions = sentiment_model.predict(X_test)
test_end = time.perf_counter()
test_accuracy = accuracy_score(y_sentiment_test, test_predictions)
test_f1 = f1_score(y_sentiment_test, test_predictions, average='weighted', zero_division=0)
print(f'Performance:')
print(f'- Accuracy: {test_accuracy*100:.2f}')
print(f'- F1: {test_f1*100:.2f}')
print(f'- Test Prediction Time: {test_end - test_start:.2f}s')
print(f'- Classification Report:')
print(classification_report(y_sentiment_test, test_predictions, zero_division=0))
print()


Testing Naive Bayes Model (Sentiment) on Test Data...
Performance:
- Accuracy: 79.21
- F1: 71.11
- Test Prediction Time: 0.00s
- Classification Report:
              precision    recall  f1-score   support

           1       0.96      0.11      0.20       246
           2       0.00      0.00      0.00       207
           3       0.79      1.00      0.88      1596

    accuracy                           0.79      2049
   macro avg       0.58      0.37      0.36      2049
weighted avg       0.73      0.79      0.71      2049




In [33]:
val_start = time.perf_counter()
print(f'\nTesting Naive Bayes Model (Rating) on Validation Data...')
val_predictions = rating_model.predict(X_val)
val_end = time.perf_counter()
val_accuracy = accuracy_score(y_rating_val, val_predictions)
val_f1 = f1_score(y_rating_val, val_predictions, average='weighted', zero_division=0)
print(f'Performance:')
print(f'- Accuracy: {val_accuracy*100:.2f}')
print(f'- F1: {val_f1*100:.2f}')
print(f'- Test Prediction Time: {val_end - val_start:.2f}s')
print(f'- Classification Report:')
print(classification_report(y_rating_val, val_predictions, zero_division=0))
print()


Testing Naive Bayes Model (Rating) on Validation Data...
Performance:
- Accuracy: 51.02
- F1: 38.97
- Test Prediction Time: 0.01s
- Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       114
           2       0.00      0.00      0.00       166
           3       0.00      0.00      0.00       204
           4       0.32      0.15      0.20       585
           5       0.54      0.98      0.69       981

    accuracy                           0.51      2050
   macro avg       0.17      0.22      0.18      2050
weighted avg       0.35      0.51      0.39      2050




In [35]:
val_start = time.perf_counter()
print(f'\nTesting Naive Bayes Model (Sentiment) on Validation Data...')
val_predictions = sentiment_model.predict(X_val)
val_end = time.perf_counter()
val_accuracy = accuracy_score(y_sentiment_val, val_predictions)
val_f1 = f1_score(y_sentiment_val, val_predictions, average='weighted', zero_division=0)
print(f'Performance:')
print(f'- Accuracy: {val_accuracy*100:.2f}')
print(f'- F1: {val_f1*100:.2f}')
print(f'- Test Prediction Time: {val_end - val_start:.2f}s')
print(f'- Classification Report:')
print(classification_report(y_sentiment_val, val_predictions, zero_division=0))
print()


Testing Naive Bayes Model (Sentiment) on Validation Data...
Performance:
- Accuracy: 77.61
- F1: 68.87
- Test Prediction Time: 0.00s
- Classification Report:
              precision    recall  f1-score   support

           1       1.00      0.09      0.16       280
           2       0.00      0.00      0.00       204
           3       0.77      1.00      0.87      1566

    accuracy                           0.78      2050
   macro avg       0.59      0.36      0.35      2050
weighted avg       0.73      0.78      0.69      2050




## Evaluation of Naive Bayes on Other Datasets

In [36]:
fine_food_df = pd.read_csv("./amazon_fine_food_reviews.csv")
play_store_df = pd.read_csv("./google_play_store_reviews.csv")
clothing_df = pd.read_csv("./Womens Clothing E-Commerce Reviews.csv")
mobile_phones_df = pd.read_csv("./Amazon_Unlocked_Mobile.csv")

In [37]:
cleaned_clothing_df = clothing_df.dropna(subset=["Review Text"])
cleaned_clothing_df.info()

cleaned_mobile_phones_df = mobile_phones_df.dropna(subset=["Reviews"])
cleaned_mobile_phones_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22641 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               22641 non-null  int64 
 1   Clothing ID              22641 non-null  int64 
 2   Age                      22641 non-null  int64 
 3   Title                    19675 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   22641 non-null  int64 
 6   Recommended IND          22641 non-null  int64 
 7   Positive Feedback Count  22641 non-null  int64 
 8   Division Name            22628 non-null  object
 9   Department Name          22628 non-null  object
 10  Class Name               22628 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.1+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 413778 entries, 0 to 413839
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 

In [38]:
fine_food_vectorized = vectorizer.transform(fine_food_df["Text"])
play_store_vectorized = vectorizer.transform(play_store_df["content"])
clothing_vectorized = vectorizer.transform(cleaned_clothing_df["Review Text"])
mobile_phones_vectorized = vectorizer.transform(cleaned_mobile_phones_df["Reviews"])

y_rating_ff = fine_food_df["Score"]
y_rating_ps = play_store_df["score"]
y_rating_cl = cleaned_clothing_df["Rating"]
y_rating_mp = cleaned_mobile_phones_df["Rating"]

In [39]:
def map_rating_to_sentiment_score(rating):
    if rating in [1,2]:
        return 1
    elif rating in [3]:
        return 2
    elif rating in [4,5]:
        return 3
    return -1

y_sentiment_ff = y_rating_ff.apply(map_rating_to_sentiment_score)    
y_sentiment_ps = y_rating_ps.apply(map_rating_to_sentiment_score)
y_sentiment_cl = y_rating_cl.apply(map_rating_to_sentiment_score)    
y_sentiment_mp = y_rating_mp.apply(map_rating_to_sentiment_score)

In [40]:
dataset_names = [
    'Amazon Fine Food Reviews',
    'Google Play Store Reviews',
    'Women\'s E-Commerce Clothing Reviews',
    'Amazon Reviews of Unlocked Mobile Phones'
]
dataset_x = {
    'Amazon Fine Food Reviews': fine_food_vectorized,
    'Google Play Store Reviews': play_store_vectorized,
    'Women\'s E-Commerce Clothing Reviews': clothing_vectorized,
    'Amazon Reviews of Unlocked Mobile Phones': mobile_phones_vectorized
}
dataset_y_rating = {
    'Amazon Fine Food Reviews': y_rating_ff,
    'Google Play Store Reviews': y_rating_ps,
    'Women\'s E-Commerce Clothing Reviews': y_rating_cl,
    'Amazon Reviews of Unlocked Mobile Phones': y_rating_mp
}
dataset_y_sentiment = {
    'Amazon Fine Food Reviews': y_sentiment_ff,
    'Google Play Store Reviews': y_sentiment_ps,
    'Women\'s E-Commerce Clothing Reviews': y_sentiment_cl,
    'Amazon Reviews of Unlocked Mobile Phones': y_sentiment_mp
}

In [43]:
for dataset_name in dataset_names:
    X_dataset = dataset_x[dataset_name]
    y_rating_dataset = dataset_y_rating[dataset_name]
    y_sentiment_dataset = dataset_y_sentiment[dataset_name]
    
    ds_start = time.perf_counter()
    print(f'\nTesting Naive Bayes Model (Rating) on {dataset_name}...')
    ds_predictions = rating_model.predict(X_dataset)
    ds_end = time.perf_counter()
    ds_accuracy = accuracy_score(y_rating_dataset, ds_predictions)
    ds_f1 = f1_score(y_rating_dataset, ds_predictions, average='weighted', zero_division=0)
    print(f'Performance:')
    print(f'- Accuracy: {ds_accuracy*100:.2f}')
    print(f'- F1: {ds_f1*100:.2f}')
    print(f'- Test Prediction Time: {ds_end - ds_start:.2f}s')
    print(f'- Classification Report:')
    print(classification_report(y_rating_dataset, ds_predictions, zero_division=0))
    print()


    ds_start = time.perf_counter()
    print(f'\nTesting Naive Bayes Model (Sentiment) on {dataset_name}...')
    ds_predictions = sentiment_model.predict(X_dataset)
    ds_end = time.perf_counter()
    ds_accuracy = accuracy_score(y_sentiment_dataset, ds_predictions)
    ds_f1 = f1_score(y_sentiment_dataset, ds_predictions, average='weighted', zero_division=0)
    print(f'Performance:')
    print(f'- Accuracy: {ds_accuracy*100:.2f}')
    print(f'- F1: {ds_f1*100:.2f}')
    print(f'- Test Prediction Time: {ds_end - ds_start:.2f}s')
    print(f'- Classification Report:')
    print(classification_report(y_sentiment_dataset, ds_predictions, zero_division=0))
    print()


Testing Naive Bayes Model (Rating) on Amazon Fine Food Reviews...
Performance:
- Accuracy: 63.27
- F1: 50.91
- Test Prediction Time: 0.17s
- Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00     52268
           2       0.00      0.00      0.00     29769
           3       0.00      0.00      0.00     42640
           4       0.22      0.05      0.08     80655
           5       0.65      0.98      0.78    363122

    accuracy                           0.63    568454
   macro avg       0.17      0.21      0.17    568454
weighted avg       0.44      0.63      0.51    568454



Testing Naive Bayes Model (Sentiment) on Amazon Fine Food Reviews...
Performance:
- Accuracy: 78.07
- F1: 68.45
- Test Prediction Time: 0.11s
- Classification Report:
              precision    recall  f1-score   support

           1       1.00      0.00      0.00     82037
           2       0.00      0.00      0.00     42640
           3  