This notebook uses the Disneyland Reviews dataset, available from Kaggle at:
https://www.kaggle.com/datasets/arushchillar/disneyland-reviews

In [None]:
# How to load a saved results dictionary (example: the Naive Bayes results)
#read_dictionary = np.load('nbDict.npy', allow_pickle=True).item()

## Initialisations

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from tqdm import trange
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from collections import Counter
from textblob import TextBlob

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer

#for model-building
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC

from sklearn.metrics import *

In [None]:
import warnings

# Silence all warnings for the rest of the notebook and fetch the
# 'omw-1.4' NLTK resource without progress output.
warnings.filterwarnings('ignore')
nltk.download('omw-1.4', quiet=True)

# Plot appearance: seaborn grid style, default figure size / font size, colour palette.
sns.set_style('darkgrid')
plt.rcParams.update({
    'figure.figsize': (17,7),
    'font.size': 12,
})
sns.set_palette("Paired")

## Load Dataset

In [None]:
# Turn the Google Drive sharing link into a direct-download link:
# the file id is the second-to-last path segment of the sharing URL.
share_link = 'https://drive.google.com/file/d/1BEkeqcbJtVcgWbawPxJVVV2kmwRhYbTV/view?usp=sharing'
file_id = share_link.split('/')[-2]
url = 'https://drive.google.com/uc?id=' + file_id

data = pd.read_csv(url)
data.head(1)

In [None]:
# Balance the classes: keep the same number of reviews for every reviewer location.
# NOTE(review): 739 is presumably the size of the smallest location group - confirm.
SAMPLES_PER_LOCATION = 739
SAMPLE_SEED = 64

# Each group is sampled independently with the same seed. Iterating over the
# groups explicitly (instead of groupby(...).apply) keeps the 'reviewerLocation'
# column in every sampled frame and avoids relying on groupby.apply operating
# on the grouping column, which recent pandas versions deprecate.
data = pd.concat(
    [group.sample(SAMPLES_PER_LOCATION, random_state=SAMPLE_SEED)
     for _, group in data.groupby('reviewerLocation')]
)
data = data.reset_index(drop=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the location names in 'reviewerLocation' with integer class codes:
# 0 - Australia, 1 - United Kingdom, 2 - United States
labelEncoder = LabelEncoder()
encoded_locations = labelEncoder.fit_transform(data['reviewerLocation'])
data['reviewerLocation'] = encoded_locations

## Preprocess Text

In [None]:
# Preprocessing pipeline from https://www.kaggle.com/code/balatmak/text-preprocessing-steps-and-universal-pipeline
import numpy as np
import multiprocessing as mp

import string
import spacy
import en_core_web_sm
from nltk.tokenize import word_tokenize
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn import preprocessing

# Load the en_core_web_sm pipeline once at module level; TextPreprocessor._preprocess_text
# calls this shared `nlp` object for every text it processes.
nlp = en_core_web_sm.load()


class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self,
                 variety="BrE",
                 user_abbrevs={},
                 n_jobs=1):
        """
        Text preprocessing transformer includes steps:
            1. Text normalization
            2. Punctuation removal
            3. Stop words removal
            4. Lemmatization

        variety - format of date (AmE - american type, BrE - british format)
        user_abbrevs - dict of user abbreviations mappings (from normalise package)
        n_jobs - parallel jobs to run:
                 -1 or lower uses one partition per CPU core,
                 0 processes everything serially in the current process,
                 a positive value is capped at the number of CPU cores
        """
        # The shared default dict for user_abbrevs is only read, never mutated, by this class.
        self.variety = variety
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X, *_):
        """
        Preprocess every text in X and return the cleaned texts.

        X - pandas Series of texts (it is copied, so the input is left untouched)

        Returns a Series of space-joined lemmas, one entry per input text.
        """
        X_copy = X.copy()

        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            partitions = 1
        else:
            partitions = min(self.n_jobs, cores)

        # A single partition gains nothing from worker processes, so skip the pool.
        if partitions <= 1:
            return X_copy.apply(self._preprocess_text)

        data_split = np.array_split(X_copy, partitions)
        # One worker per partition is enough; the pool is always shut down,
        # even when a worker raises, so no processes are leaked.
        pool = mp.Pool(partitions)
        try:
            data = pd.concat(pool.map(self._preprocess_part, data_split))
        finally:
            pool.close()
            pool.join()

        return data

    def _preprocess_part(self, part):
        """Preprocess one partition (a Series of texts); runs inside a pool worker."""
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        """Normalise, tokenise with the module-level `nlp`, filter tokens and lemmatise one text."""
        normalized_text = self._normalize(text)
        doc = nlp(normalized_text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _normalize(self, text):
        """Best-effort text normalisation; returns the text unchanged when normalisation fails."""
        # some issues in normalise package
        # NOTE(review): in this notebook `preprocessing` is sklearn.preprocessing,
        # whose normalize() does not take variety/user_abbrevs/verbose, so this call
        # appears to always raise and the fallback below is what actually runs.
        # Confirm whether normalise.normalise was intended here.
        try:
            return ' '.join(preprocessing.normalize(text, variety=self.variety, user_abbrevs=self.user_abbrevs, verbose=False))
        except Exception:
            # Only ordinary errors are swallowed; KeyboardInterrupt/SystemExit still propagate.
            return text

    def _remove_punct(self, doc):
        """Drop tokens whose text occurs inside string.punctuation (a substring test)."""
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        """Drop tokens flagged as stop words via their is_stop attribute."""
        return [t for t in doc if not t.is_stop]

    def _lemmatize(self, doc):
        """Join the lemmas of the remaining tokens into one space-separated string."""
        return ' '.join([t.lemma_ for t in doc])

In [None]:
# Every column from position 5 onwards is treated as a text column to preprocess.
columns = list(data.columns[5:])

# The transformer keeps no fitted state, so one instance serves all columns.
text_preprocessor = TextPreprocessor(n_jobs=-1)
for column in columns:
  data[column] = text_preprocessor.transform(data[column])

In [None]:
# Show the first row to inspect the result of the preprocessing step
data.head(1)

In [None]:
# Summary of the frame: column names, dtypes and non-null counts
data.info()

## Naive Bayes Model

In [None]:
# Naive Bayes model and evaluation
def nb_model(data, column, random_state=None):
  """
  Train and evaluate a TF-IDF + Multinomial Naive Bayes classifier on one text column.

  The rows are split into train and test sets, the vectoriser settings are
  tuned with a 5-fold grid search on the training set, and the best pipeline
  found by the search is scored on the held-out test set.

  data - DataFrame with the text column and a 'reviewerLocation' label column
  column - name of the text column used as model input
  random_state - optional seed for the train/test split; the default (None)
                 gives a different split on every call

  Returns (metrics, y_test, predictions) where metrics is
  [accuracy, weighted f1, weighted precision, weighted recall].
  """
  X, y = data[column], data['reviewerLocation']
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

  tvc_pipe = Pipeline([
       ('tvec', TfidfVectorizer()),
       ('mb', MultinomialNB())
       ])

  # Setting up grid search params
  tf_params = {
      'tvec__max_features':[100, 2000],
      'tvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
      'tvec__stop_words': [None, 'english'],
      }

  # Fitting best parameters to the model.
  # The grid search clones and fits the pipeline itself, so the pipeline is
  # not fitted separately beforehand (that fit would simply be thrown away).
  tvc_gs = GridSearchCV(tvc_pipe, param_grid=tf_params, cv = 5, verbose =0, n_jobs = -1)
  tvc_gs.fit(X_train, y_train)

  # Model Evaluation
  tvc_gs_pred = tvc_gs.predict(X_test)
  accuracy = accuracy_score(y_test, tvc_gs_pred)
  f1 = f1_score(y_test, tvc_gs_pred, average='weighted')
  precision = precision_score(y_test, tvc_gs_pred, average='weighted')
  recall = recall_score(y_test, tvc_gs_pred, average='weighted')
  metrics = [accuracy, f1, precision, recall]
  return metrics, y_test, tvc_gs_pred

In [None]:
import statistics

def run_nb_model(data, column, n_runs=5):
  """
  Repeat nb_model several times and summarise the results.

  data - DataFrame passed through to nb_model
  column - name of the text column passed through to nb_model
  n_runs - number of independent runs (default 5); at least 2 are needed
           because a sample variance is computed over the runs

  Returns (metricsMean, metricsVariance, labels):
    metricsMean - per-metric mean over the runs
    metricsVariance - per-metric sample variance over the runs
    labels - DataFrame with columns trueLabels1, predLabels1, trueLabels2, ...
             holding the test labels and predictions of every run
  """
  if n_runs < 2:
    raise ValueError("n_runs must be at least 2 to compute a variance")

  # Each run returns (metrics, true labels, predicted labels)
  runs = [nb_model(data, column) for _ in range(n_runs)]

  # Calculate mean and variance of each metric over each run
  metricsMean = []
  metricsVariance = []
  for values in zip(*[run_metrics for run_metrics, _, _ in runs]):
    metricsMean.append(sum(values)/len(values))
    metricsVariance.append(statistics.variance(values))

  # Create dataframe of y_test/ predicted labels for each run
  label_columns = []
  label_names = []
  for run_number, (_, true_labels, pred_labels) in enumerate(runs, start=1):
    label_columns.extend([true_labels, pred_labels])
    label_names.extend(['trueLabels{}'.format(run_number), 'predLabels{}'.format(run_number)])

  labels = pd.DataFrame(list(zip(*label_columns)), columns=label_names)
  return metricsMean, metricsVariance, labels

In [None]:
# Collect the Naive Bayes results for every preprocessed text column
nb = {}
for column in columns:
  run_mean, run_variance, run_labels = run_nb_model(data, column)
  nb.update({
      column + "MetricsMean": run_mean,
      column + "MetricsVariance": run_variance,
      column + "Labels": run_labels,
  })
  print("Column {} Complete!".format(column))
  print(run_mean)

In [None]:
# Save Dictionary of Results
# np.save stores the dict as a pickled object array, so it has to be read back
# with np.load('nbDict.npy', allow_pickle=True).item()
np.save('nbDict.npy', nb)

## Logistic Regression Model

In [None]:
# Logistic Regression model and evaluation
def lr_model(data, column, random_state=None):
  """
  Train and evaluate a TF-IDF + Logistic Regression classifier on one text column.

  The rows are split into train and test sets, the regularisation settings
  are tuned with a 5-fold grid search on the training set, and the best
  pipeline found by the search is scored on the held-out test set.

  data - DataFrame with the text column and a 'reviewerLocation' label column
  column - name of the text column used as model input
  random_state - optional seed for the train/test split and the classifier;
                 the default (None) gives a different split on every call

  Returns (metrics, y_test, predictions) where metrics is
  [accuracy, weighted f1, weighted precision, weighted recall].
  """
  X, y = data[column], data['reviewerLocation']
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

  lr_pipe = Pipeline([
       ('tvec', TfidfVectorizer()),
       ('lr', LogisticRegression(random_state=random_state))
       ])

  # Setting up grid search params
  shared_params = {
      'tvec__max_features':[2000],
      'tvec__ngram_range': [(1, 2)],
      'tvec__stop_words': ['english'],
      'lr__C': [0.1, 1, 10, 100]
      }
  # The default 'lbfgs' solver only supports the l2 penalty, so searching
  # 'l1' with it makes every l1 candidate fail to fit. Each penalty is
  # therefore paired with a solver that supports it: l2 stays on 'lbfgs'
  # (unchanged), l1 runs on 'saga' with a larger iteration budget.
  lr_params = [
      dict(shared_params, lr__penalty=['l2'], lr__solver=['lbfgs']),
      dict(shared_params, lr__penalty=['l1'], lr__solver=['saga'], lr__max_iter=[1000]),
      ]

  # Fitting best parameters to the model.
  # The grid search clones and fits the pipeline itself, so the pipeline is
  # not fitted separately beforehand (that fit would simply be thrown away).
  lr_gs = GridSearchCV(lr_pipe, param_grid=lr_params, cv = 5, verbose = 0, n_jobs = -1)
  lr_gs.fit(X_train, y_train)

  # Model Evaluation
  lr_gs_pred = lr_gs.predict(X_test)
  accuracy = accuracy_score(y_test, lr_gs_pred)
  f1 = f1_score(y_test, lr_gs_pred, average='weighted')
  precision = precision_score(y_test, lr_gs_pred, average='weighted')
  recall = recall_score(y_test, lr_gs_pred, average='weighted')
  metrics = [accuracy, f1, precision, recall]
  return metrics, y_test, lr_gs_pred

In [None]:
import statistics

def run_lr_model(data, column, n_runs=5):
  """
  Repeat lr_model several times and summarise the results.

  data - DataFrame passed through to lr_model
  column - name of the text column passed through to lr_model
  n_runs - number of independent runs (default 5); at least 2 are needed
           because a sample variance is computed over the runs

  Returns (metricsMean, metricsVariance, labels):
    metricsMean - per-metric mean over the runs
    metricsVariance - per-metric sample variance over the runs
    labels - DataFrame with columns trueLabels1, predLabels1, trueLabels2, ...
             holding the test labels and predictions of every run
  """
  if n_runs < 2:
    raise ValueError("n_runs must be at least 2 to compute a variance")

  # Each run returns (metrics, true labels, predicted labels)
  runs = [lr_model(data, column) for _ in range(n_runs)]

  # Calculate mean and variance of each metric over each run
  metricsMean = []
  metricsVariance = []
  for values in zip(*[run_metrics for run_metrics, _, _ in runs]):
    metricsMean.append(sum(values)/len(values))
    metricsVariance.append(statistics.variance(values))

  # Create dataframe of y_test/ predicted labels for each run
  label_columns = []
  label_names = []
  for run_number, (_, true_labels, pred_labels) in enumerate(runs, start=1):
    label_columns.extend([true_labels, pred_labels])
    label_names.extend(['trueLabels{}'.format(run_number), 'predLabels{}'.format(run_number)])

  labels = pd.DataFrame(list(zip(*label_columns)), columns=label_names)
  return metricsMean, metricsVariance, labels

In [None]:
# Collect the Logistic Regression results for every preprocessed text column
lr = {}
for column in columns:
  run_mean, run_variance, run_labels = run_lr_model(data, column)
  lr.update({
      column + "MetricsMean": run_mean,
      column + "MetricsVariance": run_variance,
      column + "Labels": run_labels,
  })
  print("Column {} Complete!".format(column))
  print(run_mean)

In [None]:
# Save Dictionary of Results
# np.save stores the dict as a pickled object array, so it has to be read back
# with np.load('lrDict.npy', allow_pickle=True).item()
np.save('lrDict.npy', lr)

## Support Vector Machine Model

In [None]:
# Support Vector Machine model and evaluation
def svm_model(data, column, random_state=None):
  """
  Train and evaluate a TF-IDF + RBF-kernel SVM classifier on one text column.

  The rows are split into train and test sets, C and gamma are tuned with a
  5-fold grid search on the training set, and the best pipeline found by the
  search is scored on the held-out test set.

  data - DataFrame with the text column and a 'reviewerLocation' label column
  column - name of the text column used as model input
  random_state - optional seed for the train/test split; the default (None)
                 gives a different split on every call

  Returns (metrics, y_test, predictions) where metrics is
  [accuracy, weighted f1, weighted precision, weighted recall].
  """
  X, y = data[column], data['reviewerLocation']
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

  svm_pipe = Pipeline([
       ('tvec', TfidfVectorizer()),
       ('svm', SVC())
       ])

  # Setting up grid search params
  svm_params = {
      'tvec__max_features':[2000],
      'tvec__ngram_range': [(1, 2)],
      'tvec__stop_words': ['english'],
      'svm__C': [0.1, 1, 10, 100],
      'svm__gamma': [1, 0.1, 0.01, 0.001],
      'svm__kernel': ['rbf']
      }

  # Fitting best parameters to the model.
  # The grid search clones and fits the pipeline itself, so the pipeline is
  # not fitted separately beforehand (that fit would simply be thrown away).
  svm_gs = GridSearchCV(svm_pipe, param_grid=svm_params, cv = 5, verbose = 0, n_jobs = -1)
  svm_gs.fit(X_train, y_train)

  # Model Evaluation
  svm_gs_pred = svm_gs.predict(X_test)
  accuracy = accuracy_score(y_test, svm_gs_pred)
  f1 = f1_score(y_test, svm_gs_pred, average='weighted')
  precision = precision_score(y_test, svm_gs_pred, average='weighted')
  recall = recall_score(y_test, svm_gs_pred, average='weighted')
  metrics = [accuracy, f1, precision, recall]
  return metrics, y_test, svm_gs_pred

In [None]:
import statistics

def run_svm_model(data, column, n_runs=5):
  """
  Repeat svm_model several times and summarise the results.

  data - DataFrame passed through to svm_model
  column - name of the text column passed through to svm_model
  n_runs - number of independent runs (default 5); at least 2 are needed
           because a sample variance is computed over the runs

  Returns (metricsMean, metricsVariance, labels):
    metricsMean - per-metric mean over the runs
    metricsVariance - per-metric sample variance over the runs
    labels - DataFrame with columns trueLabels1, predLabels1, trueLabels2, ...
             holding the test labels and predictions of every run
  """
  if n_runs < 2:
    raise ValueError("n_runs must be at least 2 to compute a variance")

  # Each run returns (metrics, true labels, predicted labels)
  runs = [svm_model(data, column) for _ in range(n_runs)]

  # Calculate mean and variance of each metric over each run
  metricsMean = []
  metricsVariance = []
  for values in zip(*[run_metrics for run_metrics, _, _ in runs]):
    metricsMean.append(sum(values)/len(values))
    metricsVariance.append(statistics.variance(values))

  # Create dataframe of y_test/ predicted labels for each run
  label_columns = []
  label_names = []
  for run_number, (_, true_labels, pred_labels) in enumerate(runs, start=1):
    label_columns.extend([true_labels, pred_labels])
    label_names.extend(['trueLabels{}'.format(run_number), 'predLabels{}'.format(run_number)])

  labels = pd.DataFrame(list(zip(*label_columns)), columns=label_names)
  return metricsMean, metricsVariance, labels

In [None]:
# Collect the SVM results for every preprocessed text column
svm = {}
for column in columns:
  run_mean, run_variance, run_labels = run_svm_model(data, column)
  svm.update({
      column + "MetricsMean": run_mean,
      column + "MetricsVariance": run_variance,
      column + "Labels": run_labels,
  })
  print("Column {} Complete!".format(column))
  print(run_mean)

In [None]:
# Save Dictionary of Results
# np.save stores the dict as a pickled object array, so it has to be read back
# with np.load('svmDict.npy', allow_pickle=True).item()
np.save('svmDict.npy', svm)

## Random Forest Model

In [None]:
# Random Forest model and evaluation
def rf_model(data, column, random_state=None):
  """
  Train and evaluate a TF-IDF + Random Forest classifier on one text column.

  The rows are split into train and test sets, the pipeline is fitted through
  a 5-fold grid search on the training set, and the best pipeline found by
  the search is scored on the held-out test set.

  data - DataFrame with the text column and a 'reviewerLocation' label column
  column - name of the text column used as model input
  random_state - optional seed for the train/test split and the forest;
                 the default (None) gives different results on every call

  Returns (metrics, y_test, predictions) where metrics is
  [accuracy, weighted f1, weighted precision, weighted recall].
  """
  X, y = data[column], data['reviewerLocation']
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

  rf_pipe = Pipeline([
       ('tvec', TfidfVectorizer()),
       ('rf', RandomForestClassifier(random_state=random_state))
       ])

  # Setting up grid search params
  rf_params = {
      'tvec__max_features':[2000],
      'tvec__ngram_range': [(1, 2)],
      'tvec__stop_words': ['english'],
      'rf__max_depth': [1000],
      'rf__min_samples_split': [100],
      'rf__max_leaf_nodes': [None]
      }

  # Fitting best parameters to the model.
  # The grid search clones and fits the pipeline itself, so the pipeline is
  # not fitted separately beforehand (that fit would simply be thrown away).
  rf_gs = GridSearchCV(rf_pipe, param_grid=rf_params, cv = 5, verbose = 0, n_jobs = -1)
  rf_gs.fit(X_train, y_train)

  # Model Evaluation
  rf_gs_pred = rf_gs.predict(X_test)
  accuracy = accuracy_score(y_test, rf_gs_pred)
  f1 = f1_score(y_test, rf_gs_pred, average='weighted')
  precision = precision_score(y_test, rf_gs_pred, average='weighted')
  recall = recall_score(y_test, rf_gs_pred, average='weighted')
  metrics = [accuracy, f1, precision, recall]
  return metrics, y_test, rf_gs_pred

In [None]:
import statistics

def run_rf_model(data, column, n_runs=5):
  """
  Repeat rf_model several times and summarise the results.

  data - DataFrame passed through to rf_model
  column - name of the text column passed through to rf_model
  n_runs - number of independent runs (default 5); at least 2 are needed
           because a sample variance is computed over the runs

  Returns (metricsMean, metricsVariance, labels):
    metricsMean - per-metric mean over the runs
    metricsVariance - per-metric sample variance over the runs
    labels - DataFrame with columns trueLabels1, predLabels1, trueLabels2, ...
             holding the test labels and predictions of every run
  """
  if n_runs < 2:
    raise ValueError("n_runs must be at least 2 to compute a variance")

  # Each run returns (metrics, true labels, predicted labels)
  runs = [rf_model(data, column) for _ in range(n_runs)]

  # Calculate mean and variance of each metric over each run
  metricsMean = []
  metricsVariance = []
  for values in zip(*[run_metrics for run_metrics, _, _ in runs]):
    metricsMean.append(sum(values)/len(values))
    metricsVariance.append(statistics.variance(values))

  # Create dataframe of y_test/ predicted labels for each run
  label_columns = []
  label_names = []
  for run_number, (_, true_labels, pred_labels) in enumerate(runs, start=1):
    label_columns.extend([true_labels, pred_labels])
    label_names.extend(['trueLabels{}'.format(run_number), 'predLabels{}'.format(run_number)])

  labels = pd.DataFrame(list(zip(*label_columns)), columns=label_names)
  return metricsMean, metricsVariance, labels

In [None]:
# Collect the Random Forest results for every preprocessed text column
rf = {}
for column in columns:
  run_mean, run_variance, run_labels = run_rf_model(data, column)
  rf.update({
      column + "MetricsMean": run_mean,
      column + "MetricsVariance": run_variance,
      column + "Labels": run_labels,
  })
  print("Column {} Complete!".format(column))
  print(run_mean)

In [None]:
# Save Dictionary of Results
# np.save stores the dict as a pickled object array, so it has to be read back
# with np.load('rfDict.npy', allow_pickle=True).item()
np.save('rfDict.npy', rf)

## Gradient Boosting Model

In [None]:
# Gradient Boosting model and evaluation
def xg_model(data, column, random_state=None):
  """
  Train and evaluate a TF-IDF + XGBoost classifier on one text column.

  The rows are split into train and test sets, the number of estimators is
  tuned with a 5-fold grid search on the training set, and the best pipeline
  found by the search is scored on the held-out test set.

  data - DataFrame with the text column and a 'reviewerLocation' label column
  column - name of the text column used as model input
  random_state - optional seed for the train/test split and the booster;
                 the default (None) gives a different split on every call

  Returns (metrics, y_test, predictions) where metrics is
  [accuracy, weighted f1, weighted precision, weighted recall].
  """
  X, y = data[column], data['reviewerLocation']
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
  xg_pipe = Pipeline([
       ('tvec', TfidfVectorizer()),
       ('xg', XGBClassifier(random_state=random_state))
       ])

  # Setting up grid search params
  xg_params = {
      'tvec__max_features':[2000],
      'tvec__ngram_range': [(1, 2)],
      'tvec__stop_words': ['english'],
      'xg__n_estimators': [50, 100],
      'xg__learning_rate': [0.1]
      }

  # Fitting best parameters to the model.
  # The grid search clones and fits the pipeline itself, so the pipeline is
  # not fitted separately beforehand (that fit would simply be thrown away).
  xg_gs = GridSearchCV(xg_pipe, param_grid=xg_params, cv = 5, verbose = 0, n_jobs = -1)
  xg_gs.fit(X_train, y_train)

  # Model Evaluation
  xg_gs_pred = xg_gs.predict(X_test)
  accuracy = accuracy_score(y_test, xg_gs_pred)
  f1 = f1_score(y_test, xg_gs_pred, average='weighted')
  precision = precision_score(y_test, xg_gs_pred, average='weighted')
  recall = recall_score(y_test, xg_gs_pred, average='weighted')
  metrics = [accuracy, f1, precision, recall]
  return metrics, y_test, xg_gs_pred

In [None]:
import statistics

def run_xg_model(data, column, n_runs=5):
  """
  Repeat xg_model several times and summarise the results.

  data - DataFrame passed through to xg_model
  column - name of the text column passed through to xg_model
  n_runs - number of independent runs (default 5); at least 2 are needed
           because a sample variance is computed over the runs

  Returns (metricsMean, metricsVariance, labels):
    metricsMean - per-metric mean over the runs
    metricsVariance - per-metric sample variance over the runs
    labels - DataFrame with columns trueLabels1, predLabels1, trueLabels2, ...
             holding the test labels and predictions of every run
  """
  if n_runs < 2:
    raise ValueError("n_runs must be at least 2 to compute a variance")

  # Each run returns (metrics, true labels, predicted labels)
  runs = [xg_model(data, column) for _ in range(n_runs)]

  # Calculate mean and variance of each metric over each run
  metricsMean = []
  metricsVariance = []
  for values in zip(*[run_metrics for run_metrics, _, _ in runs]):
    metricsMean.append(sum(values)/len(values))
    metricsVariance.append(statistics.variance(values))

  # Create dataframe of y_test/ predicted labels for each run
  label_columns = []
  label_names = []
  for run_number, (_, true_labels, pred_labels) in enumerate(runs, start=1):
    label_columns.extend([true_labels, pred_labels])
    label_names.extend(['trueLabels{}'.format(run_number), 'predLabels{}'.format(run_number)])

  labels = pd.DataFrame(list(zip(*label_columns)), columns=label_names)
  return metricsMean, metricsVariance, labels

In [None]:
# Collect the Gradient Boosting results for every preprocessed text column
xg = {}
for column in columns:
  run_mean, run_variance, run_labels = run_xg_model(data, column)
  xg.update({
      column + "MetricsMean": run_mean,
      column + "MetricsVariance": run_variance,
      column + "Labels": run_labels,
  })
  print("Column {} Complete!".format(column))
  print(run_mean)

In [None]:
# Save Dictionary of Results
# np.save stores the dict as a pickled object array, so it has to be read back
# with np.load('xgDict.npy', allow_pickle=True).item()
np.save('xgDict.npy', xg)

## Word Importance

In [None]:
# Quick fit of pipelines outside of def for both original and summaries
X, y = data['originalReview'], data['reviewerLocation']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=64)

tvc_or_pipe = Pipeline([
     ('tvec', TfidfVectorizer()),
     ('mb', MultinomialNB())
     ])
tvc_or_pipe.fit(X_train, y_train)

rf_or_pipe = Pipeline([
     ('tvec', TfidfVectorizer()),
     ('rf', RandomForestClassifier())
     ])
rf_or_pipe.fit(X_train, y_train)

In [None]:
# Same quick fit as for the original reviews, this time on the summaries
X, y = data['summary'], data['reviewerLocation']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=64)

# Naive Bayes pipeline fitted on the summaries
tvc_su_pipe = Pipeline([
     ('tvec', TfidfVectorizer()),
     ('mb', MultinomialNB())
     ])
tvc_su_pipe.fit(X_train, y_train)

# Random forest pipeline fitted on the summaries.
# The forest is seeded like the split above; without a seed the feature
# importances read from it would change on every run of this cell.
rf_su_pipe = Pipeline([
     ('tvec', TfidfVectorizer()),
     ('rf', RandomForestClassifier(random_state=64))
     ])
rf_su_pipe.fit(X_train, y_train)

In [None]:
# Top 20 words by random-forest feature importance for the original reviews.
# The feature names are read from the vectoriser of the SAME pipeline as the
# forest, so each importance is guaranteed to line up with its own word
# (previously the names came from the separately fitted Naive Bayes pipeline).
original_title = (
    pd.DataFrame(
        rf_or_pipe.named_steps['rf'].feature_importances_,
        index=rf_or_pipe.named_steps['tvec'].get_feature_names_out(),
        columns=['originalImportance'])
    .sort_values('originalImportance', ascending = False)
    .head(20)
    .reset_index()
    .rename(columns={"index": "originalWord"})
)

# Same table for the summaries
summary_title = (
    pd.DataFrame(
        rf_su_pipe.named_steps['rf'].feature_importances_,
        index=rf_su_pipe.named_steps['tvec'].get_feature_names_out(),
        columns=['summaryImportance'])
    .sort_values('summaryImportance', ascending = False)
    .head(20)
    .reset_index()
    .rename(columns={"index": "summaryWord"})
)

# Side-by-side view: word/importance for originals next to word/importance for summaries
result = pd.concat([original_title, summary_title], axis=1)
result

In [None]:
# Two stacked bar charts sharing the importance axis: originals on top, summaries below
fig, axes = plt.subplots(2, 1, figsize=(8, 9), constrained_layout=True, sharex=True)
ax_original, ax_summary = axes
fig.suptitle('Word Importance Comparison')

sns.barplot(data=result, x='originalImportance', y='originalWord', palette='Paired', ax=ax_original)
ax_original.set_title('Top 20 Most Important Words For Classification of Original Reviews')

sns.barplot(data=result, x='summaryImportance', y='summaryWord', palette='Paired', ax=ax_summary)
ax_summary.set_title('Top 20 Most Important Words For Classification of Summary Reviews')