# Decision Tree with Extensive Hyperparameter Tuning

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from functools import reduce
import re
import numpy as np

In [2]:
# Load the reviews.csv file into a Pandas dataframe
reviews_df = pd.read_csv('tripadvisor_hotel_reviews.csv')

In [3]:
def clean_review_info(text):
    text = re.sub(r'[0-9]+', '', text)
    repls = ('.', ' '), ('did n\'t', 'didn\'t'), ('wo n\'t', 'won\'t'), ('do n\'t', 'don\'t'), ('n\'t', ''), ('*', ''), (',', ' '), ('\'', ' '), ('-', ' ')
    return reduce(lambda a, kv: a.replace(*kv), repls, text)

def recode_score(score):
    if score in [1, 2]:
        return 1
    elif score == 3:
        return 2
    elif score in [4, 5]:
        return 3

In [4]:
# Extract the review text and the corresponding scores from the dataframe
reviews_df['Sentiment Score'] = reviews_df['Rating'].apply(recode_score)
reviews_df['Review'] = reviews_df['Review'].apply(clean_review_info)

In [5]:
vectorizer = TfidfVectorizer(
    min_df = 5,          # Minimum document frequency (i.e. ignore all words with less than 5 occurrences)
    max_df = 0.8,        # Maximum document frequency (i.e. ignore all words that account for 80% of the corpus size)
    sublinear_tf = True, # Apply sublinear term frequency scaling
    ngram_range=(1,3)    
)

In [6]:
no_of_reviews = len(reviews_df)
sections = [int(0.8 * no_of_reviews), int(0.9 * no_of_reviews)]

reviews_train, reviews_test, reviews_val = np.split(
    ary = reviews_df["Review"],             # Array to split (i.e. our DataFrame of reviews)
    indices_or_sections = sections          # Sections to split (i.e. split at 80% and 90% mark)
)
vectorizer.fit(reviews_train)
X_train, X_test, X_val = (
    vectorizer.transform(reviews_train),
    vectorizer.transform(reviews_test),
    vectorizer.transform(reviews_val),
)
y_rating_train, y_rating_test, y_rating_val = np.split(
    ary = reviews_df["Rating"],             # Array to split (i.e. our DataFrame of reviews)
    indices_or_sections = sections          # Sections to split (i.e. split at 80% and 90% mark)
)
y_sentiment_train, y_sentiment_test, y_sentiment_val = np.split(
    ary = reviews_df["Sentiment Score"],             # Array to split (i.e. our DataFrame of reviews)
    indices_or_sections = sections          # Sections to split (i.e. split at 80% and 90% mark)
)

In [7]:
# Define the hyperparameters to tune
params = {'max_depth': [5, 10, 20, None],
          'min_samples_split': [2, 5, 10],
          'min_samples_leaf': [1, 2, 4]}

In [8]:
# Create an instance of the DecisionTreeClassifier class
rating_clf = DecisionTreeClassifier()

# Create a GridSearchCV object to search over the hyperparameters
dt_rating_grid_search = GridSearchCV(rating_clf, params, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
dt_rating_grid_search.fit(X_train, y_rating_train)

# Use the best estimator to make predictions on the testing data
dt_rating_best_clf = dt_rating_grid_search.best_estimator_
dt_rating_test_predictions = dt_rating_best_clf.predict(X_test)
print(dt_rating_test_predictions, end='\n\n')
# Print the classification report
print(classification_report(y_rating_test, dt_rating_test_predictions))

[5 5 3 ... 4 4 4]

              precision    recall  f1-score   support

           1       0.35      0.34      0.34       103
           2       0.25      0.09      0.13       143
           3       0.20      0.16      0.18       207
           4       0.37      0.38      0.38       569
           5       0.65      0.72      0.69      1027

    accuracy                           0.51      2049
   macro avg       0.37      0.34      0.34      2049
weighted avg       0.49      0.51      0.49      2049



In [9]:
# Create an instance of the DecisionTreeClassifier class
sentiment_clf = DecisionTreeClassifier()

# Create a GridSearchCV object to search over the hyperparameters
dt_sentiment_grid_search = GridSearchCV(sentiment_clf, params, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
dt_sentiment_grid_search.fit(X_train, y_sentiment_train)

# Use the best estimator to make predictions on the testing data
dt_sentiment_best_clf = dt_sentiment_grid_search.best_estimator_
dt_sentiment_test_predictions = dt_sentiment_best_clf.predict(X_test)
print(dt_sentiment_test_predictions, end='\n\n')
# Print the classification report
print(classification_report(y_sentiment_test, dt_sentiment_test_predictions))

[3 3 1 ... 3 3 3]

              precision    recall  f1-score   support

           1       0.50      0.44      0.47       246
           2       0.31      0.10      0.15       207
           3       0.85      0.94      0.89      1596

    accuracy                           0.79      2049
   macro avg       0.55      0.49      0.50      2049
weighted avg       0.75      0.79      0.76      2049



In [10]:
dt_rating_val_predictions = dt_rating_best_clf.predict(X_val)
print(dt_rating_val_predictions, end='\n\n')
# Print the classification report
print(classification_report(y_rating_val, dt_rating_val_predictions))

[4 5 5 ... 4 2 5]

              precision    recall  f1-score   support

           1       0.37      0.45      0.41       114
           2       0.32      0.11      0.16       166
           3       0.23      0.17      0.20       204
           4       0.40      0.39      0.39       585
           5       0.64      0.75      0.69       981

    accuracy                           0.52      2050
   macro avg       0.39      0.37      0.37      2050
weighted avg       0.49      0.52      0.50      2050



In [11]:
dt_sentiment_val_predictions = dt_sentiment_best_clf.predict(X_val)
print(dt_sentiment_val_predictions, end='\n\n')
# Print the classification report
print(classification_report(y_sentiment_val, dt_sentiment_val_predictions))

[1 3 3 ... 1 3 3]

              precision    recall  f1-score   support

           1       0.56      0.52      0.54       280
           2       0.19      0.05      0.08       204
           3       0.85      0.93      0.89      1566

    accuracy                           0.79      2050
   macro avg       0.53      0.50      0.50      2050
weighted avg       0.74      0.79      0.76      2050



## Evaluation of Decision Tree on Other Datasets

In [12]:
fine_food_df = pd.read_csv("./amazon_fine_food_reviews.csv")
play_store_df = pd.read_csv("./google_play_store_reviews.csv")
clothing_df = pd.read_csv("./Womens Clothing E-Commerce Reviews.csv")
mobile_phones_df = pd.read_csv("./Amazon_Unlocked_Mobile.csv")

In [13]:
cleaned_clothing_df = clothing_df.dropna(subset=["Review Text"])
cleaned_clothing_df.info()

cleaned_mobile_phones_df = mobile_phones_df.dropna(subset=["Reviews"])
cleaned_mobile_phones_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22641 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               22641 non-null  int64 
 1   Clothing ID              22641 non-null  int64 
 2   Age                      22641 non-null  int64 
 3   Title                    19675 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   22641 non-null  int64 
 6   Recommended IND          22641 non-null  int64 
 7   Positive Feedback Count  22641 non-null  int64 
 8   Division Name            22628 non-null  object
 9   Department Name          22628 non-null  object
 10  Class Name               22628 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.1+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 413778 entries, 0 to 413839
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 

In [14]:
fine_food_vectorized = vectorizer.transform(fine_food_df["Text"])
play_store_vectorized = vectorizer.transform(play_store_df["content"])
clothing_vectorized = vectorizer.transform(cleaned_clothing_df["Review Text"])
mobile_phones_vectorized = vectorizer.transform(cleaned_mobile_phones_df["Reviews"])

y_rating_ff = fine_food_df["Score"]
y_rating_ps = play_store_df["score"]
y_rating_cl = cleaned_clothing_df["Rating"]
y_rating_mp = cleaned_mobile_phones_df["Rating"]

In [15]:
def map_rating_to_sentiment_score(rating):
    if rating in [1,2]:
        return 1
    elif rating in [3]:
        return 2
    elif rating in [4,5]:
        return 3
    return -1

y_sentiment_ff = y_rating_ff.apply(map_rating_to_sentiment_score)    
y_sentiment_ps = y_rating_ps.apply(map_rating_to_sentiment_score)
y_sentiment_cl = y_rating_cl.apply(map_rating_to_sentiment_score)    
y_sentiment_mp = y_rating_mp.apply(map_rating_to_sentiment_score)

In [16]:
dataset_names = [
    'Amazon Fine Food Reviews',
    'Google Play Store Reviews',
    'Women\'s E-Commerce Clothing Reviews',
    'Amazon Reviews of Unlocked Mobile Phones'
]
dataset_x = {
    'Amazon Fine Food Reviews': fine_food_vectorized,
    'Google Play Store Reviews': play_store_vectorized,
    'Women\'s E-Commerce Clothing Reviews': clothing_vectorized,
    'Amazon Reviews of Unlocked Mobile Phones': mobile_phones_vectorized
}
dataset_y_rating = {
    'Amazon Fine Food Reviews': y_rating_ff,
    'Google Play Store Reviews': y_rating_ps,
    'Women\'s E-Commerce Clothing Reviews': y_rating_cl,
    'Amazon Reviews of Unlocked Mobile Phones': y_rating_mp
}
dataset_y_sentiment = {
    'Amazon Fine Food Reviews': y_sentiment_ff,
    'Google Play Store Reviews': y_sentiment_ps,
    'Women\'s E-Commerce Clothing Reviews': y_sentiment_cl,
    'Amazon Reviews of Unlocked Mobile Phones': y_sentiment_mp
}

In [20]:
import time
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [22]:
for dataset_name in dataset_names:
    X_dataset = dataset_x[dataset_name]
    y_rating_dataset = dataset_y_rating[dataset_name]
    y_sentiment_dataset = dataset_y_sentiment[dataset_name]
    
    ds_start = time.perf_counter()
    print(f'\nTesting Decision Tree Model (Rating) on {dataset_name}...')
    ds_predictions = dt_rating_best_clf.predict(X_dataset)
    ds_end = time.perf_counter()
    ds_accuracy = accuracy_score(y_rating_dataset, ds_predictions)
    ds_f1 = f1_score(y_rating_dataset, ds_predictions, average='weighted', zero_division=0)
    print(f'Performance:')
    print(f'- Accuracy: {ds_accuracy*100:.2f}')
    print(f'- F1: {ds_f1*100:.2f}')
    print(f'- Test Prediction Time: {ds_end - ds_start:.2f}s')
    print(f'- Classification Report:')
    print(classification_report(y_rating_dataset, ds_predictions, zero_division=0))
    print()


    ds_start = time.perf_counter()
    print(f'\nTesting Decision Tree Model (Sentiment) on {dataset_name}...')
    ds_predictions = dt_sentiment_best_clf.predict(X_dataset)
    ds_end = time.perf_counter()
    ds_accuracy = accuracy_score(y_sentiment_dataset, ds_predictions)
    ds_f1 = f1_score(y_sentiment_dataset, ds_predictions, average='weighted', zero_division=0)
    print(f'Performance:')
    print(f'- Accuracy: {ds_accuracy*100:.2f}')
    print(f'- F1: {ds_f1*100:.2f}')
    print(f'- Test Prediction Time: {ds_end - ds_start:.2f}s')
    print(f'- Classification Report:')
    print(classification_report(y_sentiment_dataset, ds_predictions, zero_division=0))
    print()


Testing Decision Tree Model (Rating) on Amazon Fine Food Reviews...
Performance:
- Accuracy: 53.02
- F1: 52.19
- Test Prediction Time: 0.17s
- Classification Report:
              precision    recall  f1-score   support

           1       0.25      0.27      0.26     52268
           2       0.15      0.01      0.01     29769
           3       0.18      0.12      0.15     42640
           4       0.20      0.29      0.24     80655
           5       0.71      0.71      0.71    363122

    accuracy                           0.53    568454
   macro avg       0.30      0.28      0.27    568454
weighted avg       0.52      0.53      0.52    568454



Testing Decision Tree Model (Sentiment) on Amazon Fine Food Reviews...
Performance:
- Accuracy: 76.22
- F1: 72.44
- Test Prediction Time: 0.13s
- Classification Report:
              precision    recall  f1-score   support

           1       0.36      0.27      0.31     82037
           2       0.23      0.02      0.04     42640
          