# Random Forest

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from functools import reduce
import re
import numpy as np

In [3]:
# Load the TripAdvisor hotel reviews CSV into a pandas DataFrame.
# The path is relative to the notebook's working directory.
# Later cells read the 'Review' and 'Rating' columns of this frame.
reviews_df = pd.read_csv('tripadvisor_hotel_reviews.csv')

In [4]:
# Collapse a 1-5 rating into three coarse sentiment classes
def recode_score(score):
    """Map a rating value onto a three-level sentiment label.

    Args:
        score: Rating value to recode; 1 through 5 are the recognised values.

    Returns:
        1 for ratings 1 and 2, 2 for rating 3, 3 for ratings 4 and 5
        (presumably negative / neutral / positive), and None for any
        other value.
    """
    # Each entry pairs the ratings that share a label with that label.
    sentiment_by_ratings = (
        ((1, 2), 1),
        ((3,), 2),
        ((4, 5), 3),
    )
    for ratings, sentiment in sentiment_by_ratings:
        if score in ratings:
            return sentiment
    # Unrecognised ratings fall through without a label.
    return None

In [5]:
# Derive the three-class 'Sentiment Score' column by recoding every 'Rating' with recode_score.
# Note: this adds the column to reviews_df in place.
reviews_df['Sentiment Score'] = reviews_df['Rating'].apply(recode_score)
# Disabled text-cleaning step; clean_review_info is not defined in the visible cells.
#reviews_df['Review'] = reviews_df['Review'].apply(clean_review_info)

In [7]:
# TF-IDF vectoriser configuration; it is fitted in a later cell on the training reviews only.
vectorizer = TfidfVectorizer(
    min_df = 5,          # Drop terms that appear in fewer than 5 documents
    max_df = 0.8,        # Drop terms that appear in more than 80% of the documents
    sublinear_tf = True, # Apply sublinear term frequency scaling (tf -> 1 + log(tf))
    ngram_range=(1,3)    # Extract 1-grams, 2-grams and 3-grams
)

In [8]:
no_of_reviews = len(reviews_df)
# Positional cut points: rows [0, 80%) -> train, [80%, 90%) -> test, [90%, end) -> validation.
sections = [int(0.8 * no_of_reviews), int(0.9 * no_of_reviews)]


def split_at_positions(series, boundaries):
    """Cut a pandas Series into consecutive positional chunks.

    Uses ``.iloc`` slicing rather than ``np.split``: calling ``np.split`` on a
    pandas object goes through the deprecated ``swapaxes`` method, which emits
    a FutureWarning from pandas 2.1 onwards.

    Args:
        series: The Series to cut.
        boundaries: Ascending row positions at which a new chunk begins.

    Returns:
        A list of ``len(boundaries) + 1`` Series slices in row order, each
        keeping its original index labels.
    """
    starts = [0, *boundaries]
    stops = [*boundaries, len(series)]
    return [series.iloc[start:stop] for start, stop in zip(starts, stops)]


# NOTE(review): rows are split in file order, with no shuffling or stratification.
# Confirm the CSV is not ordered (e.g. by rating) before trusting the metrics.
reviews_train, reviews_test, reviews_val = split_at_positions(reviews_df["Review"], sections)

# Fit the vectoriser on the training reviews only, then reuse it for all three splits.
vectorizer.fit(reviews_train)
X_train, X_test, X_val = (
    vectorizer.transform(reviews_train),
    vectorizer.transform(reviews_test),
    vectorizer.transform(reviews_val),
)

# Targets are cut at the same positions so they stay aligned with the feature rows.
y_rating_train, y_rating_test, y_rating_val = split_at_positions(
    reviews_df["Rating"], sections
)
y_sentiment_train, y_sentiment_test, y_sentiment_val = split_at_positions(
    reviews_df["Sentiment Score"], sections
)

In [12]:
# Alternative TF-IDF configuration, kept for reference only (disabled):
# tfidf_vectorizer = TfidfVectorizer(
#     min_df=5, max_df=0.9,
#     ngram_range=(1,1),
#     stop_words='english',
#     use_idf=True, smooth_idf=True, sublinear_tf=True
# )

# Random forest for the original 'Rating' target: 100 trees, fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
rfc_rating = RandomForestClassifier(random_state=42, n_estimators=100).fit(
    X_train, y_rating_train
)

# Predict the rating for every review in the test split
rfc_rating_test_predictions = rfc_rating.predict(X_test)

# Report per-class precision / recall / F1 on the test split
print(
    classification_report(
        y_true=y_rating_test,
        y_pred=rfc_rating_test_predictions,
    )
)

              precision    recall  f1-score   support

           1       0.64      0.38      0.48       103
           2       0.18      0.01      0.03       143
           3       0.33      0.00      0.01       207
           4       0.40      0.34      0.37       569
           5       0.63      0.91      0.74      1027

    accuracy                           0.57      2049
   macro avg       0.44      0.33      0.33      2049
weighted avg       0.51      0.57      0.50      2049



In [13]:
# Random forest for the recoded 'Sentiment Score' target: 100 trees, fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
rfc_sentiment = RandomForestClassifier(random_state=42, n_estimators=100).fit(
    X_train, y_sentiment_train
)

# Predict the sentiment class for every review in the test split
rfc_sentiment_predictions = rfc_sentiment.predict(X_test)

# Report per-class precision / recall / F1 on the test split
print(
    classification_report(
        y_true=y_sentiment_test,
        y_pred=rfc_sentiment_predictions,
    )
)

              precision    recall  f1-score   support

           1       0.86      0.41      0.56       246
           2       1.00      0.00      0.01       207
           3       0.83      1.00      0.90      1596

    accuracy                           0.83      2049
   macro avg       0.90      0.47      0.49      2049
weighted avg       0.85      0.83      0.77      2049



In [14]:
# Score the validation split (X_val) with the already-fitted rating classifier
rfc_rating_val_predictions = rfc_rating.predict(X_val)

# Print per-class precision / recall / F1 against the true validation ratings
print(classification_report(y_rating_val, rfc_rating_val_predictions))

              precision    recall  f1-score   support

           1       0.68      0.46      0.55       114
           2       0.45      0.06      0.11       166
           3       0.33      0.00      0.01       204
           4       0.40      0.33      0.36       585
           5       0.62      0.92      0.74       981

    accuracy                           0.57      2050
   macro avg       0.50      0.35      0.35      2050
weighted avg       0.52      0.57      0.50      2050



In [15]:
# Score the validation split (X_val) with the already-fitted sentiment classifier
rfc_sentiment_val_predictions = rfc_sentiment.predict(X_val)

# Print per-class precision / recall / F1 against the true validation labels.
# The recorded output shows class 2 is never predicted on this split, so its
# precision is undefined. zero_division=0 reports it as 0.00, the same value the
# default "warn" setting produces, without the UndefinedMetricWarning messages.
print(
    classification_report(
        y_sentiment_val,
        rfc_sentiment_val_predictions,
        zero_division=0,
    )
)

              precision    recall  f1-score   support

           1       0.88      0.46      0.61       280
           2       0.00      0.00      0.00       204
           3       0.82      1.00      0.90      1566

    accuracy                           0.82      2050
   macro avg       0.57      0.49      0.50      2050
weighted avg       0.75      0.82      0.77      2050



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
