# CLUSTERING
## TRIPADVISOR: HOTELS

*   Esteban Ariza
*   Johan Giraldo
*   Mateo Valdes

## Prerequisites

### Install python libraries

In [17]:
# %pip install bertopic
# %pip install joblib==1.1.0

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Import python libraries

In [2]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

### Import data

In [5]:
NONSUST_HOTEL_DATA_PATH = '../data/qualitative_analysis/sentiment/tripadvisor_hotels_nonsustainable_sentiments.csv'
NONSUST_HOTEL_DATA = pd.read_csv(NONSUST_HOTEL_DATA_PATH)
NONSUST_HOTEL_DATA['HOTEL_SUSTAINABLE'] = 0

SUST_HOTEL_DATA_PATH = '../data/qualitative_analysis/sentiment/tripadvisor_hotels_sustainable_sentiments.csv'
SUST_HOTEL_DATA = pd.read_csv(SUST_HOTEL_DATA_PATH)
SUST_HOTEL_DATA['HOTEL_SUSTAINABLE'] = 1

# HOTEL_DATA = pd.concat([NONSUST_HOTEL_DATA, SUST_HOTEL_DATA])

## Models

### Algorithms

In [3]:
def train(comments):
    d_reduc_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
    clustering_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    vectorizer_model = CountVectorizer(ngram_range=(3, 3), stop_words="english", min_df=10)

    model = BERTopic(umap_model=d_reduc_model, hdbscan_model=clustering_model, ctfidf_model=ctfidf_model, vectorizer_model=vectorizer_model)
    topics, probabilities = model.fit_transform(comments)

    return model

### **Non Sustainable**

In [6]:
NONSUST_MODEL_FILE = "../data/clustering/BERTopic_nonsustainable_model"
NONSUST_COMMENTS = NONSUST_HOTEL_DATA["REVIEW_TEXT"].values.tolist()

#### Load

In [7]:
NONSUST_MODEL = BERTopic.load(NONSUST_MODEL_FILE)

#### Train

In [20]:
# NONSUST_MODEL = train(NONSUST_COMMENTS)

Save

In [21]:
# NONSUST_MODEL.save(NONSUST_MODEL_FILE)

### **Sustainable**

In [8]:
SUST_MODEL_FILE = "../data/clustering/BERTopic_sustainable_model"
SUST_COMMENTS = SUST_HOTEL_DATA["REVIEW_TEXT"].values.tolist()

#### Load

In [9]:
SUST_MODEL = BERTopic.load(SUST_MODEL_FILE)

#### Train

In [7]:
# SUST_MODEL = train(SUST_COMMENTS)

Save

In [8]:
# SUST_MODEL.save(SUST_MODEL_FILE)

## Testing

### **Non Sustainable**

In [23]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score

# x = NONSUST_HOTEL_DATA["REVIEW_TEXT"].values.tolist()
# y = NONSUST_HOTEL_DATA["HOTEL_SUSTAINABLE"].values.tolist()

# NONSUST_MODEL.get_topics()

{-1: [('great food good', 0.04645558433557806),
  ('king sized bed', 0.0457977829050805),
  ('living room area', 0.04552633093461472),
  ('tea coffee making', 0.04529051283491324),
  ('took long time', 0.045056126303502864),
  ('great water pressure', 0.045022913572500996),
  ('amazing hotel staff', 0.04471965367316064),
  ('hotel hotel great', 0.044525630031964275),
  ('breakfast served room', 0.04449066713203643),
  ('hotel fantastic staff', 0.04436209302184452)],
 0: [('rooms sea view', 0.13474136543641815),
  ('booked sea view', 0.12641745021368517),
  ('sea view rooms', 0.12120865193793133),
  ('great view sea', 0.12017267814153422),
  ('extra sea view', 0.11907217895224052),
  ('sea view suite', 0.11691823503585663),
  ('upgraded sea view', 0.11050283368131454),
  ('pool area small', 0.1097640796471487),
  ('deluxe sea view', 0.10961790044239964),
  ('sea facing room', 0.10951652005734133)],
 1: [('place stay amsterdam', 0.1597185981041897),
  ('red light district', 0.12596283903

### **Sustainable**

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score

# x = SUST_HOTEL_DATA["REVIEW_TEXT"].values.tolist()
# y = SUST_HOTEL_DATA["HOTEL_SUSTAINABLE"].values.tolist()

# SUST_MODEL.get_topics()

## Visualization

### **Non Sustainable**

#### Topics

In [10]:
NONSUST_MODEL.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,28208,-1_iron ironing board_service staff friendly_g...
1,0,1631,0_rooms sea view_sea view suite_booked sea vie...
2,1,1406,1_use spa facilities_friendly helpful spa_saun...
3,2,1344,2_red light district_walk central station_cent...
4,3,1237,3_best hotel prague_prague definitely stay_pra...
...,...,...,...
395,394,16,394_room balcony overlooked_helpful walking di...
396,395,16,395_hotel thoroughly enjoyed_short walk metro_...
397,396,16,396_booked hotel nights_gluten free options_ol...
398,397,15,397_perfect location ideal_friendly location p...


In [11]:
NONSUST_MODEL.get_topics()

{-1: [('iron ironing board', 0.05248640736859577),
  ('service staff friendly', 0.05194562449436243),
  ('great location room', 0.050837107691256364),
  ('hotel breakfast good', 0.050252947657354434),
  ('good size bathroom', 0.05023800798480374),
  ('squeezed orange juice', 0.0501422494485811),
  ('coffee machine room', 0.04941852793010377),
  ('tea making facilities', 0.0492799463646053),
  ('bed super comfortable', 0.049273851954411065),
  ('hotel stayed night', 0.049213232313485775)],
 0: [('rooms sea view', 0.15004037471855736),
  ('sea view suite', 0.13620125507850578),
  ('booked sea view', 0.1359636476158772),
  ('sea view rooms', 0.12875973410378053),
  ('great view sea', 0.1254413548548328),
  ('deluxe sea view', 0.12382288613353523),
  ('plenty sun beds', 0.12323777910650115),
  ('pool area small', 0.11872892391420774),
  ('extra sea view', 0.114396554571163),
  ('pool area lovely', 0.11332313873650955)],
 1: [('use spa facilities', 0.14750410973584388),
  ('friendly helpful

#### Clusters

In [12]:
NONSUST_MODEL.visualize_topics()

#### Sentiments

In [17]:
CLASSES = NONSUST_HOTEL_DATA["HOTEL_NAME"].values.tolist()

NONSUST_TOPICS_PER_CLASS = NONSUST_MODEL.topics_per_class(NONSUST_COMMENTS, classes=CLASSES)
NONSUST_MODEL.visualize_topics_per_class(NONSUST_TOPICS_PER_CLASS, top_n_topics=30)

ValueError: All arrays must be of the same length

### **Sustainable**

#### Topics

In [None]:
SUST_MODEL.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,6689,-1_friendly helpful room_ordered room service_...
1,0,626,0_wonderful place stay_walking distance major_...
2,1,466,1_definitely stay time_hotel located heart_hot...
3,2,240,2_central train station_located city center_ho...
4,3,239,3_stayed just night_outdoor swimming pool_say ...
...,...,...,...
135,134,16,134_returned night stay_just returned night_ni...
136,135,16,135_hotel located just_hotel great hotel_rooms...
137,136,16,136_room service great_staff kind helpful_grea...
138,137,16,137_stayed nights end_good room service_room s...


In [None]:
SUST_MODEL.get_topics()

{-1: [('friendly helpful room', 0.09017874993372885),
  ('ordered room service', 0.08932188081893874),
  ('hotel great service', 0.08725135967299183),
  ('tea coffee making', 0.08664789867327345),
  ('room clean spacious', 0.08599876826464038),
  ('location friendly staff', 0.08503827710210486),
  ('friendly helpful breakfast', 0.08470818562229147),
  ('best hotels ive', 0.08457419110938696),
  ('great hotel great', 0.08355296879168095),
  ('coffee making facilities', 0.08341542092598675)],
 0: [('wonderful place stay', 0.14983445399965067),
  ('walking distance major', 0.14539802852171993),
  ('friendly helpful hotel', 0.14061843161271131),
  ('making feel welcome', 0.14041222900760642),
  ('really friendly helpful', 0.13732998222207826),
  ('stay wonderful hotel', 0.13732998222207826),
  ('best thing hotel', 0.13732998222207826),
  ('perfect walking distance', 0.13683318428313573),
  ('just spent nights', 0.13683318428313573),
  ('night stay hotel', 0.13386184492261702)],
 1: [('defi

#### Clusters

In [None]:
SUST_MODEL.visualize_topics()

#### Sentiments

In [None]:
CLASSES = SUST_HOTEL_DATA["HOTEL_NAME"].values.tolist() # Change to sentiments

SUST_TOPICS_PER_CLASS = SUST_MODEL.topics_per_class(SUST_COMMENTS, classes=CLASSES)
SUST_MODEL.visualize_topics_per_class(SUST_TOPICS_PER_CLASS, top_n_topics=30)