# CLUSTERING
## TRIPADVISOR: HOTELS

*   Esteban Ariza
*   Johan Giraldo
*   Mateo Valdes

## Prerequisites

### Install Python libraries

In [17]:
# Uncomment on the first run to install the dependencies into the kernel's
# environment, then restart the kernel so the new packages are picked up.
# NOTE(review): bertopic is installed unpinned — consider pinning the version
# this notebook was developed with.
# NOTE(review): joblib is pinned to 1.1.0, presumably to stay compatible with
# the installed hdbscan release — confirm before changing the pin.
# %pip install bertopic
# %pip install joblib==1.1.0

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Import Python libraries

In [2]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

### Import data

In [3]:
# Load both TripAdvisor hotel scrapes and label every row with the
# sustainability class of the file it came from.
NONSUST_HOTEL_DATA_PATH = '../data/web_scraping/outputs/tripadvisor_hotels_nonsustainables_v2.csv'
SUST_HOTEL_DATA_PATH = '../data/web_scraping/outputs/tripadvisor_hotels_sustainables_v2.csv'

# .assign returns the frame with the label column attached, so each frame is
# built in a single expression instead of being mutated after loading.
NONSUST_HOTEL_DATA = pd.read_csv(NONSUST_HOTEL_DATA_PATH).assign(HOTEL_SUSTAINABLE="NO")
SUST_HOTEL_DATA = pd.read_csv(SUST_HOTEL_DATA_PATH).assign(HOTEL_SUSTAINABLE="YES")

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both inputs keep their own row labels starting at 0, so labels are duplicated
# and label-based lookups (e.g. HOTEL_DATA.loc[0]) return several rows.
HOTEL_DATA = pd.concat([NONSUST_HOTEL_DATA, SUST_HOTEL_DATA], ignore_index=True)

## Model

### Training

Load

In [17]:
# Uncomment to reuse the model written by the "Save" cell instead of retraining.
# loaded_model = BERTopic.load("BERTopic_model")

Model training

In [4]:
# Fit a BERTopic model on the review texts of all hotels.

# Seed for the stochastic UMAP step; change it to explore other solutions.
RANDOM_SEED = 42

COMMENTS = HOTEL_DATA["REVIEW_TEXT"].tolist()

# UMAP is stochastic: without a fixed random_state the reduced embeddings, and
# therefore the clusters and topics, differ on every run of the notebook.
# Fixing the seed makes "Restart & Run All" reproducible (UMAP then runs
# single-threaded, so fitting can be slower).
d_reduc_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=RANDOM_SEED,
)
clustering_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
# ngram_range=(3, 3): topic representations are built from trigrams only.
# min_df=10: trigrams with a document frequency below 10 are ignored.
vectorizer_model = CountVectorizer(ngram_range=(3, 3), stop_words="english", min_df=10)

topic_model = BERTopic(
    umap_model=d_reduc_model,
    hdbscan_model=clustering_model,
    ctfidf_model=ctfidf_model,
    vectorizer_model=vectorizer_model,
)

# topics holds one topic id per comment, in the same order as COMMENTS.
topics, probabilities = topic_model.fit_transform(COMMENTS)

Save

In [9]:
# Persist the fitted model under the name the "Load" cell reads it back from.
model_output_path = "BERTopic_model"
topic_model.save(model_output_path)

### Testing

### Visualization

Topics

In [5]:
# A notebook cell only renders its last expression, so the topic overview has
# to be displayed explicitly; otherwise its result is silently discarded.
# `display` is provided by the IPython kernel without an import.
display(topic_model.get_topic_info())

# Full mapping of topic id -> list of (term, c-TF-IDF score) pairs.
topic_model.get_topics()

{-1: [('great food good', 0.04645558433557806),
  ('king sized bed', 0.0457977829050805),
  ('living room area', 0.04552633093461472),
  ('tea coffee making', 0.04529051283491324),
  ('took long time', 0.045056126303502864),
  ('great water pressure', 0.045022913572500996),
  ('amazing hotel staff', 0.04471965367316064),
  ('hotel hotel great', 0.044525630031964275),
  ('breakfast served room', 0.04449066713203643),
  ('hotel fantastic staff', 0.04436209302184452)],
 0: [('rooms sea view', 0.13474136543641815),
  ('booked sea view', 0.12641745021368517),
  ('sea view rooms', 0.12120865193793133),
  ('great view sea', 0.12017267814153422),
  ('extra sea view', 0.11907217895224052),
  ('sea view suite', 0.11691823503585663),
  ('upgraded sea view', 0.11050283368131454),
  ('pool area small', 0.1097640796471487),
  ('deluxe sea view', 0.10961790044239964),
  ('sea facing room', 0.10951652005734133)],
 1: [('place stay amsterdam', 0.1597185981041897),
  ('red light district', 0.12596283903

Clusters

In [6]:
# Build the topic visualization and leave it as the cell's last expression so
# the notebook renders it.
intertopic_figure = topic_model.visualize_topics()
intertopic_figure

Sustainable vs. Non-Sustainable

In [13]:
# One "YES"/"NO" sustainability label per comment, in the same row order as
# COMMENTS (both are read from HOTEL_DATA).
CLASSES = HOTEL_DATA["HOTEL_SUSTAINABLE"].tolist()

# Topic representations computed separately for each sustainability class.
topics_per_class = topic_model.topics_per_class(docs=COMMENTS, classes=CLASSES)

# Show the 30 top topics; the figure is the cell's last expression so the
# notebook renders it.
class_comparison_figure = topic_model.visualize_topics_per_class(
    topics_per_class,
    top_n_topics=30,
)
class_comparison_figure