# LDA
by Esteban Ariza Acosta

## Init

In [None]:
# Install the third-party packages used by this notebook.
%pip install pyldavis
# NOTE(review): "pyldavis.gensim" looks like a module path rather than a
# distribution name; the imports below use pyLDAvis.gensim_models, which is
# presumably provided by the pyldavis package itself — confirm whether this
# line is needed (or whether "gensim" was intended).
%pip install pyldavis.gensim
%pip install spacy
%pip install nltk

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

from pprint import pprint

import spacy

import pickle
import re
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import matplotlib.pyplot as plt
import pandas as pd

import nltk
from nltk.corpus import stopwords

import string

In [None]:
# Download the NLTK stop-word corpus read later through stopwords.words('english')
nltk.download('stopwords')

## First Iteration

In [None]:
# Load the cleaned reviews together with the hotel each review belongs to.
# `reviews` and `hotels` stay index-aligned: entry i of both comes from row i.
df = pd.read_csv("../data/exploratory_analysis/tripadvisor_hotels_clean.csv")
reviews = df["REVIEW_TEXT"].values.tolist()
hotels = df['HOTEL_NAME'].values.tolist()

# Eliminate punctuation marks. The translation table is built once instead of
# once per review.
punctuation_table = str.maketrans('', '', string.punctuation)
reviews = [r.translate(punctuation_table) for r in reviews]

# Lower case
reviews = [r.lower() for r in reviews]

# Split into words on any run of whitespace. str.split() without an argument
# also breaks on tabs and newlines and never yields empty strings, so words
# separated by a line break are no longer glued into one token and no extra
# pass is needed to drop blank tokens.
reviews = [r.split() for r in reviews]

In [None]:
# Eliminate common words (English stop words).
# A set gives constant-time membership tests; a list would be scanned from the
# start for every single token of every review.
sw = set(stopwords.words('english'))
reviews = [[word for word in review if word not in sw] for review in reviews]

In [None]:
# Build the gensim dictionary from the tokenised reviews
id2word = Dictionary(reviews)
# Term Document Frequency: one bag-of-words list of (token id, count) per review
corpus = [id2word.doc2bow(text) for text in reviews]
# Preview the first two documents
print(corpus[:2])

In [None]:
# Human-readable view of the first document: (token, frequency) pairs
[[(id2word[i], freq) for i, freq in doc] for doc in corpus[:1]]

In [None]:
# Number of topics for the first-iteration model
CLUSTERS = 6

# Settings of the LDA model, gathered in one place
lda_params = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=CLUSTERS,
    random_state=0,
    chunksize=100,
    alpha='auto',
    per_word_topics=True,
)

# Build LDA model
lda_model = LdaModel(**lda_params)

# To inspect the learned topics: pprint(lda_model.print_topics())

# Apply the trained model to every document of the corpus
doc_lda = lda_model[corpus]

In [None]:
# Build one row per review: [hotel name, P(topic 1), ..., P(topic CLUSTERS)].
#
# The first element of each doc_lda entry is a list of (topic_id, probability)
# pairs, and gensim leaves out topics whose probability falls below the
# model's minimum_probability. Appending the probabilities in list order would
# therefore shift values into the wrong columns (and produce rows of different
# lengths) whenever a topic is missing, so every probability is placed by its
# topic id and absent topics are recorded as 0.0.
class_data = []
for hotel, doc_topics in zip(hotels, doc_lda):
    topic_probs = dict(doc_topics[0])
    class_data.append(
        [hotel] + [topic_probs.get(topic_id, 0.0) for topic_id in range(CLUSTERS)]
    )

# Columns: "HOTEL" followed by the 1-based topic labels "1" .. str(CLUSTERS)
cdf_columns = ["HOTEL"] + [str(i + 1) for i in range(CLUSTERS)]
cdf = pd.DataFrame(data=class_data, columns=cdf_columns)

In [None]:
# Row-wise sum of the per-topic probabilities, stored in the "CLUSTER" column.
# The topic columns are derived from CLUSTERS instead of being hard-coded as
# "1".."6", so this cell keeps working when the number of topics changes.
# Columns are added left to right, like a chained "+" expression.
topic_columns = [str(i + 1) for i in range(CLUSTERS)]
probability_sum = cdf[topic_columns[0]]
for column in topic_columns[1:]:
    probability_sum = probability_sum + cdf[column]
cdf["CLUSTER"] = probability_sum

# Inspect the topic distribution of the third review
doc_lda[2][0]

In [None]:
# Average the per-review columns for each hotel and write the result to a CSV
# named after the number of clusters
gcdf = cdf.groupby('HOTEL').mean()
gcdf.to_csv(f"lda_prob_{CLUSTERS}.csv")

In [None]:
# Helper used to label each hotel with its dominant cluster
def cluster_map(row):
    """Return the 1-based position of the largest value in ``row[1:]``.

    The first element of ``row`` is skipped. When the maximum occurs more
    than once, the earliest occurrence wins.
    """
    scores = list(row[1:])
    return scores.index(max(scores)) + 1


# Load the per-hotel topic probabilities.
# NOTE(review): the earlier cell writes f"lda_prob_{CLUSTERS}.csv" to the
# working directory, while this reads "lda/lda_prob_6.csv" — confirm the
# intended path.
gcdf = pd.read_csv("lda/lda_prob_6.csv")

# cluster_map skips the first entry of a row and takes the arg-max of the
# rest, so it must only see the label column followed by the topic columns
# ("1", "2", ...). A CSV produced by the earlier cells also carries a
# "CLUSTER" column holding the summed probabilities; left in, that column
# would always win the max and every hotel would get the same label.
label_and_topics = list(gcdf.columns[:1]) + [
    column for column in gcdf.columns[1:] if str(column).isdigit()
]
gcdf["CLUSTER"] = gcdf[label_and_topics].apply(cluster_map, axis=1)

gcdf.sort_values(by="CLUSTER")

In [None]:
# Compute Coherence Score ('c_v' measure) of the trained model against the
# tokenised reviews and the dictionary it was built from
coherence_model_lda = CoherenceModel(model=lda_model, texts=reviews, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Creating Topic Distance Visualization
pyLDAvis.enable_notebook()
# Prepare the visualization data for the first-iteration model
prepare_data = gensimvis.prepare(lda_model, corpus, id2word, mds='mmds') #R=60
# html = pyLDAvis.prepared_data_to_html(prepare_data)
# Show the prepared visualization as the cell output
pyLDAvis.display(prepare_data)

In [None]:
# Write the visualization to an HTML file.
# `html` is not assigned anywhere before this cell (the line that would create
# it is commented out in the previous cell), so render it here from the
# prepared data.
html = pyLDAvis.prepared_data_to_html(prepare_data)

# The context manager closes the file even if the write fails; UTF-8 keeps
# non-ASCII review tokens writable regardless of the platform default.
with open(f"lda_{CLUSTERS}.html", "w", encoding="utf-8") as text_file:
    text_file.write(html)

### Examples

In [None]:
# Summarized reviews (the file name suggests one row per hotel and year)
sum_df = pd.read_csv(
    "../data/review_summarizer/summarized_reviews_by_year_and_hotel-small.csv"
)
# Cleaned source reviews used as the input side of the comparison
in_df = pd.read_csv(
    "../data/exploratory_analysis/tripadvisor_hotels_clean.csv"
)

In [None]:
# Separator placed between reviews that are merged into a single text
REVIEW_CONCATCHAR1 = "\n"


def fromDateToYear(value):
    """Return the year part of a ``yyyy-mm-dd`` date string (clean CSV format).

    Everything before the first ``-`` is returned; a value without any ``-``
    is returned unchanged.
    """
    year, _, _ = value.partition("-")
    return year

def concatReviewsByYearAndHotel(df):
    """Merge the reviews of each (hotel, year) pair into a single text.

    Works on a copy of ``df``. ``REVIEW_DATE`` is reduced to its integer
    year, the ``REVIEW_TEXT`` values sharing the same ``HOTEL_NAME`` and year
    are joined with ``REVIEW_CONCATCHAR1``, and duplicate rows are dropped.

    Returns a frame with the columns ``HOTEL_NAME``, ``REVIEW_DATE`` and
    ``REVIEW_TEXT``.
    """
    def _join_texts(texts):
        # Resolved at call time, so the current separator value is used
        return REVIEW_CONCATCHAR1.join(texts)

    result = df.copy()
    result["REVIEW_DATE"] = result["REVIEW_DATE"].map(fromDateToYear).astype(int)

    # Every row receives the concatenated text of its own (hotel, year) group
    relevant = result[['HOTEL_NAME', 'REVIEW_TEXT', 'REVIEW_DATE']]
    grouped_texts = relevant.groupby(["HOTEL_NAME", "REVIEW_DATE"])["REVIEW_TEXT"]
    result['REVIEW_TEXT'] = grouped_texts.transform(_join_texts)

    # Rows of one group are now identical in these columns; keep one of each
    merged = result[['HOTEL_NAME', 'REVIEW_DATE', 'REVIEW_TEXT']]
    return merged.drop_duplicates()

# Group by NAME and YEAR (rows containing missing values are dropped first)
iny_df = concatReviewsByYearAndHotel(in_df.dropna())

In [None]:
# Display the summaries table
sum_df

In [None]:
# Draw one random row from the summaries
sum_sample = sum_df.sample(1).iloc[0]

# Find the concatenated source reviews of the same hotel and year
same_year = iny_df["REVIEW_DATE"] == sum_sample["REVIEW_DATE"]
same_hotel = iny_df["HOTEL_NAME"] == sum_sample["HOTEL_NAME"]
sum_in_sample = iny_df[same_year & same_hotel].iloc[0]

# Print the input text followed by its summary
print("IN:", sum_in_sample["REVIEW_TEXT"], sep="\n")
print("OUT:", sum_sample["REVIEW_SUMMARY"], sep="\n")

## Second Iteration

In [None]:
# Import datasets [Disclaimer: adjust the paths to where the files actually are]
df_3s = pd.read_csv("../data/Iter2Lda/tripadvisor_hotels_3_clean.csv") # 3 Stars Hotels
df_5sy = pd.read_csv("../data/Iter2Lda/tripadvisor_hotels_sustainable_clean_5stars.csv") # 5 Stars Sustainable Hotels
df_5sn = pd.read_csv("../data/Iter2Lda/tripadvisor_hotels_nonsustainable_clean_5stars.csv") # 5 Stars Non-Sustainable Hotels

In [None]:
def cleaned_reviews(df):
    """Tokenise and clean the ``REVIEW_TEXT`` column of ``df``.

    Each review has punctuation and digits removed, is lower-cased, split into
    words on whitespace, and stripped of English stop words plus a list of
    uninformative hotel-domain words.

    Args:
        df: DataFrame with a ``REVIEW_TEXT`` column of strings.

    Returns:
        A list with one list of tokens per review, in the row order of ``df``.
    """
    # Built once instead of once per review
    punctuation_table = str.maketrans('', '', string.punctuation)

    # English stop words
    sw = stopwords.words('english')
    # Hotel words
    hw = ["hotel", "hotels", "here", "there", "also", "big", "close",
          "far", "small", "well", "good", "never", "ever", "bit", "next",
          "little", "many", "much", "minute", "minutes", "hours", "right",
          "with", "within", "lot", "lots", "around", "me", "us", "we",
          "front", "back", "stay", "went", "go", "got", "would", "should",
          "could", "follow", "arrive", "see", "check", "one", "two", "three",
          "four", "five", "six", "seven"]
    # A set makes every membership test constant-time; a combined list would
    # be scanned from the start for each token of each review.
    cw = set(sw).union(hw)

    reviews = []
    for text in df["REVIEW_TEXT"].values.tolist():
        # Eliminate punctuation marks
        text = text.translate(punctuation_table)
        # Remove numbers
        text = ''.join(ch for ch in text if not ch.isdigit())
        # Lower case, then split on any run of whitespace: str.split() without
        # an argument also breaks on tabs/newlines and never yields empty
        # strings, so words separated by a line break are not glued together.
        words = text.lower().split()
        # Eliminate common words
        reviews.append([word for word in words if word not in cw])

    return reviews

In [None]:
# Topic counts for which a model is trained
CLUSTERS = [3,4,6,8]


def create_lda_models(reviews):
    """Train one LDA model per entry of ``CLUSTERS`` on the tokenised reviews.

    Args:
        reviews: list of token lists, one per review.

    Returns:
        Tuple ``(lda_models, corpus, id2word)``: the trained models in the
        order of ``CLUSTERS``, the bag-of-words corpus, and the dictionary
        built from ``reviews``.
    """
    id2word = Dictionary(reviews)

    # Term Document Frequency
    corpus = list(map(id2word.doc2bow, reviews))

    # Build LDA models (~3m 30s according to the original author's note)
    lda_models = []
    for topic_count in CLUSTERS:
        model = LdaModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=topic_count,
            random_state=0,
            chunksize=100,
            alpha='auto',
            per_word_topics=True,
        )
        lda_models.append(model)

    return lda_models, corpus, id2word

In [None]:
pyLDAvis.enable_notebook()


def save_lda_visualization(lda_models, corpus, id2word, prefix:str = ''):
    """Render every LDA model with pyLDAvis and save each one as an HTML file.

    Files are written to ``../data/lda/lda_{prefix}_{num_topics}.html``, where
    ``num_topics`` is read from the model itself, so the file name stays
    correct even if the global ``CLUSTERS`` no longer matches ``lda_models``.

    Args:
        lda_models: trained LDA models to visualize.
        corpus: bag-of-words corpus passed to the pyLDAvis preparation step.
        id2word: dictionary passed to the pyLDAvis preparation step.
        prefix: tag inserted into the file name to tell datasets apart.
    """
    import os

    output_dir = "../data/lda"
    # Fail-safe: the preparation below is slow, so make sure the target
    # directory exists before spending time on it.
    os.makedirs(output_dir, exist_ok=True)

    # One model at a time: each file is saved as soon as it is ready and only
    # one prepared visualization is held in memory.
    for model in lda_models:
        prepared = gensimvis.prepare(model, corpus, id2word, mds='mmds')  # R=60

        # Topic distance visualization to html
        html = pyLDAvis.prepared_data_to_html(prepared)

        # The context manager closes the file even if the write fails; UTF-8
        # keeps non-ASCII tokens writable regardless of the platform default.
        out_path = f"{output_dir}/lda_{prefix}_{model.num_topics}.html"
        with open(out_path, "w", encoding="utf-8") as text_file:
            text_file.write(html)

Each dataset takes about 10 minutes to process.

### Dataset 1 (5 Stars Sustainable Hotels + 3 Stars Hotels)

In [None]:
# Concat dfs: 3-star hotels + 5-star sustainable hotels
df_d1 = pd.concat([df_3s, df_5sy])

# Clean reviews (tokenise and filter words)
reviews =  cleaned_reviews(df_d1)

# Create lda models (one per entry of CLUSTERS)
lda_models, corpus, id2word = create_lda_models(reviews)

# Save lda models as htmls, tagged with the 'd1' prefix
save_lda_visualization(lda_models, corpus, id2word, 'd1')

### Dataset 2 (5 Stars Non Sustainable Hotels + 3 Stars Hotels)

In [None]:
# Concat dfs: 3-star hotels + 5-star non-sustainable hotels
df_d2 = pd.concat([df_3s, df_5sn])

# Clean reviews (tokenise and filter words)
reviews =  cleaned_reviews(df_d2)

# Create lda models (one per entry of CLUSTERS)
lda_models, corpus, id2word = create_lda_models(reviews)

# Save lda models as htmls, tagged with the 'd2' prefix
save_lda_visualization(lda_models, corpus, id2word, 'd2')

### Dataset 3 (3 Stars Hotels)

In [None]:
# Clean reviews of the 3-star hotels only (tokenise and filter words)
reviews =  cleaned_reviews(df_3s)

# Create lda models (one per entry of CLUSTERS)
lda_models, corpus, id2word = create_lda_models(reviews)

# Save lda models as htmls, tagged with the 'd3' prefix
save_lda_visualization(lda_models, corpus, id2word, 'd3')