In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd 
import numpy as np
import re

##LDA stuff
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

##cleaning stuff
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim
from gensim.models import CoherenceModel

##plotting
from matplotlib import pyplot as plt
from matplotlib import colors as mcolors
import seaborn as sns

from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pyprojroot.here as here

In [3]:
# TODO: modify these list if needed (eg. if you want to load only 1 csv from star3, delete other csvs in star3 list)
star3 = ['cleaned_ibis-sg-bencoolen.csv','cleaned_hotel-boss.csv','cleaned_hotel-G.csv',
           'cleaned_village-hotel-albert-court-by-far-east-hospitality.csv',
           'cleaned_holiday-inn-express-clarke-quay.csv']
star4 = ['cleaned_village-hotel-changi-by-far-east-hospitality.csv',
         'cleaned_park-regis.csv', 'cleaned_grand-mercure-sg-roxy.csv',
         'cleaned_paradox-sg-merchant-court.csv','cleaned_crowne-plaza.csv']
star5 = ['cleaned_fullerton.csv', 'cleaned_parkroyal-collection-marina-bay.csv', 'cleaned_pan-pacific.csv',
          'cleaned_mbs_total.csv', 'cleaned_swissotel-the-stamford.csv']

RAW_FOLDER = "data/processed/"

def combine_csv_to_dataframe(file_names, all_star = False, filterDate = True):
    """
    Combine multiple CSV files into a single DataFrame.

    Parameters:
    file_names (list): List of CSV file names. 
    all_star (bool): whether or not to load all the hotels (False if only want to load 1 type of hotel star). 
    filterData (bool): whether or not to remove all data dated before 2015

    Returns:
    pd.DataFrame: Combined DataFrame.
    """
    combined_df = pd.DataFrame()

    for file_name in file_names:
        file_interim_path = RAW_FOLDER + file_name
        file_path = here(file_interim_path)
        try:
            df = pd.read_csv(file_path)
            if all_star:
                if file_name in star3:
                    df["star"] = 3
                elif file_name in star4:
                    df["star"] = 4
                else:
                    df["star"] = 5
            #print(f"Length of {file_name} is {len(df)}")
            combined_df = pd.concat([combined_df, df], ignore_index=True)
            #print(len(combined_df))
        except FileNotFoundError:
            print(f"File not found: {file_name}")
        except pd.errors.EmptyDataError:
            print(f"Empty or invalid CSV file: {file_name}")
            
    combined_df = combined_df[combined_df.year > 2000]
                    
    return combined_df

In [4]:
data = combine_csv_to_dataframe(star3+star4+star5, all_star = True, filterDate = True)
#data[['traveller_username','date','travel_type','traveller_total_contributions','traveller_total_helpful_contributions','review_title','review_text','rating']].head(5)
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68292 entries, 0 to 68291
Data columns (total 19 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Unnamed: 0                             68292 non-null  int64  
 1   traveller_username                     68292 non-null  object 
 2   review_title                           68253 non-null  object 
 3   review_text                            68292 non-null  object 
 4   travel_type                            31354 non-null  object 
 5   traveller_country_origin               51724 non-null  object 
 6   traveller_total_contributions          68103 non-null  object 
 7   traveller_total_helpful_contributions  54090 non-null  float64
 8   rating                                 54837 non-null  float64
 9   valid_rating                           68292 non-null  bool   
 10  label                                  54837 non-null  object 
 11  cl

Unnamed: 0.1,Unnamed: 0,traveller_username,review_title,review_text,travel_type,traveller_country_origin,traveller_total_contributions,traveller_total_helpful_contributions,rating,valid_rating,label,cleaned_review,combined_review,date,covid,year,stem_review,lem_review,star
0,0,Love_Life_Sydney,Clean and comfortable,Hotel rooms in Singapore are so expensive so t...,Trip type: Travelled as a couple,"Sydney, Australia",2302.0,871.0,4.0,True,Positive,clean comfortable hotel rooms singapore expens...,Clean and comfortable Hotel rooms in Singapore...,2023-08-01,PostCovid,2023,clean comfort hotel room singapor expens find ...,clean comfortable hotel room singapore expensi...,3
1,1,Bilal S,"Good hotel, great location",This is a great place! Location is great but t...,Trip type: Travelled with family,"Houston, Texas",4.0,,5.0,True,Positive,good hotel great location great place location...,"Good hotel, great location This is a great pl...",2023-08-01,PostCovid,2023,good hotel great locat great place locat great...,good hotel great location great place location...,3
2,2,Anthony Fernando,Good place for a decent price.,Good place good price Easy access to the city...,Trip type: Travelled with friends,"Dubai, United Arab Emirates",39.0,38.0,5.0,True,Positive,good place decent price good place good price ...,Good place for a decent price. Good place good...,2022-10-01,PostCovid,2022,good place decent price good place good price ...,good place decent price good place good price ...,3
3,3,Mjkc204,Great Location and great staff.,The IBIS was a neat and tidy hotel in line wit...,Trip type: Travelled solo,"Ellenbrook, Australia",37.0,19.0,5.0,True,Positive,great location great staff ibis neat tidy hote...,Great Location and great staff. The IBIS was a...,2023-08-01,PostCovid,2023,great locat great staff ibi neat tidi hotel li...,great location great staff ibis neat tidy hote...,3
4,4,Aung Nanda,Good for budget stay.,I stayed there for 7 days. It was a nice locat...,Trip type: Travelled on business,"Dubai, United Arab Emirates",3.0,4.0,4.0,True,Positive,good budget stay stayed days nice location sev...,Good for budget stay. I stayed there for 7 day...,2022-08-01,PostCovid,2022,good budget stay stay day nice locat seven ele...,good budget stay stay day nice location seven ...,3


## Data Clean

In [57]:
###### SPECIFY HOTEL STAR HERE
hotel_star = 3

In [58]:
df_filtered = data.query('star==@hotel_star & year>= 2015')
df_filtered.shape

(14070, 19)

In [59]:
sw3 = stopwords.words('english')
sw3.extend(['ibis', 'boss', 'village', 'ibis_bencoolen', 'bencoolen', 'albert', 'far_east', 'east_hospitality',
           'clarke', 'quay', 'express', 'clarke_quay', 'albert_court', 'court', 'clark', 'inn'])
stop_words3 = set(sw3)

sw4 = stopwords.words('english')
sw4.extend(['regis', 'park_regis', 'changi', 'far_east', 'east_hospitality', 'village', 'grand_mercure', 'mercure', 'roxy',
          'paradox', 'merchant', 'merchant_court', 'crowne', 'crowne_plaza', 'crown', 'plaza', 'clark', 'quay'])
stop_words4 = set(sw4)

sw5 = stopwords.words('english')
sw5.extend(['fullerton', 'parkroyal', 'collection', 'marina', 'marina_bay', 'pacific', 'pan', 'pan_pacific', 'mbs', 'bay_sands', 
          'swissotel', 'stamford', 'star'])
stop_words5 = set(sw5)

In [60]:
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ammarbagharib/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/ammarbagharib/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [61]:
def remove_non_english_words(text, valid_words):
    tokens = word_tokenize(text)
    ans = [w for w in tokens if w.lower() in valid_words]
    return ' '.join(ans)

In [62]:
# Assuming data['stem_review'] is your column of interest
df_filtered['processed_review'] = df_filtered['stem_review']

# Define the valid English words
english_words = set(words.words())

# Apply the remove_non_english_words function
df_filtered['processed_review'] = df_filtered['processed_review'].apply(remove_non_english_words, valid_words=english_words)

In [63]:
def remove_stopwords_based_on_rating(row):
    # Define stopwords based on star rating
    if row['star'] == 3:
        stop_words = stop_words3
    elif row['star'] == 4:
        stop_words = stop_words4
    elif row['star'] == 5:
        stop_words = stop_words5
    else:
        # No stopwords removal for other ratings
        return row['processed_review']

    # Tokenize and remove stopwords
    tokens = word_tokenize(row['stem_review'])
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

def preprocess(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [word.lower() for word in tokens]
    return tokens

In [64]:
# Apply the remove stopwords function to the 'stem_review' column
df_filtered['processed_review'] = df_filtered.apply(remove_stopwords_based_on_rating, axis=1)
    
# Apply the preprocess function
df_filtered['tokens'] = df_filtered['processed_review'].apply(preprocess)

In [65]:
df_filtered = df_filtered.reset_index()

In [66]:
##Generate LDA dictionary and corpus
all_dict = corpora.Dictionary(df_filtered['tokens'])
all_corpus = [all_dict.doc2bow(text) for text in df_filtered['tokens']]

In [67]:
lda = gensim.models.ldamodel.LdaModel

## Use 5 topics for pre covid corpus
ldamodel = lda(all_corpus, num_topics=5, id2word=all_dict, passes=15)

In [68]:
# For Pre-COVID
all_topics = ldamodel.show_topics(formatted=False, num_words=20)

In [69]:
all_topics

[(0,
  [('room', 0.03250082),
   ('hotel', 0.029336259),
   ('stay', 0.015991708),
   ('check', 0.013795247),
   ('staff', 0.012114402),
   ('us', 0.00899578),
   ('time', 0.00898883),
   ('night', 0.008717379),
   ('book', 0.008550989),
   ('one', 0.008189876),
   ('get', 0.007601211),
   ('day', 0.007516771),
   ('ask', 0.0064082285),
   ('bad', 0.0063361926),
   ('could', 0.006238851),
   ('servic', 0.0060406295),
   ('arriv', 0.0060321325),
   ('would', 0.005210614),
   ('much', 0.005010655),
   ('clean', 0.0049656755)]),
 (1,
  [('hotel', 0.03768469),
   ('room', 0.025321001),
   ('locat', 0.022836022),
   ('breakfast', 0.02193409),
   ('good', 0.021007776),
   ('great', 0.020855198),
   ('stay', 0.016249657),
   ('walk', 0.015639523),
   ('pool', 0.013419937),
   ('small', 0.012486675),
   ('singapor', 0.0118088275),
   ('area', 0.009880703),
   ('staff', 0.009846561),
   ('nice', 0.009553334),
   ('night', 0.009487598),
   ('clean', 0.009465916),
   ('well', 0.00936811),
   ('wo

# Dependency Parsing

In [70]:
import spacy

In [71]:
# Pass in LDA topics output to remove overlap words by choosing highest prob
def restruct_topics(topics): 
    word_prob_dict = {}
    for i in range(len(topics)):
        topic_num = topics[i][0]
        for word, prob in topics[i][1]:
            if word in word_prob_dict:
                word_prob_dict[word].append((topic_num, prob))
            else:
                ls = [(topic_num, prob)]
                word_prob_dict[word] = ls
    new_dict = {}
    for word in word_prob_dict:
        topic, highest_prob = max(word_prob_dict[word], key = lambda x: x[1])
        if topic in new_dict:
            new_dict[topic].append(word)
        else:
            ls = [word]
            new_dict[topic] = ls
    return new_dict

In [72]:
nlp = spacy.load("en_core_web_sm")

In [73]:
#takes in the dataframe and appends to each row (aspect, review) pair(s)
def dep_parse(data, new_dict):
    data['aspect_sentiment'] = np.empty((len(data), 0)).tolist()
    for i in range(len(data)):
        sentence = data['processed_review'][i]
        doc = nlp(sentence)
        aspect_sentiment = []
        for word in doc:
            cond = False
            for x in new_dict.values():
                if word.text in x:
                    cond = True
                    break
            if not cond:
                continue
            if word.pos_ == 'NOUN': 
                for j in word.lefts:
                    #print(j, word, j.dep_, j.pos_)
                    if j.dep_ == 'amod' and j.pos_ == 'ADJ':
                        tup = [word, j, list(new_dict.values()).index(x)]
                        #print(tup)
                        aspect_sentiment.append(tup)
                    for k in j.lefts:
                        if k.dep_ == 'advmod':
                            #print(word, j ,k)
                            tup = [word, k.text + ' ' + j.text, list(new_dict.values()).index(x)]
                            aspect_sentiment.append(tup)
                            if (word, j) in aspect_sentiment:
                                aspect_sentiment.remove([word, j, list(new_dict.values()).index(x)])
        data['aspect_sentiment'][i] = aspect_sentiment

In [74]:
dep_parse(df_filtered, restruct_topics(all_topics))

In [75]:
df_subset = df_filtered[["traveller_username", "date", "covid", "star", "rating", "aspect_sentiment"]]
df_subset.head(3)

Unnamed: 0,traveller_username,date,covid,star,rating,aspect_sentiment
0,Love_Life_Sydney,2023-08-01,PostCovid,3,4.0,"[[food, decent, 1], [room, good, 0], [room, le..."
1,Bilal S,2023-08-01,PostCovid,3,5.0,"[[hotel, good, 1], [place, great, 1], [place, ..."
2,Anthony Fernando,2022-10-01,PostCovid,3,5.0,"[[place, good, 1], [place, good, 1]]"


# Textblob

In [76]:
from textblob import TextBlob

In [77]:
aspect_sentiment_column = df_subset['aspect_sentiment']

## Mean Textblob Function

In [78]:
# Create a function to calculate mean TextBlob polarity for each topic
def calculate_mean_textblob_polarity(row):
    scores = {}
    counts = {}
    
    for aspect_sentiment in row:
        if aspect_sentiment:  # Check if the list is not empty
            aspect = aspect_sentiment[0]
            adjective = aspect_sentiment[1]
            topic = aspect_sentiment[2]
            
            # Concatenate aspect and adjective
            text_to_analyze = f"{aspect} {adjective}"
            
            # Calculate TextBlob polarity
            polarity = TextBlob(text_to_analyze).sentiment.polarity
            
            # Accumulate polarity values of each aspect within a topic
            column_name = f'topic_{topic}'
            scores[column_name] = scores.get(column_name, 0) + polarity # add polarity to existing polarity (if have)
            # if a topic occurence is more than once, scores[column_name] will be the summed polarity
            
            # Count occurrences of each aspect within a topic
            count_column_name = f'count_{column_name}'
            counts[count_column_name] = counts.get(count_column_name, 0) + 1
    
    # Calculate mean polarity for each topic
    for count_column_name, topic_count in counts.items():
        #print(counts.items())
        column_name = count_column_name.replace('count_', '')
        mean_polarity = scores[column_name] / counts[count_column_name]
        scores[column_name] = mean_polarity # replace summed polarity with mean polarity

    return pd.Series({**counts, **scores})

In [79]:
result_df = df_subset['aspect_sentiment'].apply(calculate_mean_textblob_polarity)

# Concatenate the result DataFrame with df_subset
df_subset_with_scores = pd.concat([df_subset, result_df], axis=1)
df_subset_with_scores

Unnamed: 0,traveller_username,date,covid,star,rating,aspect_sentiment,count_topic_1,count_topic_0,count_topic_3,topic_1,topic_0,topic_3,count_topic_2,topic_2,count_topic_4,topic_4
0,Love_Life_Sydney,2023-08-01,PostCovid,3,4.0,"[[food, decent, 1], [room, good, 0], [room, le...",1.0,7.0,1.0,0.166667,0.138095,0.0,,,,
1,Bilal S,2023-08-01,PostCovid,3,5.0,"[[hotel, good, 1], [place, great, 1], [place, ...",4.0,2.0,,0.750000,0.275000,,,,,
2,Anthony Fernando,2022-10-01,PostCovid,3,5.0,"[[place, good, 1], [place, good, 1]]",2.0,,,0.700000,,,,,,
3,Mjkc204,2023-08-01,PostCovid,3,5.0,"[[staff, great, 1], [door, next, 0]]",1.0,1.0,,0.800000,0.000000,,,,,
4,Aung Nanda,2022-08-01,PostCovid,3,4.0,"[[door, next, 0], [time, long, 2]]",,1.0,,,0.000000,,1.0,-0.050000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14065,furam,2015-01-01,PreCovid,3,4.0,"[[pool, close, 4], [pool, swim, 4]]",,,,,,,,,2.0,0.000000
14066,Ajmal A,2015-01-01,PreCovid,3,5.0,"[[night, great, 4], [night, singapor, 4], [nig...",1.0,1.0,,0.000000,0.800000,,1.0,0.285714,3.0,0.266667
14067,Fciona,2015-01-01,PreCovid,3,4.0,"[[hotel, good, 1], [room, larger, 0], [room, a...",1.0,2.0,,0.700000,0.000000,,,,,
14068,RoySensei,2015-01-01,PreCovid,3,3.0,"[[room, unaccept, 0]]",,1.0,,,0.000000,,,,,


In [80]:
csv_name = "dp_textblob_" + str(hotel_star) + "_star.csv"
df_subset_with_scores.to_csv(csv_name)

In [81]:
# rows in whcih aspect_sentiment column is an EMPTY list
empty_aspect_rows = df_subset_with_scores[df_subset_with_scores['aspect_sentiment'].apply(lambda x: not bool(x))]
empty_aspect_rows

Unnamed: 0,traveller_username,date,covid,star,rating,aspect_sentiment,count_topic_1,count_topic_0,count_topic_3,topic_1,topic_0,topic_3,count_topic_2,topic_2,count_topic_4,topic_4
5,David G,2023-06-01,PostCovid,3,5.0,[],,,,,,,,,,
14,michael o,2023-04-01,PostCovid,3,5.0,[],,,,,,,,,,
20,Diageo,2023-04-01,PostCovid,3,4.0,[],,,,,,,,,,
31,Colleen,2023-01-01,PostCovid,3,5.0,[],,,,,,,,,,
38,Sachin B,2022-11-01,PostCovid,3,1.0,[],,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13961,RamaSeutia,2015-02-01,PreCovid,3,4.0,[],,,,,,,,,,
13968,Vicki D,2015-03-01,PreCovid,3,5.0,[],,,,,,,,,,
13985,Andreas N,2015-03-01,PreCovid,3,3.0,[],,,,,,,,,,
13987,Neel_alok,2015-03-01,PreCovid,3,4.0,[],,,,,,,,,,
