In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd 
import numpy as np
import re

##LDA stuff
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

##cleaning stuff
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim
from gensim.models import CoherenceModel

##plotting
from matplotlib import pyplot as plt
from matplotlib import colors as mcolors
import seaborn as sns

from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pyprojroot.here as here

In [3]:
# TODO: modify these list if needed (eg. if you want to load only 1 csv from star3, delete other csvs in star3 list)
star3 = ['cleaned_ibis-sg-bencoolen.csv','cleaned_hotel-boss.csv','cleaned_hotel-G.csv',
           'cleaned_village-hotel-albert-court-by-far-east-hospitality.csv',
           'cleaned_holiday-inn-express-clarke-quay.csv']
star4 = ['cleaned_village-hotel-changi-by-far-east-hospitality.csv',
         'cleaned_park-regis.csv', 'cleaned_grand-mercure-sg-roxy.csv',
         'cleaned_paradox-sg-merchant-court.csv','cleaned_crowne-plaza.csv']
star5 = ['cleaned_fullerton.csv', 'cleaned_parkroyal-collection-marina-bay.csv', 'cleaned_pan-pacific.csv',
          'cleaned_mbs_total.csv', 'cleaned_swissotel-the-stamford.csv']

RAW_FOLDER = "data/processed/"

def combine_csv_to_dataframe(file_names, all_star = False, filterDate = True):
    """
    Combine multiple CSV files into a single DataFrame.

    Parameters:
    file_names (list): List of CSV file names. 
    all_star (bool): whether or not to load all the hotels (False if only want to load 1 type of hotel star). 
    filterData (bool): whether or not to remove all data dated before 2015

    Returns:
    pd.DataFrame: Combined DataFrame.
    """
    combined_df = pd.DataFrame()

    for file_name in file_names:
        file_interim_path = RAW_FOLDER + file_name
        file_path = here(file_interim_path)
        try:
            df = pd.read_csv(file_path)
            if all_star:
                if file_name in star3:
                    df["star"] = 3
                elif file_name in star4:
                    df["star"] = 4
                else:
                    df["star"] = 5
            #print(f"Length of {file_name} is {len(df)}")
            combined_df = pd.concat([combined_df, df], ignore_index=True)
            #print(len(combined_df))
        except FileNotFoundError:
            print(f"File not found: {file_name}")
        except pd.errors.EmptyDataError:
            print(f"Empty or invalid CSV file: {file_name}")
            
    combined_df = combined_df[combined_df.year > 2000]
                    
    return combined_df

In [4]:
data = combine_csv_to_dataframe(star3+star4+star5, all_star = True, filterDate = True)
#data[['traveller_username','date','travel_type','traveller_total_contributions','traveller_total_helpful_contributions','review_title','review_text','rating']].head(5)
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68292 entries, 0 to 68291
Data columns (total 19 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Unnamed: 0                             68292 non-null  int64  
 1   traveller_username                     68292 non-null  object 
 2   review_title                           68253 non-null  object 
 3   review_text                            68292 non-null  object 
 4   travel_type                            31354 non-null  object 
 5   traveller_country_origin               51724 non-null  object 
 6   traveller_total_contributions          68103 non-null  object 
 7   traveller_total_helpful_contributions  54090 non-null  float64
 8   rating                                 54837 non-null  float64
 9   valid_rating                           68292 non-null  bool   
 10  label                                  54837 non-null  object 
 11  cl

Unnamed: 0.1,Unnamed: 0,traveller_username,review_title,review_text,travel_type,traveller_country_origin,traveller_total_contributions,traveller_total_helpful_contributions,rating,valid_rating,label,cleaned_review,combined_review,date,covid,year,stem_review,lem_review,star
0,0,Love_Life_Sydney,Clean and comfortable,Hotel rooms in Singapore are so expensive so t...,Trip type: Travelled as a couple,"Sydney, Australia",2302.0,871.0,4.0,True,Positive,clean comfortable hotel rooms singapore expens...,Clean and comfortable Hotel rooms in Singapore...,2023-08-01,PostCovid,2023,clean comfort hotel room singapor expens find ...,clean comfortable hotel room singapore expensi...,3
1,1,Bilal S,"Good hotel, great location",This is a great place! Location is great but t...,Trip type: Travelled with family,"Houston, Texas",4.0,,5.0,True,Positive,good hotel great location great place location...,"Good hotel, great location This is a great pl...",2023-08-01,PostCovid,2023,good hotel great locat great place locat great...,good hotel great location great place location...,3
2,2,Anthony Fernando,Good place for a decent price.,Good place good price Easy access to the city...,Trip type: Travelled with friends,"Dubai, United Arab Emirates",39.0,38.0,5.0,True,Positive,good place decent price good place good price ...,Good place for a decent price. Good place good...,2022-10-01,PostCovid,2022,good place decent price good place good price ...,good place decent price good place good price ...,3
3,3,Mjkc204,Great Location and great staff.,The IBIS was a neat and tidy hotel in line wit...,Trip type: Travelled solo,"Ellenbrook, Australia",37.0,19.0,5.0,True,Positive,great location great staff ibis neat tidy hote...,Great Location and great staff. The IBIS was a...,2023-08-01,PostCovid,2023,great locat great staff ibi neat tidi hotel li...,great location great staff ibis neat tidy hote...,3
4,4,Aung Nanda,Good for budget stay.,I stayed there for 7 days. It was a nice locat...,Trip type: Travelled on business,"Dubai, United Arab Emirates",3.0,4.0,4.0,True,Positive,good budget stay stayed days nice location sev...,Good for budget stay. I stayed there for 7 day...,2022-08-01,PostCovid,2022,good budget stay stay day nice locat seven ele...,good budget stay stay day nice location seven ...,3


## Stemmed

In [5]:
def preprocess(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [word.lower() for word in tokens]
    return tokens

def remove_non_english_words(text, valid_words):
    tokens = word_tokenize(text)
    ans = [w for w in tokens if w.lower() in valid_words]
    return ' '.join(ans)

def remove_stopwords(text):
    tokens = word_tokenize(text)
    ans = [w for w in tokens if w.lower() not in stop_words]
    return ' '.join(ans)

In [6]:
## filter for 5 star hotels and year >= 2015
df_filtered = data.query('star==5 & year>= 2015')
df_filtered.shape

(35622, 19)

In [7]:
##preprocess text
nltk.download('stopwords')
nltk.download('words')
# Get the list of valid English words
english_words = set(words.words())
# set stopwords
sw = stopwords.words('english')
sw.append('fullerton')
sw.append('parkroyal')
sw.append('marina_bay')
sw.append('marina')
sw.append('swissotel')
sw.append('stamford')
sw.append('pan_pacific')
stop_words = set(sw)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ammarbagharib/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/ammarbagharib/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [8]:
df_filtered['stem_review'] = df_filtered['stem_review'].apply(remove_stopwords)
df_filtered['stem_review'] = df_filtered['stem_review'].apply(remove_non_english_words, valid_words=english_words)
df_filtered['tokens'] = df_filtered['stem_review'].apply(preprocess)

In [9]:
df_precovid = df_filtered[df_filtered['covid'] == 'PreCovid']
df_postcovid = df_filtered[df_filtered['covid'] == 'PostCovid']

In [11]:
##Generate LDA dictionary and corpus
pre_dict = corpora.Dictionary(df_precovid['tokens'])
pre_corpus = [pre_dict.doc2bow(text) for text in df_precovid['tokens']]

##Generate LDA dictionary and corpus
post_dict = corpora.Dictionary(df_postcovid['tokens'])
post_corpus = [post_dict.doc2bow(text) for text in df_postcovid['tokens']]

In [14]:
lda = gensim.models.ldamodel.LdaModel

## Use 5 topics for pre covid corpus
pre_ldamodel = lda(pre_corpus, num_topics=5, id2word=pre_dict, passes=15)

## Use 5 topics for post covid corpus
post_ldamodel = lda(post_corpus, num_topics=5, id2word=pre_dict, passes=15)

In [15]:
# For Pre-COVID
topics_pre = pre_ldamodel.show_topics(formatted=False, num_words=20)
data_flat_pre = [word for word_list in df_precovid['tokens'] for word in word_list]

# For Post-COVID
topics_post = post_ldamodel.show_topics(formatted=False)
data_flat_post = [word for word_list in df_postcovid['tokens'] for word in word_list]

In [16]:
topics_pre

[(0,
  [('tea', 0.047321737),
   ('food', 0.046361443),
   ('hall', 0.03967458),
   ('afternoon', 0.03323972),
   ('swiss', 0.026675474),
   ('good', 0.021156957),
   ('buffet', 0.017647475),
   ('drink', 0.017447004),
   ('dinner', 0.014735601),
   ('free', 0.012463615),
   ('sky', 0.012372167),
   ('high', 0.01226281),
   ('select', 0.011947904),
   ('guess', 0.011268003),
   ('fresh', 0.010803579),
   ('enjoy', 0.010492958),
   ('fruit', 0.009401267),
   ('birthday', 0.009131017),
   ('music', 0.009060685),
   ('cake', 0.008373783)]),
 (1,
  [('room', 0.044304807),
   ('pool', 0.037158888),
   ('view', 0.034097027),
   ('hotel', 0.031437326),
   ('stay', 0.022299923),
   ('floor', 0.018123873),
   ('night', 0.01707556),
   ('one', 0.013525485),
   ('bay', 0.013002982),
   ('get', 0.010857099),
   ('th', 0.008548654),
   ('bed', 0.007822514),
   ('go', 0.00753094),
   ('like', 0.0073955324),
   ('worth', 0.007293336),
   ('bathroom', 0.007059812),
   ('garden', 0.0070390836),
   ('sa

# Dependency Parsing

In [17]:
import spacy

In [18]:
# Pass in LDA topics output to remove overlap words by choosing highest prob
def restruct_topics(topics): 
    word_prob_dict = {}
    for i in range(len(topics)):
        topic_num = topics[i][0]
        for word, prob in topics[i][1]:
            if word in word_prob_dict:
                word_prob_dict[word].append((topic_num, prob))
            else:
                ls = [(topic_num, prob)]
                word_prob_dict[word] = ls
    new_dict = {}
    for word in word_prob_dict:
        topic, highest_prob = max(word_prob_dict[word], key = lambda x: x[1])
        if topic in new_dict:
            new_dict[topic].append(word)
        else:
            ls = [word]
            new_dict[topic] = ls
    return new_dict

In [19]:
new_dict = restruct_topics(topics_pre)
for x in new_dict:
    print(f'length topic {x}: {len(new_dict[x])}')

length topic 0: 19
length topic 3: 14
length topic 1: 16
length topic 2: 13
length topic 4: 11


In [20]:
nlp = spacy.load("en_core_web_sm")

In [21]:
#takes in the dataframe and appends to each row (aspect, review) pair(s)
def dep_parse(data, new_dict):
    data['aspect_sentiment'] = np.empty((len(data), 0)).tolist()
    for i in range(len(data)):
        sentence = data['combined_review'][i]
        doc = nlp(sentence)
        aspect_sentiment = []
        for word in doc:
            cond = False
            for x in new_dict.values():
                if word.text in x:
                    cond = True
                    break
            if not cond:
                continue
            if word.pos_ == 'NOUN': 
                for j in word.lefts:
                    #print(j, word, j.dep_, j.pos_)
                    if j.dep_ == 'amod' and j.pos_ == 'ADJ':
                        tup = [word, j, list(new_dict.values()).index(x)]
                        #print(tup)
                        aspect_sentiment.append(tup)
                    for k in j.lefts:
                        if k.dep_ == 'advmod':
                            #print(word, j ,k)
                            tup = [word, k.text + ' ' + j.text, list(new_dict.values()).index(x)]
                            aspect_sentiment.append(tup)
                            if (word, j) in aspect_sentiment:
                                aspect_sentiment.remove([word, j, list(new_dict.values()).index(x)])
        data['aspect_sentiment'][i] = aspect_sentiment

In [22]:
df_filtered = df_filtered.reset_index()

In [23]:
dep_parse(df_filtered, new_dict)

In [24]:
df_filtered['aspect_sentiment']

0                                                       []
1            [[hotel, Best, 1], [breakfast, delicious, 1]]
2                                        [[stay, nice, 3]]
3                                       [[desk, front, 4]]
4                                        [[time, next, 4]]
                               ...                        
35617    [[stay, recent, 3], [stay, most recent, 3], [h...
35618    [[view, sensational, 2], [floor, 40th, 2], [vi...
35619    [[view, great, 2], [room, decent, 2], [room, l...
35620                               [[hotel, Ordinary, 1]]
35621    [[floor, 57th, 2], [day, first, 4], [room, sma...
Name: aspect_sentiment, Length: 35622, dtype: object

In [117]:
df_subset = df_filtered[["traveller_username", "date", "covid", "star", "rating", "aspect_sentiment"]]
df_subset.head(3)

Unnamed: 0,traveller_username,date,covid,star,rating,aspect_sentiment
0,Carolyn H,2023-09-01,PostCovid,5,5.0,[]
1,srquarry,2023-03-01,PostCovid,5,,"[[hotel, Best, 1], [breakfast, delicious, 1]]"
2,Maria del Mar M,2023-08-01,PostCovid,5,5.0,"[[stay, nice, 3]]"


# Textblob

In [25]:
from textblob import TextBlob

In [118]:
aspect_sentiment_column = df_subset['aspect_sentiment']

## Draft Function 3

In [119]:
def calculate_aspect_count(row):
    counts = {}
    
    for aspect_sentiment in row:
        if aspect_sentiment:  # Check if the list is not empty
            topic = aspect_sentiment[2]
            
            column_name = f'topic_{topic}'
            
            # Count occurrences of each topic
            count_column_name = f'count_{column_name}'
            counts[count_column_name] = counts.get(count_column_name, 0) + 1
            #print(counts[count_column_name])
    return pd.Series({**counts})

In [120]:
row1 = [["hotel", "Great", 1],
        ["hotel", "Excellent", 1],
        ["room", "nice", 2],
        ["room", 'very nice', 2],
        ["staff", "helpful", 2],
        ["staff", 'extremely helpful', 3]]

In [121]:
calculate_aspect_count(row1)

count_topic_1    2
count_topic_2    3
count_topic_3    1
dtype: int64

## Draft Function 2

In [122]:
# Create a function to calculate TextBlob score for each aspect-adjective pair
def calculate_textblob_score_and_count(row):
    scores = {}
    counts = {}
    
    for aspect_sentiment in row:
        if aspect_sentiment:  # Check if the list is not empty
            aspect = aspect_sentiment[0]
            adjective = aspect_sentiment[1]
            topic = aspect_sentiment[2]
            
            # Concatenate aspect and adjective
            text_to_analyze = f"{aspect}, {adjective}"
            
            # Calculate TextBlob score
            score = TextBlob(text_to_analyze).sentiment.polarity
            
            
            column_name = f'topic_{topic}'
            
            # Assign the score to the corresponding topic column
            scores[column_name] = score
            
            # Count occurrences of each topic
            count_column_name = f'count_{column_name}'
            counts[count_column_name] = counts.get(count_column_name, 0) + 1
    
    return pd.Series({**scores, **counts})
    
    #return scores

In [123]:
aspect_sentiment_column[90]

[[hotel, Great, 1],
 [hotel, Excellent, 1],
 [room, nice, 2],
 [room, 'very nice', 2],
 [staff, helpful, 3],
 [staff, 'extremely helpful', 3]]

In [124]:
# test function on subset of data
aspect_sentiment_column[:10].apply(calculate_textblob_score_and_count)

Unnamed: 0,topic_1,count_topic_1,topic_3,count_topic_3,topic_4,count_topic_4,topic_2,count_topic_2,topic_0,count_topic_0
0,,,,,,,,,,
1,1.0,2.0,,,,,,,,
2,,,0.6,1.0,,,,,,
3,,,,,0.0,1.0,,,,
4,,,,,0.0,1.0,,,,
5,0.85,2.0,,,,,0.4,2.0,,
6,,,,,,,,,,
7,,,,,0.25,1.0,,,,
8,,,0.8,1.0,,,0.7,1.0,0.16,1.0
9,,,,,,,0.8,1.0,,


## Final Function

In [125]:
# Create a function to calculate mean TextBlob polarity for each topic
def calculate_mean_textblob_polarity(row):
    scores = {}
    counts = {}
    
    for aspect_sentiment in row:
        if aspect_sentiment:  # Check if the list is not empty
            aspect = aspect_sentiment[0]
            adjective = aspect_sentiment[1]
            topic = aspect_sentiment[2]
            
            # Concatenate aspect and adjective
            text_to_analyze = f"{aspect} {adjective}"
            
            # Calculate TextBlob polarity
            polarity = TextBlob(text_to_analyze).sentiment.polarity
            
            # Accumulate polarity values of each aspect within a topic
            column_name = f'topic_{topic}'
            scores[column_name] = scores.get(column_name, 0) + polarity # add polarity to existing polarity (if have)
            # if a topic occurence is more than once, scores[column_name] will be the summed polarity
            
            # Count occurrences of each aspect within a topic
            count_column_name = f'count_{column_name}'
            counts[count_column_name] = counts.get(count_column_name, 0) + 1
    
    # Calculate mean polarity for each topic
    for count_column_name, topic_count in counts.items():
        #print(counts.items())
        column_name = count_column_name.replace('count_', '')
        mean_polarity = scores[column_name] / counts[count_column_name]
        scores[column_name] = mean_polarity # replace summed polarity with mean polarity

    return pd.Series({**counts, **scores})

In [126]:
aspect_sentiment_column[:10].apply(calculate_mean_textblob_polarity)

Unnamed: 0,count_topic_1,topic_1,count_topic_3,topic_3,count_topic_4,topic_4,count_topic_2,topic_2,count_topic_0,topic_0
0,,,,,,,,,,
1,2.0,1.0,,,,,,,,
2,,,1.0,0.6,,,,,,
3,,,,,1.0,0.0,,,,
4,,,,,1.0,0.0,,,,
5,2.0,0.85,,,,,2.0,0.5,,
6,,,,,,,,,,
7,,,,,1.0,0.25,,,,
8,,,1.0,0.8,,,1.0,0.7,1.0,0.16
9,,,,,,,1.0,0.8,,


In [128]:
result_df = df_subset['aspect_sentiment'].apply(calculate_mean_textblob_polarity)

# Concatenate the result DataFrame with df_subset
df_subset_with_scores = pd.concat([df_subset, result_df], axis=1)
df_subset_with_scores

Unnamed: 0,traveller_username,date,covid,star,rating,aspect_sentiment,count_topic_1,topic_1,count_topic_3,topic_3,count_topic_4,topic_4,count_topic_2,topic_2,count_topic_0,topic_0
0,Carolyn H,2023-09-01,PostCovid,5,5.0,[],,,,,,,,,,
1,srquarry,2023-03-01,PostCovid,5,,"[[hotel, Best, 1], [breakfast, delicious, 1]]",2.0,1.00,,,,,,,,
2,Maria del Mar M,2023-08-01,PostCovid,5,5.0,"[[stay, nice, 3]]",,,1.0,0.600,,,,,,
3,MARIA DEL MAR M,2023-08-01,PostCovid,5,5.0,"[[desk, front, 4]]",,,,,1.0,0.00,,,,
4,Alanis K,2023-09-01,PostCovid,5,5.0,"[[time, next, 4]]",,,,,1.0,0.00,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35617,Bluecann,2015-01-01,PreCovid,5,2.0,"[[stay, recent, 3], [stay, most recent, 3], [h...",1.0,0.00,2.0,0.125,1.0,-0.30,,,,
35618,Rick J,2015-01-01,PreCovid,5,5.0,"[[view, sensational, 2], [floor, 40th, 2], [vi...",,,,,,,3.0,0.455556,,
35619,BabaYagaL,2015-01-01,PreCovid,5,4.0,"[[view, great, 2], [room, decent, 2], [room, l...",1.0,0.00,,,,,6.0,0.196825,,
35620,Howard J,2015-01-01,PreCovid,5,1.0,"[[hotel, Ordinary, 1]]",1.0,-0.25,,,,,,,,
