In [1]:
# Fixed seed value for the notebook. It is not referenced by any cell visible here;
# presumably later (stochastic) steps consume it — TODO confirm.
RANDOM_STATE = 1

# Imports

In [2]:
import pandas as pd
import time
import numpy as np
import pickle
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth',300)

from nltk.corpus import stopwords
import re
import nltk
from nltk import tokenize
from nltk.stem import WordNetLemmatizer

import math

import spacy
import os
import pyprojroot.here as here

# Import datasets from drive

Note: for brevity, the data loaded below has already been preprocessed and cleaned.

In [3]:
# TODO: modify these lists if needed (e.g. to load only one CSV from star3,
# delete the other entries from the star3 list).

# Cleaned review files, grouped by the hotel's star tier.
star3 = [
    'cleaned_ibis-sg-bencoolen.csv',
    'cleaned_hotel-boss.csv',
    'cleaned_hotel-G.csv',
    'cleaned_village-hotel-albert-court-by-far-east-hospitality.csv',
    'cleaned_holiday-inn-express-clarke-quay.csv',
]
star4 = [
    'cleaned_village-hotel-changi-by-far-east-hospitality.csv',
    'cleaned_park-regis.csv',
    'cleaned_grand-mercure-sg-roxy.csv',
    'cleaned_paradox-sg-merchant-court.csv',
    'cleaned_crowne-plaza.csv',
]
star5 = [
    'cleaned_fullerton.csv',
    'cleaned_parkroyal-collection-marina-bay.csv',
    'cleaned_pan-pacific.csv',
    'cleaned_mbs_total.csv',
    'cleaned_swissotel-the-stamford.csv',
]

# Folder (relative to the project root) that holds the files listed above.
RAW_FOLDER = "data/cleaned/"

def combine_csv_to_dataframe(file_names, all_star = False, filterDate = True):
    """
    Combine multiple CSV files into a single DataFrame.

    Each name in ``file_names`` is resolved as ``RAW_FOLDER + file_name`` through
    ``here`` and read with ``pd.read_csv``. Files that are missing or empty are
    reported with ``print`` and skipped; the remaining files are still loaded.

    Parameters:
    file_names (list): List of CSV file names, relative to RAW_FOLDER.
    all_star (bool): When True, add a "star" column to every loaded frame:
        3 if the file name is in ``star3``, 4 if it is in ``star4``, otherwise 5.
    filterDate (bool): Accepted for backward compatibility but currently unused;
        this function applies no date filtering.

    Returns:
    pd.DataFrame: Row-wise concatenation of every file that loaded, with a fresh
        0..n-1 index. An empty DataFrame when nothing could be loaded.
    """
    # Collect the frames and concatenate once at the end: concatenating inside the
    # loop re-copies all previously loaded rows on every iteration.
    frames = []

    for file_name in file_names:
        file_path = here(RAW_FOLDER + file_name)
        try:
            df = pd.read_csv(file_path)
        except FileNotFoundError:
            print(f"File not found: {file_name}")
            continue
        except pd.errors.EmptyDataError:
            print(f"Empty or invalid CSV file: {file_name}")
            continue

        if all_star:
            # Any file that is in neither star3 nor star4 is labelled as 5-star.
            if file_name in star3:
                df["star"] = 3
            elif file_name in star4:
                df["star"] = 4
            else:
                df["star"] = 5
        frames.append(df)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [4]:
# Load every hotel's reviews into one frame; all_star=True tags each row with
# the star tier of the file it came from.
all_hotel_files = star3 + star4 + star5
data = combine_csv_to_dataframe(all_hotel_files, all_star=True, filterDate=True)
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101761 entries, 0 to 101760
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   travel_type      46064 non-null   object 
 1   rating           84847 non-null   float64
 2   valid_rating     101761 non-null  bool   
 3   label            84847 non-null   object 
 4   combined_review  101761 non-null  object 
 5   date             101054 non-null  object 
 6   covid            101054 non-null  object 
 7   is_local         101761 non-null  int64  
 8   star             101761 non-null  int64  
dtypes: bool(1), float64(1), int64(2), object(5)
memory usage: 6.3+ MB


Unnamed: 0,travel_type,rating,valid_rating,label,combined_review,date,covid,is_local,star
0,couple,4.0,True,Positive,"Clean and comfortable Hotel rooms in Singapore are so expensive so to find a decent hotel, with easy transport access and food locations, at less than S$200 was a good deal. The compact room meets your basic needs, no fancy mood lightning or lovely artworks to grace the wall. It has clean sheets...",2023-08-01,PostCovid,0,3
1,family,5.0,True,Positive,"Good hotel, great location This is a great place! Location is great but the room is very small. There is no room service available. Overall it's ok and they were kind enough to hold my luggage after check out! It is a good hotel!!",2023-08-01,PostCovid,0,3
2,friends,5.0,True,Positive,Good place for a decent price. Good place good price Easy access to the city. All walking distance. Very close to the buggies junction. Food comers around you. Also walking distance to marina bay sands Cozy rooms. Easy checking and check out. Worth for the price. Quick getaway.,2022-10-01,PostCovid,0,3
3,solo,5.0,True,Positive,"Great Location and great staff. The IBIS was a neat and tidy hotel in line with the star rating. The staff on the front desk were super helpful and friendly. The hotel itself was in a great location from Orchard Road, Little India and had some local markets just behind it with the Bugis shopping...",2023-08-01,PostCovid,0,3
4,business,4.0,True,Positive,Good for budget stay. I stayed there for 7 days. It was a nice location. Seven eleven is next door. Easy access to the downtown and there was also a night market in the neighborhood. Room was nice. However I had to wait for a long time to check-in but overall experience was good.,2022-08-01,PostCovid,0,3


In [5]:
# Show the column labels of the combined frame.
data.columns

Index(['travel_type', 'rating', 'valid_rating', 'label', 'combined_review',
       'date', 'covid', 'is_local', 'star'],
      dtype='object')

# Clean Data

## General text preprocessing (lowercase, remove non-alphabetic characters, stop words, and non-English words)

In [6]:
# Shared resources for the text-cleaning helpers defined in later cells:
# NLTK's English stop-word list (as a set) and a spaCy pipeline used for lemmatisation.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# The 'parser' and 'ner' components are disabled when the spaCy model is loaded.
spacy_lemmatizer = spacy.load('en_core_web_sm', disable=['parser','ner'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ammarbagharib/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK corpora used by this notebook ('stopwords' is also requested in an earlier cell).
# NOTE(review): word_tokenize relies on NLTK's punkt tokenizer data, which no visible
# cell downloads — confirm it is installed on a fresh environment.
nltk.download('stopwords')
nltk.download('words')

def remove_non_english_words(text, valid_words):
    """Drop every token of ``text`` whose lowercase form is not in ``valid_words``.

    ``text`` is split with ``word_tokenize``; surviving tokens keep their
    original casing and order and are returned joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in valid_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Vocabulary of valid English words, taken from the NLTK `words` corpus and stored
# as a set for fast membership tests.
# NOTE(review): entries keep the corpus's original casing, while
# remove_non_english_words looks tokens up in lowercase — any entry that exists only
# in capitalised form can never match; confirm this is intended.
english_words = set(words.words())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ammarbagharib/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/ammarbagharib/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [8]:
def preprocess(text):
    """Normalise one review string for downstream stemming / lemmatisation.

    Steps, in order:
    1. lowercase the text;
    2. replace literal backslash-n sequences, newlines, tabs, carriage returns
       and stray backslashes with a space;
    3. replace every character that is not an ASCII letter with a space;
    4. collapse runs of whitespace;
    5. drop tokens found in ``stop_words``;
    6. drop tokens rejected by ``remove_non_english_words`` (using ``english_words``).

    Returns the remaining tokens joined by single spaces.
    """
    cleaned = text.lower()

    # Same substitutions, applied in the same order, as one chained expression would do.
    for unwanted in ('\\n', '\n', '\t', '\r', '\\'):
        cleaned = cleaned.replace(unwanted, ' ')

    # Punctuation and digits become spaces, then whitespace runs are squeezed.
    cleaned = re.sub('[^a-zA-Z]', ' ', cleaned)
    cleaned = re.sub(r'\s+',' ', cleaned)

    without_stop_words = ' '.join(
        [token for token in cleaned.split() if token not in stop_words]
    )

    return remove_non_english_words(without_stop_words, valid_words=english_words)

## Lemmatization and Stemming

In [9]:
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer

In [10]:
def lem_stem(text, lem = False, stem = None):
    """Clean ``text`` with ``preprocess`` and optionally lemmatise and/or stem it.

    Parameters:
    text (str): Raw review text.
    lem (bool): When truthy, replace each token by its spaCy lemma.
    stem: Falsy for no stemming; "porter" for the Porter stemmer; any other
        truthy value selects the English Snowball stemmer.

    Returns:
    str: The processed tokens joined by single spaces.
    """
    processed = preprocess(text)

    if lem:
        doc = spacy_lemmatizer(processed)
        processed = ' '.join(token.lemma_ for token in doc)

    # Nothing more to do unless a stemmer was requested.
    if not stem:
        return processed

    stemmer = PorterStemmer() if stem == "porter" else SnowballStemmer("english")
    return ' '.join(map(stemmer.stem, processed.split()))

## Test

In [11]:
# Columns that are not needed for the text-processing steps that follow.
remove_col = ['valid_rating']
# errors="ignore" keeps this cell re-runnable: once the column has been dropped,
# running the cell again would otherwise raise a KeyError.
data = data.drop(columns = remove_col, errors = "ignore")

In [12]:
# Porter-stemmed version of every review (no lemmatisation).
data["stem_review"] = data["combined_review"].apply(
    lambda review: lem_stem(review, lem = False, stem = "porter")
)
data.head(2)

Unnamed: 0,travel_type,rating,label,combined_review,date,covid,is_local,star,stem_review
0,couple,4.0,Positive,"Clean and comfortable Hotel rooms in Singapore are so expensive so to find a decent hotel, with easy transport access and food locations, at less than S$200 was a good deal. The compact room meets your basic needs, no fancy mood lightning or lovely artworks to grace the wall. It has clean sheets...",2023-08-01,PostCovid,0,3,clean comfort hotel expens find decent hotel easi transport access food less good deal compact room basic need fanci mood lightn love grace wall clean small bathroom pump pack kettl glass bottl small select tea small room safe closet space quit limit board use luggag rack complimentari suffici e...
1,family,5.0,Positive,"Good hotel, great location This is a great place! Location is great but the room is very small. There is no room service available. Overall it's ok and they were kind enough to hold my luggage after check out! It is a good hotel!!",2023-08-01,PostCovid,0,3,good hotel great locat great place locat great room small room servic avail overal kind enough hold luggag check good hotel


In [13]:
# Lemmatised version of every review (no stemming).
data["lem_review"] = data["combined_review"].apply(
    lambda review: lem_stem(review, lem = True, stem = None)
)
data.head(2)

Unnamed: 0,travel_type,rating,label,combined_review,date,covid,is_local,star,stem_review,lem_review
0,couple,4.0,Positive,"Clean and comfortable Hotel rooms in Singapore are so expensive so to find a decent hotel, with easy transport access and food locations, at less than S$200 was a good deal. The compact room meets your basic needs, no fancy mood lightning or lovely artworks to grace the wall. It has clean sheets...",2023-08-01,PostCovid,0,3,clean comfort hotel expens find decent hotel easi transport access food less good deal compact room basic need fanci mood lightn love grace wall clean small bathroom pump pack kettl glass bottl small select tea small room safe closet space quit limit board use luggag rack complimentari suffici e...,clean comfortable hotel expensive find decent hotel easy transport access food less good deal compact room basic need fancy mood lightne lovely grace wall clean small bathroom pump pack kettle glass bottle small selection tea small room safe closet space quite limited board use luggage rack comp...
1,family,5.0,Positive,"Good hotel, great location This is a great place! Location is great but the room is very small. There is no room service available. Overall it's ok and they were kind enough to hold my luggage after check out! It is a good hotel!!",2023-08-01,PostCovid,0,3,good hotel great locat great place locat great room small room servic avail overal kind enough hold luggag check good hotel,good hotel great location great place location great room small room service available overall kind enough hold luggage check good hotel


# All CSVs

In [19]:
# Process each hotel file on its own and write the result to data/processed/
# under the same file name.
remove_col = ['valid_rating']

for csv_name in star3+star4+star5:

    data = combine_csv_to_dataframe([csv_name], all_star=False, filterDate=True)

    # A missing or empty input is reported by the loader and comes back as an empty
    # frame; skip it instead of failing on the column operations below.
    if data.empty:
        print(f"Skipping {csv_name}: no rows loaded")
        continue

    # NOTE(review): combined_review is overwritten with its cleaned form, so lem_stem
    # (which calls preprocess itself) and cleaned_review re-clean already-cleaned
    # text. Kept as-is so the written files do not change — confirm whether the raw
    # review text should be preserved instead.
    data["combined_review"] = data["combined_review"].apply(preprocess)

    data["stem_review"] = data["combined_review"].apply(
        lambda review: lem_stem(review, lem = False, stem = "porter")
    )

    data["lem_review"] = data["combined_review"].apply(
        lambda review: lem_stem(review, lem = True, stem = None)
    )

    data["cleaned_review"] = data["combined_review"].apply(preprocess)

    data = data.drop(columns = remove_col)

    file_output_path = here('data/processed/' + csv_name)

    # Make sure the target folder exists so that a fresh checkout can run this cell.
    os.makedirs(os.path.dirname(file_output_path), exist_ok=True)

    data.to_csv(file_output_path, index=False)