In [1]:
import pandas as pd
import numpy as np

# Load Files

In [6]:
# TODO: modify these list if needed (eg. if you want to load only 1 csv from star3, delete other csvs in star3 list)
star3 = ['cleaned_ibis-sg-bencoolen.csv','cleaned_hotel-boss.csv','cleaned_hotel-G.csv',
           'cleaned_village-hotel-albert-court-by-far-east-hospitality.csv',
           'cleaned_holiday-inn-express-clarke-quay.csv']
star4 = ['cleaned_village-hotel-changi-by-far-east-hospitality.csv',
         'cleaned_park-regis.csv', 'cleaned_grand-mercure-sg-roxy.csv',
         'cleaned_paradox-sg-merchant-court.csv','cleaned_crowne-plaza.csv']
star5 = ['cleaned_fullerton.csv', 'cleaned_parkroyal-collection-marina-bay.csv', 'cleaned_pan-pacific.csv',
          'cleaned_mbs_total.csv', 'cleaned_swissotel-the-stamford.csv']

RAW_FOLDER = "../../../data/processed/"

def combine_csv_to_dataframe(file_names, all_star = False, filterDate = True):
    """
    Combine multiple CSV files into a single DataFrame.

    Parameters:
    file_names (list): List of CSV file names. 
    all_star (bool): whether or not to load all the hotels (False if only want to load 1 type of hotel star). 
    filterData (bool): whether or not to remove all data dated before 2015

    Returns:
    pd.DataFrame: Combined DataFrame.
    """
    combined_df = pd.DataFrame()

    for file_name in file_names:
        file_interim_path = RAW_FOLDER + file_name
        file_path = file_interim_path
        try:
            df = pd.read_csv(file_path)
            if all_star:
                if file_name in star3:
                    df["star"] = 3
                elif file_name in star4:
                    df["star"] = 4
                else:
                    df["star"] = 5
            #print(f"Length of {file_name} is {len(df)}")
            combined_df = pd.concat([combined_df, df], ignore_index=True)
            #print(len(combined_df))
        except FileNotFoundError:
            print(f"File not found: {file_name}")
        except pd.errors.EmptyDataError:
            print(f"Empty or invalid CSV file: {file_name}")
            
    combined_df = combined_df[combined_df.year > 2000]
                    
    return combined_df

In [7]:
data = combine_csv_to_dataframe(star3+star4+star5, all_star = True, filterDate = True)
#data[['traveller_username','date','travel_type','traveller_total_contributions','traveller_total_helpful_contributions','review_title','review_text','rating']].head(5)
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68292 entries, 0 to 68291
Data columns (total 19 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Unnamed: 0                             68292 non-null  int64  
 1   traveller_username                     68292 non-null  object 
 2   review_title                           68253 non-null  object 
 3   review_text                            68292 non-null  object 
 4   travel_type                            31354 non-null  object 
 5   traveller_country_origin               51724 non-null  object 
 6   traveller_total_contributions          68103 non-null  object 
 7   traveller_total_helpful_contributions  54090 non-null  float64
 8   rating                                 54837 non-null  float64
 9   valid_rating                           68292 non-null  bool   
 10  label                                  54837 non-null  object 
 11  cl

Unnamed: 0.1,Unnamed: 0,traveller_username,review_title,review_text,travel_type,traveller_country_origin,traveller_total_contributions,traveller_total_helpful_contributions,rating,valid_rating,label,cleaned_review,combined_review,date,covid,year,stem_review,lem_review,star
0,0,Love_Life_Sydney,Clean and comfortable,Hotel rooms in Singapore are so expensive so t...,Trip type: Travelled as a couple,"Sydney, Australia",2302.0,871.0,4.0,True,Positive,clean comfortable hotel rooms singapore expens...,Clean and comfortable Hotel rooms in Singapore...,2023-08-01,PostCovid,2023,clean comfort hotel room singapor expens find ...,clean comfortable hotel room singapore expensi...,3
1,1,Bilal S,"Good hotel, great location",This is a great place! Location is great but t...,Trip type: Travelled with family,"Houston, Texas",4.0,,5.0,True,Positive,good hotel great location great place location...,"Good hotel, great location This is a great pl...",2023-08-01,PostCovid,2023,good hotel great locat great place locat great...,good hotel great location great place location...,3
2,2,Anthony Fernando,Good place for a decent price.,Good place good price Easy access to the city...,Trip type: Travelled with friends,"Dubai, United Arab Emirates",39.0,38.0,5.0,True,Positive,good place decent price good place good price ...,Good place for a decent price. Good place good...,2022-10-01,PostCovid,2022,good place decent price good place good price ...,good place decent price good place good price ...,3
3,3,Mjkc204,Great Location and great staff.,The IBIS was a neat and tidy hotel in line wit...,Trip type: Travelled solo,"Ellenbrook, Australia",37.0,19.0,5.0,True,Positive,great location great staff ibis neat tidy hote...,Great Location and great staff. The IBIS was a...,2023-08-01,PostCovid,2023,great locat great staff ibi neat tidi hotel li...,great location great staff ibis neat tidy hote...,3
4,4,Aung Nanda,Good for budget stay.,I stayed there for 7 days. It was a nice locat...,Trip type: Travelled on business,"Dubai, United Arab Emirates",3.0,4.0,4.0,True,Positive,good budget stay stayed days nice location sev...,Good for budget stay. I stayed there for 7 day...,2022-08-01,PostCovid,2022,good budget stay stay day nice locat seven ele...,good budget stay stay day nice location seven ...,3


In [8]:
data.year.value_counts()

year
2017    12243
2016    11747
2018    10700
2019    10537
2015    10111
2022     4126
2023     3719
2021     2675
2020     2434
Name: count, dtype: int64

In [9]:
data.star.value_counts()

star
5    35622
4    18600
3    14070
Name: count, dtype: int64

# Pre-Covid

In [6]:
precovid = data[data.covid == "PreCovid"]
precovid_3star = precovid[precovid.star==3]
precovid_4star = precovid[precovid.star==4]
precovid_5star = precovid[precovid.star==5]

# Post-Covid

In [8]:
postcovid = data[data.covid == "PostCovid"]
postcovid_3star = postcovid[postcovid.star==3]
postcovid_4star = postcovid[postcovid.star==4]
postcovid_5star = postcovid[postcovid.star==5]

## Just Star

In [13]:
three_star = data[data.star == 3]
four_star = data[data.star == 4]
five_star = data[data.star == 5]

# pyABSA

In [10]:
from pyabsa import (
    ATEPCCheckpointManager,
    AspectTermExtraction as ATEPC,
    DeviceTypeOption,
    available_checkpoints,
)
from pyabsa import TaskCodeOption

import re 
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings("ignore")

No CUDA GPU found in your device
[2023-11-12 21:40:22] (2.3.4) [31mPyABSA(2.3.4): If your code crashes on Colab, please use the GPU runtime. Then run "pip install pyabsa[dev] -U" and restart the kernel.
Or if it does not work, you can use v1.x versions, e.g., pip install pyabsa<2.0 -U




Try to downgrade transformers<=4.29.0.



[0m


  _warn(f"unclosed running multiprocessing pool {self!r}",


## Define Checkpoint

In [11]:
checkpoint_map = available_checkpoints(
    TaskCodeOption.Aspect_Polarity_Classification, show_ckpts=True
)

## Define aspect extractor

In [12]:
aspect_extractor = ATEPC.AspectExtractor('english', auto_device=DeviceTypeOption.AUTO)

[2023-11-12 21:40:39] (2.3.4) [32mDownloading checkpoint:english [0m
[2023-11-12 21:40:39] (2.3.4) [31mNotice: The pretrained model are used for testing, it is recommended to train the model on your own custom datasets[0m


Downloading checkpoint: 579MB [02:42,  3.56MB/s]                               

Find zipped checkpoint: ./checkpoints\ATEPC_ENGLISH_CHECKPOINT\fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43.zip, unzipping





Done.
[2023-11-12 21:43:27] (2.3.4) [33mIf the auto-downloading failed, please download it via browser: https://huggingface.co/spaces/yangheng/PyABSA/resolve/main/checkpoints/English/ATEPC/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43.zip [0m
[2023-11-12 21:43:27] (2.3.4) Load aspect extractor from checkpoints\ATEPC_ENGLISH_CHECKPOINT\fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43
[2023-11-12 21:43:27] (2.3.4) config: checkpoints\ATEPC_ENGLISH_CHECKPOINT\fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43\fast_lcf_atepc.config
[2023-11-12 21:43:27] (2.3.4) state_dict: checkpoints\ATEPC_ENGLISH_CHECKPOINT\fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43\fast_lcf_atepc.state_dict
[2023-11-12 21:43:27] (2.3.4) model: None
[2023-11-12 21:43:27] (2.3.4) tokenizer: checkpoints\ATEPC_ENGLISH_CHECKPOINT\fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43\fast_lcf_atepc.tokenizer
[2023-11-12 21:43:27] (2.3.4) Set Model 

# Batch Extraction

## 3-star

In [15]:
atepc_result = aspect_extractor.batch_predict(
    list(three_star["stem_review"]),  #
    save_result=True,
    print_result=True,  # print the result
    pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
    eval_batch_size=128,
)

preparing ate inference dataloader: 100%|█| 14070/14070 [01:07<00:00, 208.22it/
extracting aspect terms:   1%|▏             | 1/110 [01:51<3:23:17, 111.90s/it]


KeyboardInterrupt: 

## 4-star

In [None]:
atepc_result = aspect_extractor.batch_predict(
    list(four_star["stem_review"]),  #
    save_result=True,
    print_result=True,  # print the result
    pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
    eval_batch_size=32,
)

## 5-star

In [None]:
atepc_result = aspect_extractor.batch_predict(
    list(five_star["stem_review"]),  #
    save_result=True,
    print_result=True,  # print the result
    pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
    eval_batch_size=32,
)

## Pre-Covid 3 Star

In [17]:
atepc_result = aspect_extractor.batch_predict(
    list(precovid_3star["lem_review"]),  #
    save_result=True,
    print_result=True,  # print the result
    pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
    eval_batch_size=32,
)

preparing ate inference dataloader: 100%|██████████| 12181/12181 [00:18<00:00, 654.85it/s]
extracting aspect terms: 100%|██████████| 381/381 [37:28<00:00,  5.90s/it]
preparing apc inference dataloader: 100%|██████████| 39690/39690 [00:58<00:00, 682.18it/s]
classifying aspect sentiments: 100%|██████████| 1241/1241 [1:39:47<00:00,  4.83s/it]


[2023-11-09 04:46:35] (2.3.4) The results of aspect term extraction have been saved in /Users/ammarbagharib/git/sentiment_analysis_bt4222/codes/aspect_modelling/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [20]:
atepc_result[1]

{'sentence': 'smart clean good location value money second stay ibis bencoolen accor gold member allow early checkin late checkout pm welcome drink breakfast really nice great variety recommend business budget traveller',
 'IOB': ['O',
  'O',
  'O',
  'B-ASP',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ASP',
  'I-ASP',
  'I-ASP',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 'tokens': ['smart',
  'clean',
  'good',
  'location',
  'value',
  'money',
  'second',
  'stay',
  'ibis',
  'bencoolen',
  'accor',
  'gold',
  'member',
  'allow',
  'early',
  'checkin',
  'late',
  'checkout',
  'pm',
  'welcome',
  'drink',
  'breakfast',
  'really',
  'nice',
  'great',
  'variety',
  'recommend',
  'business',
  'budget',
  'traveller'],
 'aspect': ['location', 'welcome drink breakfast'],
 'position': [[3], [19, 20, 21]],
 'sentiment': ['Positive', 'Positive'],
 'probs': [[0.0002267445088364184, 0.00642125774174

## Pre-Covid 4 Star

In [35]:
atepc_result2 = aspect_extractor.batch_predict(
    list(precovid_4star["lem_review"]),  #
    save_result=True,
    print_result=False,  # print the result
    pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
    eval_batch_size=32,
)

preparing ate inference dataloader: 100%|██████████| 16583/16583 [00:14<00:00, 1155.44it/s]
extracting aspect terms: 100%|██████████| 519/519 [38:39<00:00,  4.47s/it]
preparing apc inference dataloader: 100%|██████████| 51340/51340 [01:18<00:00, 657.07it/s]
classifying aspect sentiments: 100%|██████████| 1605/1605 [2:09:58<00:00,  4.86s/it] 


[2023-11-09 07:54:44] (2.3.4) The results of aspect term extraction have been saved in /Users/ammarbagharib/git/sentiment_analysis_bt4222/codes/aspect_modelling/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json


## Pre-Covid 5 Star

In [36]:
atepc_result3 = aspect_extractor.batch_predict(
    list(precovid_5star["lem_review"]),  #
    save_result=True,
    print_result=False,  # print the result
    pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
    eval_batch_size=32,
)

preparing ate inference dataloader: 100%|██████████| 27256/27256 [00:47<00:00, 567.89it/s]
extracting aspect terms: 100%|██████████| 852/852 [1:25:08<00:00,  6.00s/it]
preparing apc inference dataloader: 100%|██████████| 89944/89944 [04:07<00:00, 363.19it/s]
classifying aspect sentiments: 100%|██████████| 2811/2811 [4:36:08<00:00,  5.89s/it]  


[2023-11-09 18:40:08] (2.3.4) The results of aspect term extraction have been saved in /Users/ammarbagharib/git/sentiment_analysis_bt4222/codes/aspect_modelling/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json


## Post-Covid 3 Star

In [39]:
atepc_result4 = aspect_extractor.batch_predict(
    list(precovid_3star["lem_review"]),  #
    save_result=True,
    print_result=False,  # print the result
    pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
    eval_batch_size=32,
)

preparing ate inference dataloader: 100%|██████████| 12181/12181 [00:10<00:00, 1179.19it/s]
extracting aspect terms: 100%|██████████| 381/381 [28:24<00:00,  4.47s/it]
preparing apc inference dataloader: 100%|██████████| 39690/39690 [00:59<00:00, 671.99it/s]
classifying aspect sentiments: 100%|██████████| 1241/1241 [1:36:25<00:00,  4.66s/it]


[2023-11-10 00:10:01] (2.3.4) The results of aspect term extraction have been saved in /Users/ammarbagharib/git/sentiment_analysis_bt4222/codes/aspect_modelling/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json


## Post-Covid 4 Star

In [38]:
atepc_result5 = aspect_extractor.batch_predict(
    list(postcovid_4star["lem_review"]),  #
    save_result=True,
    print_result=False,  # print the result
    pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
    eval_batch_size=32,
)

preparing ate inference dataloader: 100%|██████████| 1344/1344 [00:01<00:00, 1212.32it/s]
extracting aspect terms: 100%|██████████| 42/42 [02:57<00:00,  4.24s/it]
preparing apc inference dataloader: 100%|██████████| 4040/4040 [00:05<00:00, 770.28it/s]
classifying aspect sentiments: 100%|██████████| 127/127 [11:03<00:00,  5.23s/it]

[2023-11-09 21:57:35] (2.3.4) The results of aspect term extraction have been saved in /Users/ammarbagharib/git/sentiment_analysis_bt4222/codes/aspect_modelling/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json





## Post-Covid 5 Star

In [37]:
atepc_result5 = aspect_extractor.batch_predict(
    list(postcovid_5star["lem_review"]),  #
    save_result=True,
    print_result=False,  # print the result
    pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
    eval_batch_size=32,
)

preparing ate inference dataloader: 100%|██████████| 4387/4387 [00:04<00:00, 1059.45it/s]
extracting aspect terms: 100%|██████████| 138/138 [09:38<00:00,  4.19s/it]
preparing apc inference dataloader: 100%|██████████| 13313/13313 [00:19<00:00, 700.44it/s]
classifying aspect sentiments: 100%|██████████| 417/417 [35:13<00:00,  5.07s/it]


[2023-11-09 21:04:21] (2.3.4) The results of aspect term extraction have been saved in /Users/ammarbagharib/git/sentiment_analysis_bt4222/codes/aspect_modelling/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json
