In [1]:
import numpy as np
import pandas as pd
import spacy
from datetime import datetime

import en_nlp_utils

In [2]:
# Directory (relative to the notebook) that holds the input/output CSV files
SRC_PATH = "src/"

# Record the wall-clock start so the final cell can report the total run time
start_time = datetime.now()
print(f"Text processing started at {start_time}")

Text processing started at 2024-06-27 20:50:08.074383


In [3]:
# Column holding the cleaned review text to tag.
# Alternatives available in the dataset: "review_cleaned_v1", "review_cleaned_v2".
target_text = "review_cleaned_v3"

# Load the dataset, discard rows whose target text is missing, and renumber
# the remaining rows from 0 so later column-wise concatenation lines up.
df_review = (
    pd.read_csv(f"{SRC_PATH}en_hotel_review.csv")
    .dropna(subset=[target_text])
    .reset_index(drop=True)
)

In [4]:
# Load spaCy's "en_core_web_sm" English pipeline once; the resulting `nlp`
# object is the callable that pos_tag_and_merge uses to process each review.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Function for part-of-speech tagging and merging
def pos_tag_and_merge(text, nlp_model=None):
    """Group the lemmas of ``text`` by part-of-speech tag.

    Parameters
    ----------
    text : str
        The text to analyse.
    nlp_model : callable, optional
        Callable that turns ``text`` into an iterable of tokens exposing
        ``pos_`` and ``lemma_`` attributes (e.g. a loaded spaCy pipeline).
        When omitted, the notebook-level ``nlp`` pipeline is used, which is
        the original behaviour.

    Returns
    -------
    dict
        One entry per tracked tag ("ADJ", "ADV", "NOUN", "VERB", "PROPN",
        "DET", "NUM"), in that order. Each value is the space-joined lemmas
        carrying that tag, in order of appearance, or ``None`` when no token
        had that tag. Tokens with any other tag are ignored.
    """
    # Fall back to the global pipeline only when no model was injected, so the
    # function stays usable (and testable) without the notebook's global state.
    pipeline = nlp if nlp_model is None else nlp_model
    doc = pipeline(text)

    tracked_pos = ("ADJ", "ADV", "NOUN", "VERB", "PROPN", "DET", "NUM")
    pos_tags = {pos: [] for pos in tracked_pos}

    # Add each token's lemma to the bucket of its POS tag (untracked tags are skipped)
    for token in doc:
        bucket = pos_tags.get(token.pos_)
        if bucket is not None:
            bucket.append(token.lemma_)

    # Merge each bucket into a single string, or None if the bucket is empty
    return {pos: " ".join(words) if words else None for pos, words in pos_tags.items()}

In [6]:
# Apply POS tagging and merging to each review, one dict per row, and build
# the DataFrame in one go. The index is pinned to df_review's index so the
# column-wise concat below aligns row-for-row.
pos_tags_df = pd.DataFrame(
    [pos_tag_and_merge(review) for review in df_review[target_text]],
    index=df_review.index,
)

# This notebook reads and later overwrites the same CSV, so on a re-run the
# POS columns are already present in df_review. Drop those stale columns first;
# otherwise concat would produce duplicate column names.
stale_pos_columns = [col for col in pos_tags_df.columns if col in df_review.columns]

# Insert POS tags into original dataset df_review
df_review = pd.concat(
    [df_review.drop(columns=stale_pos_columns), pos_tags_df],
    axis=1,
)

In [7]:
# Preview the first two rows to check the appended POS columns
df_review.head(2)

Unnamed: 0,source,hotel_id,hotel_name,country,group_name,room_type,stay_length,stay_date,review_score,review_score_category,...,review_cleaned_v1,review_cleaned_v2,review_cleaned_v3,ADJ,ADV,NOUN,VERB,PROPN,DET,NUM
0,Agoda,1007,"PARKROYAL on Kitchener Road, Singapore",India,Couple,Double Room,6,01/01/2024,9.6,Exceptional,...,awesome aspect,awesome aspect,awesome aspect,,,aspect,,awesome,,
1,Agoda,1007,"PARKROYAL on Kitchener Road, Singapore",India,Couple,Double Room,3,01/01/2024,9.6,Exceptional,...,everything good except many restaurant hotel c...,everything good except many restaurant charge ...,everything except many restaurant charge cutle...,many little unusual,,restaurant charge cutlery rest thing,find,,,


In [8]:
# Persist the enriched dataset without the DataFrame index.
# NOTE: this writes to the same path the dataset was loaded from, so the
# input file is overwritten and will contain the POS columns on later runs.
df_review.to_csv(SRC_PATH + "en_hotel_review.csv", index=False, encoding="utf-8")

In [9]:
# Report when processing finished and how long the whole notebook took
end_time = datetime.now()
print(f"Text processing ended at {end_time}")
print(f"Text processing spent {end_time - start_time}")

Text processing ended at 2024-06-27 20:53:29.322821
Text processing spent 0:03:21.248438
