In [1]:
import pandas as pd

import spacy
from spacy_cld import LanguageDetector

from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
data_path = "../data/AERA02_AptitudeAssessment_Dataset_NLP.csv"

df = pd.read_csv(data_path)

In [3]:
df

Unnamed: 0,score,title,review
0,5.0,Very good hotel,"Good hotel i have ever stayed in Vietnam, good..."
1,4.0,BUEN ALOJAMIENTO QUE GANARIA MUCHO MEJORANDO E...,Este hotel está muy cerca del barrio de las em...
2,5.0,Great place in Cau Giay,This place was very nice. Our bedroom were cle...
3,5.0,TRẢI NGHIỆM TỐT,Đầy đủ dịch vụ tiện nghi Ăn sáng buffee ngon H...
4,5.0,Perfect stay,It was a amazing hotel. They helped very good ...
...,...,...,...
1203207,5.0,乾淨整潔，交通方便,位於峴港市區，距離韓江橋或韓市場都不會太遠，店員很熱心，還可以幫忙預訂摩托車跟行程，非常值得...
1203208,5.0,Check this place,My friend and I received excellent and profess...
1203209,5.0,店员给了我们很多帮助，装修简单精致，卫生很好,这是我们此行到越南第一个入住的酒店，也是呆的时间最长的酒店。酒店原本是一家咖啡店，其次楼上有...
1203210,5.0,Công tác,Rất tuyệt vời... khi đến đây tôi cảm giác thoả...


In [4]:
df.isna().sum()

score     163856
title     163871
review    389364
dtype: int64

In [5]:
df[df.isna().any(axis=1)].T

Unnamed: 0,3000,3001,3002,3003,3004,3005,3006,3007,3008,3009,...,1179643,1179644,1179645,1179646,1179647,1185209,1189562,1190394,1198988,1201101
score,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,2.0,4.0,5.0,5.0,5.0
title,Friendly and helpful team of staff,Friendly and helpful team of staff,Friendly and helpful team of staff,Friendly and helpful team of staff,Friendly and helpful team of staff,Friendly and helpful team of staff,Friendly and helpful team of staff,Friendly and helpful team of staff,Friendly and helpful team of staff,Friendly and helpful team of staff,...,Very Clean Room With Reasonable Price,Very Clean Room With Reasonable Price,Very Clean Room With Reasonable Price,Very Clean Room With Reasonable Price,Very Clean Room With Reasonable Price,맛사지 도중에 눈가리고 맛사지사를 바꿔요- 주의하세요,,家庭旅行，很好的酒店同埋不錯的服務,Wonderful Experience,Wonderful hospitality and service
review,,,,,,,,,,,...,,,,,,,Quite good place to be stay on your vacation b...,,,


In [6]:
df.duplicated().sum()

418969

In [7]:
df[df.duplicated()]

Unnamed: 0,score,title,review
3001,5.0,Friendly and helpful team of staff,
3002,5.0,Friendly and helpful team of staff,
3003,5.0,Friendly and helpful team of staff,
3004,5.0,Friendly and helpful team of staff,
3005,5.0,Friendly and helpful team of staff,
...,...,...,...
1195651,5.0,"Good location, nice room, friendly staff!",Such a lovely stay! The staff was amazingly fr...
1195652,5.0,Wondrrful time,We stayed here for 3 nights and that was wonde...
1195653,5.0,Sofia Suite Hotel is your home away from home,Stayed at Sofia Suite Hotel for 29 nights. Whi...
1198989,no_info,no_info,


Check for unique values in the "score" column.

In [8]:
df["score"].unique()

array(['5.0', '4.0', '2.0', '1.0', '3.0', 'Singapore, Singapore',
       'no_info', 'Tauranga, New Zealand', nan, 'Worldwide',
       'Kamakura, Japan', 'San Ramon, California', 'Taipei, Taiwan',
       'Hanoi, Vietnam', 'Ashdod, Israel', 'buenos aires',
       'Seoul, South Korea', 'Chur, Switzerland', 'Esneux, Belgium',
       'Hoi An, Vietnam', 'Shiwa-cho, Japan',
       'Las Navas de la Concepcion, Spain', 'Nuth, The Netherlands',
       'Nha Trang, Vietnam', 'Corona, California', 'Moscow, Russia',
       'Mumbai, India', 'Ho Chi Minh City, Vietnam', 'Nashik, India',
       'Devonport, Tasmania, Australia', 'Gold Coast, Australia',
       'London, United Kingdom', 'Melbourne, Australia',
       'Newry, United Kingdom', 'Edmonton, Canada',
       'Alice Springs, Australia', 'Goyang, South Korea',
       'Busan, South Korea'], dtype=object)

In [9]:
invalid_values = set(df["score"].unique()) - {'1.0', '2.0', '3.0', '4.0', '5.0'}
invalid_df = df[df["score"].isin(invalid_values)].copy()
invalid_df

Unnamed: 0,score,title,review
32768,"Singapore, Singapore",no_info,
33673,no_info,no_info,
35620,no_info,no_info,
38210,no_info,no_info,
41187,no_info,no_info,
...,...,...,...
1179648,no_info,no_info,
1185210,no_info,no_info,
1190395,no_info,no_info,
1198989,no_info,no_info,


In [10]:
unique_values = []
for val in df["score"].unique().tolist():
    if val is not None:
        try:
            int_val = int(float(val))
            unique_values.append(val)
        except ValueError:
            pass

unique_values

['5.0', '4.0', '2.0', '1.0', '3.0']

In [11]:
df = df[df["score"].isin(unique_values)].copy()
df

Unnamed: 0,score,title,review
0,5.0,Very good hotel,"Good hotel i have ever stayed in Vietnam, good..."
1,4.0,BUEN ALOJAMIENTO QUE GANARIA MUCHO MEJORANDO E...,Este hotel está muy cerca del barrio de las em...
2,5.0,Great place in Cau Giay,This place was very nice. Our bedroom were cle...
3,5.0,TRẢI NGHIỆM TỐT,Đầy đủ dịch vụ tiện nghi Ăn sáng buffee ngon H...
4,5.0,Perfect stay,It was a amazing hotel. They helped very good ...
...,...,...,...
1203207,5.0,乾淨整潔，交通方便,位於峴港市區，距離韓江橋或韓市場都不會太遠，店員很熱心，還可以幫忙預訂摩托車跟行程，非常值得...
1203208,5.0,Check this place,My friend and I received excellent and profess...
1203209,5.0,店员给了我们很多帮助，装修简单精致，卫生很好,这是我们此行到越南第一个入住的酒店，也是呆的时间最长的酒店。酒店原本是一家咖啡店，其次楼上有...
1203210,5.0,Công tác,Rất tuyệt vời... khi đến đây tôi cảm giác thoả...


In [12]:
df["score"] = df["score"].astype("float16")
df

Unnamed: 0,score,title,review
0,5.0,Very good hotel,"Good hotel i have ever stayed in Vietnam, good..."
1,4.0,BUEN ALOJAMIENTO QUE GANARIA MUCHO MEJORANDO E...,Este hotel está muy cerca del barrio de las em...
2,5.0,Great place in Cau Giay,This place was very nice. Our bedroom were cle...
3,5.0,TRẢI NGHIỆM TỐT,Đầy đủ dịch vụ tiện nghi Ăn sáng buffee ngon H...
4,5.0,Perfect stay,It was a amazing hotel. They helped very good ...
...,...,...,...
1203207,5.0,乾淨整潔，交通方便,位於峴港市區，距離韓江橋或韓市場都不會太遠，店員很熱心，還可以幫忙預訂摩托車跟行程，非常值得...
1203208,5.0,Check this place,My friend and I received excellent and profess...
1203209,5.0,店员给了我们很多帮助，装修简单精致，卫生很好,这是我们此行到越南第一个入住的酒店，也是呆的时间最长的酒店。酒店原本是一家咖啡店，其次楼上有...
1203210,5.0,Công tác,Rất tuyệt vời... khi đến đây tôi cảm giác thoả...


In [13]:
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,score,title,review
0,5.0,Very good hotel,"Good hotel i have ever stayed in Vietnam, good..."
1,4.0,BUEN ALOJAMIENTO QUE GANARIA MUCHO MEJORANDO E...,Este hotel está muy cerca del barrio de las em...
2,5.0,Great place in Cau Giay,This place was very nice. Our bedroom were cle...
3,5.0,TRẢI NGHIỆM TỐT,Đầy đủ dịch vụ tiện nghi Ăn sáng buffee ngon H...
4,5.0,Perfect stay,It was a amazing hotel. They helped very good ...
...,...,...,...
1039187,5.0,乾淨整潔，交通方便,位於峴港市區，距離韓江橋或韓市場都不會太遠，店員很熱心，還可以幫忙預訂摩托車跟行程，非常值得...
1039188,5.0,Check this place,My friend and I received excellent and profess...
1039189,5.0,店员给了我们很多帮助，装修简单精致，卫生很好,这是我们此行到越南第一个入住的酒店，也是呆的时间最长的酒店。酒店原本是一家咖啡店，其次楼上有...
1039190,5.0,Công tác,Rất tuyệt vời... khi đến đây tôi cảm giác thoả...


In [14]:
df.dtypes

score     float16
title      object
review     object
dtype: object

In [15]:
df[df["review"].isna() & df["title"].isna()]

Unnamed: 0,score,title,review


In [16]:
import numpy as np
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', None)
def create_df(df):
    not_numeric_cols = df.select_dtypes(exclude=np.number)
    value_ratios = [dict(df[col].value_counts(normalize=True).mul(100).round(1).astype(str) + '%')
                    for col in not_numeric_cols.columns]
    missing_ratio = round(not_numeric_cols.isna().sum()*100/len(not_numeric_cols), 1)
    num_values = [df[col].nunique() for col in not_numeric_cols.columns]
    not_numeric_cols.reset_index(drop=True)
    my_index = ['missing_ratio', 'num_values', 'value_ratios']
    cat_col_info_df = pd.DataFrame(np.array([missing_ratio, num_values, value_ratios]),index=my_index, columns=not_numeric_cols.columns)
    return cat_col_info_df

describe_df = create_df(df)

In [17]:
describe_df

Unnamed: 0,title,review
missing_ratio,0.0,21.7
num_values,533565,783760
value_ratios,"{'The best place to relax': '4.2%', 'Friendly and helpful team of staff': '2.9%', 'SUNSET SUNATO Ở ĐÂY CHỦ YẾU LÀ ĐƯỢC CÁI VỊ TRÍ ĐẸP NGẮM HOÀNG HÔN': '2.8%', '여행': '2.8%', 'Vinpearl Condotel Beachfront': '2.3%', '3rd review - ': '2.2%', 'Very Clean Room With Reasonable Price': '1.8%', '.pecialy is me Trang Hoang The hotel very nice the staff is Very nice and polite frienly . ': '1.6%', 'Reception המלון מזמין ומארח מאד באדיבות.. ארוחת סוקר מאד עשירה. חדרים מעוצבים יפה. העובדה סאני משתדל ...","{'시설이 좀 오래 되어.. 금고랑 샤워기랑 문제가 좀 있었지만 ': '3.5%', '…': '0.0%', 'Hanoi city is very safe and also the hotel is very clean..and also the staff is kind and also ms.na is look so beautiful..and when im go with ha long bae yestesday its so very nice place..im enjoying here in vietnam': '0.0%', 'Thank you my beautiful Dinh for your warm hospitality and helpfulness, hope to meet you again next time when I visit Hanoi. I will bring my sons and go back here. I like Halong tOur and Sapa tour very much..."


## Language Detector

In [18]:
# !pip install langdetect
# !pip install polyglot

In [19]:
import spacy
from spacy_cld import LanguageDetector

from tqdm import tqdm 

from langdetect import DetectorFactory, detect
DetectorFactory.seed = 0

In [20]:
nlp = spacy.load('en_core_web_sm')
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

There was bad characters that raise an utf-8 error, so I will remove all of them before feed the data to the model. To overcome this issue, I levarge the bad characters list from this github issue:
https://github.com/aboSamoor/polyglot/issues/71

In [None]:
import re
from pprint import pprint

# Define the characters to remove
RE_BAD_CHARACTERS = re.compile(r'[\x00-\x1f\x7f-\x9f]')

def remove_bad_chars(text):
    return RE_BAD_CHARACTERS.sub("", text)

# text = df["review"][25182]

# clean_text = remove_bad_chars(text)
# pprint(text)
# pprint(clean_text)

In [23]:
df.fillna("", inplace=True)

In [26]:
import string
def detect_lang(text):
    doc = nlp(remove_bad_chars(text[:200]))
    return doc._.languages[0] if doc._.languages else "unknown"

# Check if title ends with punctuation
df["title"] = df["title"].apply(lambda x: x if x.strip().endswith(tuple(string.punctuation)) else x + ".")

# Concatenate title and review
df["title2review"] = df["title"] + " " + df["review"]
df["language"] = df["title2review"].parallel_apply(detect_lang)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=64950), Label(value='0 / 64950')))…

In [45]:
df.to_csv("../data/AERA02_AptitudeAssessment_Dataset_NLP_cleaned.csv", index=False)

In [47]:
df.sample(20)

Unnamed: 0,score,title,review,title2review,language
175050,5.0,Highly recommended .,"We had a great experience at the hotel. The team was welcoming from check-in, including providing a detailed introduction of activities to try in the area, and thoughtful touches like cards to inform you of the next day's weather. Breakfast is a nice balance between flavourful local and Western dishes. Will definitely stay with them again for future trips; what they offer is really special.","Highly recommended . We had a great experience at the hotel. The team was welcoming from check-in, including providing a detailed introduction of activities to try in the area, and thoughtful touches like cards to inform you of the next day's weather. Breakfast is a nice balance between flavourful local and Western dishes. Will definitely stay with them again for future trips; what they offer is really special.",en
236074,5.0,Great stay.,This is my second time staying in this hotel and my duration is no less than 5 nights each time. The hotel is conveniently located and other places of interests and shopping mall are located nearby. All the staff are friendly and courteous. Definitely a good choice . I will choose the same hotel for my next trip to Hanoi.,Great stay. This is my second time staying in this hotel and my duration is no less than 5 nights each time. The hotel is conveniently located and other places of interests and shopping mall are located nearby. All the staff are friendly and courteous. Definitely a good choice . I will choose the same hotel for my next trip to Hanoi.,en
463136,4.0,Cũng không quá tồi.,"Chuyến đi vài ngày, có vài trục trặc tại khách sạn nhưng đã giải quyết ổn thỏa. Về mọi thứ căn bản ổn. Phục vụ tốt. Cảnh ở đây view đẹp. Có thể sẽ ở đây lần tới. Đây có thể là lựa chọn tốt cho khách du lịch","Cũng không quá tồi. Chuyến đi vài ngày, có vài trục trặc tại khách sạn nhưng đã giải quyết ổn thỏa. Về mọi thứ căn bản ổn. Phục vụ tốt. Cảnh ở đây view đẹp. Có thể sẽ ở đây lần tới. Đây có thể là lựa chọn tốt cho khách du lịch",vi
598028,5.0,Perfect place to chill in a quite environment !,The place is quite remote but this is what makes the beauty of it. The staff is super friendly. And on top of this the resort itself is extremely charming. If you want to chill couple day in quite environment and get fresh seafood (fisherman at 1km) then don't miss this place !,Perfect place to chill in a quite environment ! The place is quite remote but this is what makes the beauty of it. The staff is super friendly. And on top of this the resort itself is extremely charming. If you want to chill couple day in quite environment and get fresh seafood (fisherman at 1km) then don't miss this place !,en
52856,5.0,Simply amazing.,"My girlfriend and I stayed at the Capella Hanoi for 2 nights earlier this month. Put simply, the hotel is amazing. - Exceptional service - all the staff anticipated our needs without even asking. any request was met with a smile and going above and beyond to exceed our needs. - Great breakfast - the Vietnamese options were flavorful, tasty, and on par if not better than most of the well received restaurants/stalls we visited - Gorgeous and comfortable rooms - great attention to detail, c...","Simply amazing. My girlfriend and I stayed at the Capella Hanoi for 2 nights earlier this month. Put simply, the hotel is amazing. - Exceptional service - all the staff anticipated our needs without even asking. any request was met with a smile and going above and beyond to exceed our needs. - Great breakfast - the Vietnamese options were flavorful, tasty, and on par if not better than most of the well received restaurants/stalls we visited - Gorgeous and comfortable rooms - great attent...",en
776838,5.0,"Charming, well-located, quiet.","We were a group of 10 (five double rooms) from Europe and the US, including a family that had lived 8 years in Vietnam and knew Hoi An well. The rooms were a charming modern asian style (lots of wood), those facing the back had small balconies overlooking ""green"" spaces. Everything functioned and was clean. The staff were unusually helpful, cheerful, engaging, personable. The western breakfast was wide ranging: the vietnamese breakfast couldn't be beat in the best restaurants. The hotel...","Charming, well-located, quiet. We were a group of 10 (five double rooms) from Europe and the US, including a family that had lived 8 years in Vietnam and knew Hoi An well. The rooms were a charming modern asian style (lots of wood), those facing the back had small balconies overlooking ""green"" spaces. Everything functioned and was clean. The staff were unusually helpful, cheerful, engaging, personable. The western breakfast was wide ranging: the vietnamese breakfast couldn't be beat in t...",en
84307,5.0,Couldn't be happier with our choice.,"Extremely helpful and friendly staff! Room was very clean and the air conditioning worked well. Very affordable. The location was perfect! We were able to walk everywhere we wanted to go. The free breakfast was good, too. Only downsides are that the bathroom is rather small and the wifi wasn't reliable in our room - but to be fair we didn't ask about fixing it or attempt to fix it ourselves.","Couldn't be happier with our choice. Extremely helpful and friendly staff! Room was very clean and the air conditioning worked well. Very affordable. The location was perfect! We were able to walk everywhere we wanted to go. The free breakfast was good, too. Only downsides are that the bathroom is rather small and the wifi wasn't reliable in our room - but to be fair we didn't ask about fixing it or attempt to fix it ourselves.",en
322442,5.0,Very nice family with affordable price.,"I found Cuc Phuong Hotel by reading reviews. I had a taxi driver call the owner of Cuc Phuong Hotel for directions. He was very kind on the phone. When I arrived, since there were no one staying in single room, he let me stay there instead of dormitory. I rented a motorbike and went to the National Park. You can stay in the park but the owner Luan gives you good price and he is very helpful. If you rent a motor in the park, it is $25 but with Luan it is only $6. He took me to a bus station a...","Very nice family with affordable price. I found Cuc Phuong Hotel by reading reviews. I had a taxi driver call the owner of Cuc Phuong Hotel for directions. He was very kind on the phone. When I arrived, since there were no one staying in single room, he let me stay there instead of dormitory. I rented a motorbike and went to the National Park. You can stay in the park but the owner Luan gives you good price and he is very helpful. If you rent a motor in the park, it is $25 but with Luan it i...",en
623671,5.0,Lovely stay experience in Phu Quoc.,"best 5star property in Phu island with large swimming pool overlooking ocean, next to vinworld park so convenient to go.. +they run free shuttle to and fro to safari too. At hotel had good experience at reception by Staff Eddie. Also chef sidiquee was always ready to prepare a special vegetarian meal for us as needed indian veg meals","Lovely stay experience in Phu Quoc. best 5star property in Phu island with large swimming pool overlooking ocean, next to vinworld park so convenient to go.. +they run free shuttle to and fro to safari too. At hotel had good experience at reception by Staff Eddie. Also chef sidiquee was always ready to prepare a special vegetarian meal for us as needed indian veg meals",en
370213,5.0,Amazing oasis in a beautiful national park.,"We stayed at Sy's for 2 nights and are so glad we did. The rooms were spotless, new, spacious, and beautifully designed (wood interior, stone bathrooms). The people at the homestay were always available and eager to help us with anything we needed. The bikes that are at your disposal for free were well-maintained and many to choose among (they had the right sizes for all 3 of our kids). Great pool. Only slight negative was breakfast and coffee - not great. But there are wonderful place...","Amazing oasis in a beautiful national park. We stayed at Sy's for 2 nights and are so glad we did. The rooms were spotless, new, spacious, and beautifully designed (wood interior, stone bathrooms). The people at the homestay were always available and eager to help us with anything we needed. The bikes that are at your disposal for free were well-maintained and many to choose among (they had the right sizes for all 3 of our kids). Great pool. Only slight negative was breakfast and coffee...",en
