In [63]:
import pandas as pd
import numpy as np
import re

In [64]:
raw_df = pd.read_csv("2024-10-15-AccommodationItem.csv")
raw_df.head()

Unnamed: 0,id,name,typeId,star,reviewScore,reviewCount,url,description,address,lat,lng,unities,checkin,checkout,petInfo,paymentMethods
0,11938811,Khách sạn Hera Hotel,204,1.0,9.0,31,https://www.booking.com/hotel/vn/khach-san-her...,"Set in Hà Ðông, within 6.5 km of Vincom Center...","BT30 - GTC, 66 P. Tố Hữu, Hà Đông, Hà Nội, Hà ...",20.984943,105.774455,"['Airport shuttle', 'Non-smoking rooms', 'Free...",From 14:00 to 00:00,From 12:00 to 13:00,Pets are allowed. No extra charges.,['Cash']
1,12102265,Ánh Hồng Hotel,204,5.0,9.1,8,https://www.booking.com/hotel/vn/anh-duc-lai-c...,Ánh Hồng Hotel is offering accommodation in La...,"Số nhà 027 đường bờ hồ Thuỷ Sơn, phường Tân Ph...",22.39111,103.467469,"['Non-smoking rooms', 'Free parking', 'Room se...",Available 24 hours,Available 24 hours,Pets are allowed on request. Charges may be ap...,"['Visa', 'Mastercard', 'JCB', 'Cash']"
2,12164112,D Apartment,201,,10.0,5,https://www.booking.com/hotel/vn/d-apartment.e...,"Set 600 metres from Doi Duong Beach, D Apartme...","n4-01 Phan Trung, Ấp Bình Hưng, Vietnam",10.931548,108.117363,"['Free parking', 'Free WiFi', 'Family rooms', ...",From 14:30 to 23:00,From 08:00 to 12:00,Pets are not allowed.,['Cash']
3,9737330,Homestay Vân Đức Cù Lao Chàm,201,,8.9,98,https://www.booking.com/hotel/vn/homestay-van-...,"Set near Bai Lang Beach and Bai Ong Beach, Hom...","Homestay Vân Đức Cù Lao Chàm, Tân Hiệp, Vietnam",15.960278,108.505032,"['Non-smoking rooms', 'Free parking', 'Free Wi...",From 09:00 to 17:00,From 11:00 to 12:00,Pets are allowed. No extra charges.,['Cash']
4,8112394,La Casa Boutique Hotel,204,3.0,9.4,1920,https://www.booking.com/hotel/vn/la-casa-vung-...,"Situated in Vung Tau, within 1.1 km of Back Be...","22 Phan Huy Ich, Vung Tau, Vietnam",10.337734,107.082999,"['Outdoor swimming pool', 'Free parking', 'Fam...",From 14:00 to 00:00,From 01:00 to 12:00,Pets are not allowed.,"['American Express', 'Visa', 'Mastercard', 'Di..."


In [65]:
raw_df.shape

(50, 16)

In [66]:
raw_df.dtypes

id                  int64
name               object
typeId              int64
star              float64
reviewScore       float64
reviewCount         int64
url                object
description        object
address            object
lat               float64
lng               float64
unities            object
checkin            object
checkout           object
petInfo            object
paymentMethods     object
dtype: object

In [67]:
duplicate_rows = raw_df[raw_df.duplicated()]
duplicate_rows

Unnamed: 0,id,name,typeId,star,reviewScore,reviewCount,url,description,address,lat,lng,unities,checkin,checkout,petInfo,paymentMethods


In [68]:
percent_null = raw_df.isna().mean()*100
percent_null

id                 0.0
name               0.0
typeId             0.0
star              40.0
reviewScore        0.0
reviewCount        0.0
url                0.0
description        0.0
address            0.0
lat                0.0
lng                0.0
unities            0.0
checkin            0.0
checkout           0.0
petInfo            0.0
paymentMethods     0.0
dtype: float64

In [69]:
def clean_description_text(series):
    return series.str.replace(r'[^\w\s]', '', regex=True).str.strip()

In [70]:
def transform_unities_column(unities_series):
    unities_clean = unities_series.str.replace(r'[\[\]]', '', regex=True).str.strip()
    unities_list = unities_clean.str.split(',').apply(lambda x: list(set(item.strip() for item in x if pd.notnull(item))))
    unities_dummies = unities_list.str.join('|').str.get_dummies()
    
    return pd.concat([unities_series.to_frame(), unities_dummies], axis=1)

In [71]:
def split_checkin_checkout_times(checkin_series, checkout_series):
    checkin_times = checkin_series.str.extract(r'From (\d{2}:\d{2}) to (\d{2}:\d{2})')
    checkin_times.columns = ['checkin_start', 'checkin_end']

    checkout_times = checkout_series.str.extract(r'From (\d{2}:\d{2}) to (\d{2}:\d{2})')
    checkout_times.columns = ['checkout_start', 'checkout_end']
    
    return checkin_times, checkout_times

In [72]:
def identify_pet_friendly(pet_info_series):
    return ~pet_info_series.str.contains('not allowed', case=False, na=False)

In [73]:
def extract_payment_methods_list(payment_methods_series):
    payment_methods_clean = payment_methods_series.str.replace(r'[\[\]]', '', regex=True).str.strip()
    payment_methods_list = payment_methods_clean.str.split(',').apply(lambda x: list(set(item.strip() for item in x if pd.notnull(item))))
    return payment_methods_list

In [74]:
df_test = pd.DataFrame()

In [75]:
df_test['description_clean'] = clean_description_text(raw_df['description'])
df_test = transform_unities_column(raw_df['unities'])
df_test[['checkin_start', 'checkin_end']], df_test[['checkout_start', 'checkout_end']] = split_checkin_checkout_times(raw_df['checkin'], raw_df['checkout'])
df_test['pet_friendly'] = identify_pet_friendly(raw_df['petInfo'])
df_test['payment_methods_list'] = extract_payment_methods_list(raw_df['paymentMethods'])


In [78]:
df_pet = pd.DataFrame()
df_unity = pd.DataFrame()

In [77]:
df_unity['unities_clean'] = raw_df['unities'].str.replace(r'[\[\]]', '', regex=True).str.strip()
df_unity['unities_list'] = df_unity['unities_clean'].str.split(',').apply(lambda x: list(set(item.strip() for item in x)))
unities_dummies = df_unity['unities_list'].str.join('|').str.get_dummies()
df_unity = pd.concat([df_unity, unities_dummies], axis=1)
df_unity.head()

Unnamed: 0,unities,'2 restaurants','2 swimming pools','24-hour front desk','3 restaurants','Airport shuttle (free)','Airport shuttle','Bar','Beachfront','Breakfast',...,'Non-smoking rooms','Outdoor swimming pool','Private beach area','Private parking','Restaurant','Room service','Spa and wellness centre','Superb breakfast','Tea/coffee maker in all rooms','Very good breakfast'
0,"['Airport shuttle', 'Non-smoking rooms', 'Free...",0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
1,"['Non-smoking rooms', 'Free parking', 'Room se...",0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
2,"['Free parking', 'Free WiFi', 'Family rooms', ...",0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,"['Non-smoking rooms', 'Free parking', 'Free Wi...",0,0,0,0,0,0,0,1,1,...,1,0,1,0,1,0,0,0,0,0
4,"['Outdoor swimming pool', 'Free parking', 'Fam...",0,0,0,0,0,1,0,0,0,...,1,1,0,0,0,1,0,0,0,0


In [None]:
df_pet['petInfo'] = raw_df['petInfo']
df_pet['pet_allowed'] = raw_df['petInfo'].apply(lambda x: 0 if 'not allowed' in str(x).lower() else 1)
df_pet.head()

Unnamed: 0,pet_allowed,petInfo
0,1,Pets are allowed. No extra charges.
1,1,Pets are allowed on request. Charges may be ap...
2,0,Pets are not allowed.
3,1,Pets are allowed. No extra charges.
4,0,Pets are not allowed.
