In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import TargetEncoder, RobustScaler
from sklearn.feature_selection import SelectKBest, chi2
import json

In [2]:
# Set options to display all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)

## Data Preprocessing

This project scraped for hotels in Japan listed in Booking.com. The scraper used the site's search engine to fetch for hotels. Search parameters have been limited to two guests and five overnights stays (from Mon. to Sat.) from July to December 2025. When getting the room price, the code implementation only takes the first room offered (best deal) with a maximum guest capacity of two. The scraped data is as follows:

In [3]:
file = 'dataset.json'
df = pd.read_json(file)

In [4]:
df

Unnamed: 0,url,title,address,check_in_date,check_out_date,review_score,review_count,popular_facilities,price
0,https://www.booking.com/hotel/jp/prince-smart-...,Prince Smart Inn Kyoto Sanjo,Kyoto,"Mon, Dec 1","Sat, Dec 6",8.70,3505,"non-smoking rooms,restaurant,free wifi,24-hour...","₱ 40,833"
1,https://www.booking.com/hotel/jp/hoteruziyapan...,Hotel Japanesque Kyoto Station ZEQUU ANNEX,Kyoto,"Mon, Oct 6","Sat, Oct 11",8.50,457,"non-smoking rooms,free wifi","₱ 58,047"
2,https://www.booking.com/hotel/jp/aquasense-amp...,AQUASENSE Hotel & Resort,Okinawa,"Mon, Dec 1","Sat, Dec 6",9.30,471,"outdoor swimming pool,non-smoking rooms,restau...","₱ 72,633"
3,https://www.booking.com/hotel/jp/ocean-view-hi...,Ocean View Hiromi SPA Hotel温泉と絶景の宿,Shizuoka,"Mon, Oct 6","Sat, Oct 11",8.40,281,"non-smoking rooms,free parking,restaurant,faci...","₱ 64,080"
4,https://www.booking.com/hotel/jp/tisan-gurando...,Chisun Grand Takayama,Gifu,"Mon, Dec 1","Sat, Dec 6",8.70,3496,"non-smoking rooms,spa,private parking,free wif...","₱ 54,276"
...,...,...,...,...,...,...,...,...,...
6522,https://www.booking.com/hotel/jp/okayama-ekima...,Okayama Ekimae Universal Hotel,Okayama,"Mon, Jul 14","Sat, Jul 19",6.20,704,"non-smoking rooms,private parking,restaurant,b...","₱ 13,117"
6523,https://www.booking.com/hotel/jp/hotelsekia.ht...,Hotel Sekia,Kumamoto,"Mon, Jul 14","Sat, Jul 19",6.90,93,"outdoor swimming pool,non-smoking rooms,free p...","₱ 30,331"
6524,https://www.booking.com/hotel/jp/route-inn-yas...,Hotel Route-Inn Yatsushiro,Kumamoto,"Mon, Aug 4","Sat, Aug 9",7.80,360,"free parking,restaurant,good breakfast","₱ 18,179"
6525,https://www.booking.com/hotel/jp/route-inn-kin...,Hotel Route Inn Kinokawa,Wakayama,"Mon, Aug 4","Sat, Aug 9",8.50,90,"non-smoking rooms,free parking,restaurant,free...","₱ 23,331"


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6527 entries, 0 to 6526
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   url                 6527 non-null   object 
 1   title               6524 non-null   object 
 2   address             6524 non-null   object 
 3   check_in_date       6524 non-null   object 
 4   check_out_date      6524 non-null   object 
 5   review_score        6509 non-null   float64
 6   review_count        6509 non-null   object 
 7   popular_facilities  6527 non-null   object 
 8   price               6263 non-null   object 
dtypes: float64(1), object(8)
memory usage: 459.1+ KB


The original dataset has a shape of (6527, 9).

The columns `review-count` and `price` needs to be converted to numerical values.

In [6]:
# convert review_count
df['review_count'] = df['review_count'].str.replace(',', '')
df['review_count'] = pd.to_numeric(df['review_count'])
df['review_count'].dtype

dtype('float64')

In [7]:
# convert price
df['price'] = df['price'].str.replace('₱','').str.replace(',','')
df['price'] = pd.to_numeric(df['price'])
df['price'].dtype

dtype('float64')

Rename `address` to `prefecture` to better represent the values of the column. Also, the code implementation of the scraper is missing two prefectures, Nara and Saitama. Instances with these prefectures must be replaced to shorten the address.

The `url` column can be dropped. It was only used for verification during scraping.

In [8]:
# rename address to prefecture
df.rename(columns={'address': 'prefecture'}, inplace=True)
missing_prefectures = ['Saitama','Nara']
for prefecture in missing_prefectures:
    df.loc[df['prefecture'].str.contains(prefecture, case=False, na=False), 'prefecture'] = prefecture

In [9]:
df.drop('url', axis=1, inplace=True)

During scraping, it was found that `review_count` can be updated, which prevents the `drop_duplicates()` method from removing duplicates among instances. Matches using `title`, `check_in_date`, and `check_out_date` will be used to drop one of the copies. The copy with the highest (latest) `review_count` will be kept.

In [10]:
df.drop_duplicates(inplace=True)
df.shape

(6301, 8)

In [11]:
cols = ['title','check_in_date','check_out_date']
duplicates = df[df.duplicated(subset=cols, keep=False)]
duplicates.sort_values(by='title')

Unnamed: 0,title,prefecture,check_in_date,check_out_date,review_score,review_count,popular_facilities,price
2917,&Here TOKYO UENO,Tokyo,"Mon, Aug 18","Sat, Aug 23",9.00,758.00,"non-smoking rooms,private parking,free wifi,fa...",61618.00
5848,&Here TOKYO UENO,Tokyo,"Mon, Aug 18","Sat, Aug 23",9.00,762.00,"non-smoking rooms,private parking,free wifi,fa...",61618.00
1239,AB Hotel Igaueno,Mie,"Mon, Aug 25","Sat, Aug 30",8.30,99.00,"free parking,free wifi,very good breakfast",14932.00
2028,AB Hotel Igaueno,Mie,"Mon, Oct 20","Sat, Oct 25",8.30,99.00,"free parking,free wifi,very good breakfast",17887.00
2716,AB Hotel Igaueno,Mie,"Mon, Aug 25","Sat, Aug 30",8.30,100.00,"free parking,free wifi,very good breakfast",14932.00
...,...,...,...,...,...,...,...,...
4541,ホテルザセブン,Kumamoto,"Mon, Sep 1","Sat, Sep 6",7.10,104.00,"non-smoking rooms,private parking,restaurant,g...",14559.00
226,谷町君 HOTEL 日本橋47,Osaka,"Mon, Dec 8","Sat, Dec 13",9.00,1152.00,"free wifi,air conditioning",42909.00
3715,谷町君 HOTEL 日本橋47,Osaka,"Mon, Dec 8","Sat, Dec 13",9.00,1154.00,"free wifi,air conditioning",43224.00
3760,Ｔａｂｉｓｔ ホテル塩釜&松島,Miyagi,"Mon, Oct 20","Sat, Oct 25",8.00,473.00,"private parking,free wifi",26831.00


In [12]:
# retain instances that have the highest review_count among their duplicates
retained = duplicates.drop_duplicates(subset=cols, keep='last')
retained.sort_values(by='title')

Unnamed: 0,title,prefecture,check_in_date,check_out_date,review_score,review_count,popular_facilities,price
5848,&Here TOKYO UENO,Tokyo,"Mon, Aug 18","Sat, Aug 23",9.00,762.00,"non-smoking rooms,private parking,free wifi,fa...",61618.00
3853,AB Hotel Igaueno,Mie,"Mon, Oct 20","Sat, Oct 25",8.30,100.00,"free parking,free wifi,very good breakfast",17887.00
2716,AB Hotel Igaueno,Mie,"Mon, Aug 25","Sat, Aug 30",8.30,100.00,"free parking,free wifi,very good breakfast",14932.00
4670,AB Hotel Nara,Nara,"Mon, Aug 25","Sat, Aug 30",7.90,1548.00,"non-smoking rooms,private parking,facilities f...",21776.00
3250,APA Hotel & Resort Tokyo Bay Makuhari,Chiba,"Mon, Dec 22","Sat, Dec 27",7.90,13598.00,"outdoor swimming pool,non-smoking rooms,airpor...",30832.00
...,...,...,...,...,...,...,...,...
3601,ホテルカルチャーヴィレッジ,Hokkaido,"Mon, Oct 20","Sat, Oct 25",8.50,24.00,"non-smoking rooms,free parking,free wifi",27951.00
4541,ホテルザセブン,Kumamoto,"Mon, Sep 1","Sat, Sep 6",7.10,104.00,"non-smoking rooms,private parking,restaurant,g...",14559.00
5319,ホテルザセブン,Kumamoto,"Mon, Dec 8","Sat, Dec 13",7.10,104.00,"non-smoking rooms,restaurant,private parking,g...",22009.00
3715,谷町君 HOTEL 日本橋47,Osaka,"Mon, Dec 8","Sat, Dec 13",9.00,1154.00,"free wifi,air conditioning",43224.00


In [13]:
# overwrite the dataframe
df = df[~df.duplicated(subset=cols, keep=False)]
df = pd.concat([df, retained], ignore_index=True)
df.shape

(5932, 8)

After removing the duplicates, the instances have been reduced down to 5932.

In [14]:
# check for null values
df.isna().sum()

title                   1
prefecture              1
check_in_date           1
check_out_date          1
review_score           15
review_count           15
popular_facilities      0
price                 246
dtype: int64

In [15]:
df[df.isna().any(axis=1)]

Unnamed: 0,title,prefecture,check_in_date,check_out_date,review_score,review_count,popular_facilities,price
5,アルファベットイン那覇国際通りWEST,Okinawa,"Mon, Oct 6","Sat, Oct 11",8.70,151.00,"non-smoking rooms,free wifi",
14,ホテル ナインステイツ唐津,Saga,"Mon, Oct 6","Sat, Oct 11",9.50,13.00,"non-smoking rooms,free parking,free wifi",
36,Comfort Villa,Okinawa,"Mon, Nov 17","Sat, Nov 22",8.40,886.00,"non-smoking rooms,private parking,family rooms...",
38,Family Condo Chatan Hills by Coldio Premium,Okinawa,"Mon, Nov 17","Sat, Nov 22",8.70,732.00,"non-smoking rooms,free parking,family rooms,fr...",
79,Sakura Cross Hotel Kyoto Kiyomizu,Kyoto,"Mon, Oct 13","Sat, Oct 18",8.70,2253.00,"non-smoking rooms,free wifi,family rooms,24-ho...",
...,...,...,...,...,...,...,...,...
5623,Granbell Hotel Otaru,Hokkaido,"Mon, Nov 3","Sat, Nov 8",,,"non-smoking rooms,restaurant,private parking,f...",26182.00
5760,Hotel Kinsuien,Oita,"Mon, Aug 4","Sat, Aug 9",7.20,243.00,"non-smoking rooms,free parking,3 restaurants,f...",
5801,UNO HOTEL,Okayama,"Mon, Nov 17","Sat, Nov 22",9.00,1084.00,"non-smoking rooms,3 restaurants,free parking,f...",
5862,MIMARU Tokyo Shinjuku West,Tokyo,"Mon, Nov 3","Sat, Nov 8",8.80,1920.00,"non-smoking rooms,family rooms,free wifi,laund...",


(1) There are instances wherein all of its features are `None` such as instance #3579 (see JSON file). This kind of instances must be dropped. It is assumed that they are scraper errors.

(2) Other `None` values from numerical features specifically, `review_score`, `review_count`, and `price`, can be filled using their mean values. 

In [16]:
# (1) will be addressed by dropping null check_in and check_out dates
df.dropna(subset=['check_out_date','check_in_date'], inplace=True)
df.shape

(5931, 8)

In [17]:
# (2) will be addressed by replacing None values with median values of their respective column
numerical_features = ['review_score','review_count','price']
for feature in numerical_features:
    if feature == 'review_score':
        df.fillna({ feature: round(df[feature].mean(), 1) }, inplace=True)
    else:
        df.fillna({ feature: round(df[feature].mean()) }, inplace=True)

In [18]:
df.isna().sum()

title                 0
prefecture            0
check_in_date         0
check_out_date        0
review_score          0
review_count          0
popular_facilities    0
price                 0
dtype: int64

In [19]:
df.head()

Unnamed: 0,title,prefecture,check_in_date,check_out_date,review_score,review_count,popular_facilities,price
0,Prince Smart Inn Kyoto Sanjo,Kyoto,"Mon, Dec 1","Sat, Dec 6",8.7,3505.0,"non-smoking rooms,restaurant,free wifi,24-hour...",40833.0
1,Hotel Japanesque Kyoto Station ZEQUU ANNEX,Kyoto,"Mon, Oct 6","Sat, Oct 11",8.5,457.0,"non-smoking rooms,free wifi",58047.0
2,Ocean View Hiromi SPA Hotel温泉と絶景の宿,Shizuoka,"Mon, Oct 6","Sat, Oct 11",8.4,281.0,"non-smoking rooms,free parking,restaurant,faci...",64080.0
3,Chisun Grand Takayama,Gifu,"Mon, Dec 1","Sat, Dec 6",8.7,3496.0,"non-smoking rooms,spa,private parking,free wif...",54276.0
4,Kyoto Pleasant Hotel,Kyoto,"Mon, Oct 6","Sat, Oct 11",8.4,153.0,"non-smoking rooms,free wifi,tea/coffee maker i...",51509.0


There are no null values remaining.

The check-in and check-out dates need to be broken down into month and day in month columns. Day in week is unnecessary since all instances start and end on the same day.

In [20]:
# append year to for conversion purposes
df['check_in_date'] = df['check_in_date'] + ', 2025'
df['check_in_date'] = pd.to_datetime(df['check_in_date'], format='%a, %b %d, %Y')

# create new columns and remove check_in_date
df['checkin_month'] = df['check_in_date'].dt.month
df['checkin_day'] = df['check_in_date'].dt.day
df.drop('check_in_date', axis=1, inplace=True)

In [21]:
# do the same for checkout
df['check_out_date'] = df['check_out_date'] + ', 2025'
df['check_out_date'] = pd.to_datetime(df['check_out_date'], format='%a, %b %d, %Y')

df['checkout_month'] = df['check_out_date'].dt.month
df['checkout_day'] = df['check_out_date'].dt.day
df.drop('check_out_date', axis=1, inplace=True)

In [22]:
df

Unnamed: 0,title,prefecture,review_score,review_count,popular_facilities,price,checkin_month,checkin_day,checkout_month,checkout_day
0,Prince Smart Inn Kyoto Sanjo,Kyoto,8.70,3505.00,"non-smoking rooms,restaurant,free wifi,24-hour...",40833.00,12,1,12,6
1,Hotel Japanesque Kyoto Station ZEQUU ANNEX,Kyoto,8.50,457.00,"non-smoking rooms,free wifi",58047.00,10,6,10,11
2,Ocean View Hiromi SPA Hotel温泉と絶景の宿,Shizuoka,8.40,281.00,"non-smoking rooms,free parking,restaurant,faci...",64080.00,10,6,10,11
3,Chisun Grand Takayama,Gifu,8.70,3496.00,"non-smoking rooms,spa,private parking,free wif...",54276.00,12,1,12,6
4,Kyoto Pleasant Hotel,Kyoto,8.40,153.00,"non-smoking rooms,free wifi,tea/coffee maker i...",51509.00,10,6,10,11
...,...,...,...,...,...,...,...,...,...,...
5927,APA Hotel Fukushima Ekimae,Fukushima,8.00,1776.00,"non-smoking rooms,restaurant,facilities for di...",18127.00,7,28,8,2
5928,Hotel Route-Inn Kamisu,Ibaraki,8.30,801.00,"non-smoking rooms,free parking,2 restaurants,f...",21465.00,7,21,7,26
5929,Comfort Hotel Hamamatsu,Shizuoka,7.80,1743.00,"non-smoking rooms,parking,free wifi,good break...",15971.00,7,14,7,19
5930,Hotel Route-Inn Fujieda-Eki Kita,Shizuoka,8.10,210.00,"non-smoking rooms,free parking,restaurant,free...",19850.00,7,14,7,19


The `popular_facilities` column needs to be broken down into multiple features.

In [23]:
# check for unique values in popular_facilities
facilities = {}
count = 0
for f in df['popular_facilities']:
    facilities_list = f.split(',')
    for facility in facilities_list:
        if facility not in facilities:
            facilities[facility] = 1
        else:
            facilities[facility] += 1
        count += 1
# sort by count values
dict(sorted(facilities.items(), key=lambda item: item[1]))

{'13 restaurants': 1,
 'fast free wifi 92 mbps)': 1,
 '3 swimming pools': 2,
 'pool – outdoor (kids)': 2,
 'hot tub/jacuzzi': 2,
 '3 swimming pools (2 open)': 2,
 'pool – indoor (kids)': 3,
 '10 restaurants': 3,
 'wifi in all areas': 3,
 '5 swimming pools (3 open)': 3,
 '5 swimming pools': 3,
 '11 restaurants': 4,
 '5 restaurants (4 open)': 4,
 'wifi': 6,
 '8 restaurants': 7,
 '2 swimming pools (1 open)': 8,
 '9 restaurants': 9,
 '7 restaurants': 29,
 '6 restaurants': 30,
 'skiing': 33,
 'private beach area': 40,
 'garden': 46,
 '5 restaurants': 51,
 '2 swimming pools': 63,
 '4 restaurants': 69,
 'beachfront': 86,
 'airport shuttle (free)': 93,
 'exceptional breakfast': 103,
 'indoor swimming pool': 107,
 'airport shuttle': 108,
 'terrace': 118,
 'hot spring bath': 148,
 'designated smoking area': 162,
 '3 restaurants': 163,
 'outdoor swimming pool': 228,
 'air conditioning': 286,
 'parking': 337,
 'parking on site': 347,
 'wonderful breakfast': 395,
 '2 restaurants': 408,
 'baggage st

In [24]:
print(f'Total number of facilities: {count}')

Total number of facilities: 37287


In [25]:
print(f'Unique facilities: {len(facilities)}')

Unique facilities: 62


The breakdown of important facilities of hotels from the dataset shows that some facilities can be consolidated or grouped together to reduce complexity and prevent wrong assumptions in ordinality and categorization, such as with `breakfast` and `parking`, respectively. Doing this also addresses under represented facilities.

In [26]:
# group other facilities
restaurant = [key for key in facilities.keys() if 'restaurant' in key]
wifi  = [key for key in facilities.keys() if 'wifi' in key]
pool = [key for key in facilities.keys() if 'pool' in key]
breakfast = [key for key in facilities.keys() if 'breakfast' in key]
beach = [key for key in facilities.keys() if 'beach' in key]
airport_shuttle = [key for key in facilities.keys() if 'shuttle' in key]
parking = [key for key in facilities.keys() if 'parking' in key]

grouped_features = [restaurant, wifi, pool, breakfast, airport_shuttle, beach, parking]

In [27]:
# check for remaining facilities ungrouped
test_dict = facilities.copy()
for feature in grouped_features:
    for cat in feature:
        if cat in test_dict.keys():
            del test_dict[cat]
test_dict

{'non-smoking rooms': 5749,
 '24-hour front desk': 1470,
 'laundry': 833,
 'garden': 46,
 'heating': 937,
 'daily housekeeping': 427,
 'facilities for disabled guests': 1730,
 'family rooms': 1671,
 'tea/coffee maker in all rooms': 916,
 'bar': 963,
 'spa': 653,
 'fitness center': 552,
 'room service': 476,
 'baggage storage': 423,
 'air conditioning': 286,
 'terrace': 118,
 'hot spring bath': 148,
 'elevator': 744,
 'designated smoking area': 162,
 'skiing': 33,
 'hot tub/jacuzzi': 2}

The remaining facilities can no longer be grouped. Since the facility 'hot tub/jacuzzi' cannot be grouped with other facilities and is under represented, it will not be added as a feature.

In [28]:
del test_dict['hot tub/jacuzzi']

In [29]:
facility_features = ['restaurant', 'wifi', 'pool', 'breakfast', 'airport_shuttle', 'beach', 'parking']
for key in test_dict.keys():
    feature = key.replace(' ','_')
    facility_features.append(feature)
facility_features

['restaurant',
 'wifi',
 'pool',
 'breakfast',
 'airport_shuttle',
 'beach',
 'parking',
 'non-smoking_rooms',
 '24-hour_front_desk',
 'laundry',
 'garden',
 'heating',
 'daily_housekeeping',
 'facilities_for_disabled_guests',
 'family_rooms',
 'tea/coffee_maker_in_all_rooms',
 'bar',
 'spa',
 'fitness_center',
 'room_service',
 'baggage_storage',
 'air_conditioning',
 'terrace',
 'hot_spring_bath',
 'elevator',
 'designated_smoking_area',
 'skiing']

In [30]:
len(facility_features)

27

The number of unique facilities has been reduced from 62 to 27, after grouping related facilities.
Now, we will add these features to `df` and if a facility feature is listed in an instance's `popular_facilities` then it will be set to 1 and if not to 0

In [31]:
for facility in facility_features:
    f = facility.replace('_',' ')
    df[facility] = [1 if f in x else 0 for x in df['popular_facilities']]
df

Unnamed: 0,title,prefecture,review_score,review_count,popular_facilities,price,checkin_month,checkin_day,checkout_month,checkout_day,restaurant,wifi,pool,breakfast,airport_shuttle,beach,parking,non-smoking_rooms,24-hour_front_desk,laundry,garden,heating,daily_housekeeping,facilities_for_disabled_guests,family_rooms,tea/coffee_maker_in_all_rooms,bar,spa,fitness_center,room_service,baggage_storage,air_conditioning,terrace,hot_spring_bath,elevator,designated_smoking_area,skiing
0,Prince Smart Inn Kyoto Sanjo,Kyoto,8.70,3505.00,"non-smoking rooms,restaurant,free wifi,24-hour...",40833.00,12,1,12,6,1,1,0,1,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Hotel Japanesque Kyoto Station ZEQUU ANNEX,Kyoto,8.50,457.00,"non-smoking rooms,free wifi",58047.00,10,6,10,11,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Ocean View Hiromi SPA Hotel温泉と絶景の宿,Shizuoka,8.40,281.00,"non-smoking rooms,free parking,restaurant,faci...",64080.00,10,6,10,11,1,1,0,1,0,0,1,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0
3,Chisun Grand Takayama,Gifu,8.70,3496.00,"non-smoking rooms,spa,private parking,free wif...",54276.00,12,1,12,6,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,Kyoto Pleasant Hotel,Kyoto,8.40,153.00,"non-smoking rooms,free wifi,tea/coffee maker i...",51509.00,10,6,10,11,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5927,APA Hotel Fukushima Ekimae,Fukushima,8.00,1776.00,"non-smoking rooms,restaurant,facilities for di...",18127.00,7,28,8,2,1,1,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
5928,Hotel Route-Inn Kamisu,Ibaraki,8.30,801.00,"non-smoking rooms,free parking,2 restaurants,f...",21465.00,7,21,7,26,1,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5929,Comfort Hotel Hamamatsu,Shizuoka,7.80,1743.00,"non-smoking rooms,parking,free wifi,good break...",15971.00,7,14,7,19,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5930,Hotel Route-Inn Fujieda-Eki Kita,Shizuoka,8.10,210.00,"non-smoking rooms,free parking,restaurant,free...",19850.00,7,14,7,19,1,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


The `popular_facilities` can now be dropped.

In [32]:
df.drop('popular_facilities', axis=1, inplace=True)

# move price to last column
column = df.pop('price')
df['price'] = column

In [33]:
df.head()

Unnamed: 0,title,prefecture,review_score,review_count,checkin_month,checkin_day,checkout_month,checkout_day,restaurant,wifi,pool,breakfast,airport_shuttle,beach,parking,non-smoking_rooms,24-hour_front_desk,laundry,garden,heating,daily_housekeeping,facilities_for_disabled_guests,family_rooms,tea/coffee_maker_in_all_rooms,bar,spa,fitness_center,room_service,baggage_storage,air_conditioning,terrace,hot_spring_bath,elevator,designated_smoking_area,skiing,price
0,Prince Smart Inn Kyoto Sanjo,Kyoto,8.7,3505.0,12,1,12,6,1,1,0,1,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,40833.0
1,Hotel Japanesque Kyoto Station ZEQUU ANNEX,Kyoto,8.5,457.0,10,6,10,11,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,58047.0
2,Ocean View Hiromi SPA Hotel温泉と絶景の宿,Shizuoka,8.4,281.0,10,6,10,11,1,1,0,1,0,0,1,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,64080.0
3,Chisun Grand Takayama,Gifu,8.7,3496.0,12,1,12,6,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,54276.0
4,Kyoto Pleasant Hotel,Kyoto,8.4,153.0,10,6,10,11,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,51509.0


In [34]:
df.describe()

Unnamed: 0,review_score,review_count,checkin_month,checkin_day,checkout_month,checkout_day,restaurant,wifi,pool,breakfast,airport_shuttle,beach,parking,non-smoking_rooms,24-hour_front_desk,laundry,garden,heating,daily_housekeeping,facilities_for_disabled_guests,family_rooms,tea/coffee_maker_in_all_rooms,bar,spa,fitness_center,room_service,baggage_storage,air_conditioning,terrace,hot_spring_bath,elevator,designated_smoking_area,skiing,price
count,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0
mean,8.3,1531.31,9.48,14.25,9.5,15.58,0.6,0.88,0.07,0.84,0.03,0.02,0.75,0.97,0.25,0.14,0.01,0.16,0.07,0.29,0.28,0.15,0.16,0.11,0.09,0.08,0.07,0.05,0.02,0.02,0.13,0.03,0.01,36211.91
std,0.59,1750.94,1.75,8.9,1.86,8.85,0.49,0.32,0.26,0.37,0.18,0.13,0.44,0.17,0.43,0.35,0.09,0.36,0.26,0.45,0.45,0.36,0.37,0.31,0.29,0.27,0.26,0.21,0.14,0.16,0.33,0.16,0.07,27376.53
min,5.7,1.0,7.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8341.0
25%,7.9,397.0,8.0,7.0,8.0,8.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20999.0
50%,8.4,1006.0,9.0,14.0,10.0,15.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27998.0
75%,8.7,2071.0,11.0,22.0,11.0,23.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40791.0
max,10.0,23095.0,12.0,29.0,12.0,30.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,592545.0


`review_score` and `price` have large standard deviations. Outliers need to be addressed.

### Feature Selection

Univariance selection will be performed to determine how significant are the current features for predicting price. But first, categorical values need to be encoded.

In [35]:
df_copy = df.copy()
df_copy.drop('title', axis=1, inplace=True)

In [36]:
encoder = TargetEncoder(categories='auto', target_type='continuous', smooth='auto', cv=30, random_state=42)
df_copy['prefecture'] = encoder.fit_transform(df_copy[['prefecture']], df_copy['price'])

`prefecture` has been encoded into numerical values. TargetEncoder encodes categorical values based on the mean of the target variable.

In [37]:
df_copy.head(5)

Unnamed: 0,prefecture,review_score,review_count,checkin_month,checkin_day,checkout_month,checkout_day,restaurant,wifi,pool,breakfast,airport_shuttle,beach,parking,non-smoking_rooms,24-hour_front_desk,laundry,garden,heating,daily_housekeeping,facilities_for_disabled_guests,family_rooms,tea/coffee_maker_in_all_rooms,bar,spa,fitness_center,room_service,baggage_storage,air_conditioning,terrace,hot_spring_bath,elevator,designated_smoking_area,skiing,price
0,46954.61,8.7,3505.0,12,1,12,6,1,1,0,1,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,40833.0
1,47062.16,8.5,457.0,10,6,10,11,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,58047.0
2,28455.66,8.4,281.0,10,6,10,11,1,1,0,1,0,0,1,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,64080.0
3,29669.58,8.7,3496.0,12,1,12,6,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,54276.0
4,46948.37,8.4,153.0,10,6,10,11,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,51509.0


Now that all features are numerical and have been adjusted, we can now perform feature selection.

In [38]:
X = df_copy.drop('price', axis=1)
y = df_copy['price']

In [39]:
best_features = SelectKBest(score_func=chi2, k='all')
fit = best_features.fit(X,y)

df_scores = pd.DataFrame(fit.scores_)
df_columns = pd.DataFrame(X.columns)

feature_scores = pd.concat([df_columns, df_scores], axis=1)
feature_scores.columns = ['Feature','Score']
print(feature_scores.sort_values(by='Score', ascending=False))

                           Feature       Score
0                       prefecture 16191537.02
2                     review_count 10260242.86
4                      checkin_day    21766.24
6                     checkout_day    19672.09
29                         terrace     5173.79
17                          garden     5169.37
12                           beach     5072.02
11                 airport_shuttle     5059.73
19              daily_housekeeping     4895.06
25                  fitness_center     4632.26
26                    room_service     4555.09
33                          skiing     4552.98
30                 hot_spring_bath     4536.66
27                 baggage_storage     4424.16
31                        elevator     4408.75
9                             pool     4391.05
16                         laundry     4303.81
28                air_conditioning     4222.36
23                             bar     4204.16
32         designated_smoking_area     4186.42
18           

The most significant features are `prefecture` and `review_count`. The least significant features include `breakfast`, `wifi`, `review_score`, and `non-smoking_rooms`, which suggests that these features can be dropped, considering they do not have significant contributions in determining the target variable.

In [40]:
df.drop(['breakfast','wifi','review_score','non-smoking_rooms'], axis=1, inplace=True)
df.head()

Unnamed: 0,title,prefecture,review_count,checkin_month,checkin_day,checkout_month,checkout_day,restaurant,pool,airport_shuttle,beach,parking,24-hour_front_desk,laundry,garden,heating,daily_housekeeping,facilities_for_disabled_guests,family_rooms,tea/coffee_maker_in_all_rooms,bar,spa,fitness_center,room_service,baggage_storage,air_conditioning,terrace,hot_spring_bath,elevator,designated_smoking_area,skiing,price
0,Prince Smart Inn Kyoto Sanjo,Kyoto,3505.0,12,1,12,6,1,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,40833.0
1,Hotel Japanesque Kyoto Station ZEQUU ANNEX,Kyoto,457.0,10,6,10,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,58047.0
2,Ocean View Hiromi SPA Hotel温泉と絶景の宿,Shizuoka,281.0,10,6,10,11,1,0,0,0,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,64080.0
3,Chisun Grand Takayama,Gifu,3496.0,12,1,12,6,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,54276.0
4,Kyoto Pleasant Hotel,Kyoto,153.0,10,6,10,11,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,51509.0


This is the end of data preprocessing. All columns have been broken down into smaller components, there are no more null values, and the data types of numerical values have been addressed. Feature selection was also performed to reduce dimensionality.

In [41]:
df.describe()

Unnamed: 0,review_count,checkin_month,checkin_day,checkout_month,checkout_day,restaurant,pool,airport_shuttle,beach,parking,24-hour_front_desk,laundry,garden,heating,daily_housekeeping,facilities_for_disabled_guests,family_rooms,tea/coffee_maker_in_all_rooms,bar,spa,fitness_center,room_service,baggage_storage,air_conditioning,terrace,hot_spring_bath,elevator,designated_smoking_area,skiing,price
count,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0,5931.0
mean,1531.31,9.48,14.25,9.5,15.58,0.6,0.07,0.03,0.02,0.75,0.25,0.14,0.01,0.16,0.07,0.29,0.28,0.15,0.16,0.11,0.09,0.08,0.07,0.05,0.02,0.02,0.13,0.03,0.01,36211.91
std,1750.94,1.75,8.9,1.86,8.85,0.49,0.26,0.18,0.13,0.44,0.43,0.35,0.09,0.36,0.26,0.45,0.45,0.36,0.37,0.31,0.29,0.27,0.26,0.21,0.14,0.16,0.33,0.16,0.07,27376.53
min,1.0,7.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8341.0
25%,397.0,8.0,7.0,8.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20999.0
50%,1006.0,9.0,14.0,10.0,15.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27998.0
75%,2071.0,11.0,22.0,11.0,23.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40791.0
max,23095.0,12.0,29.0,12.0,30.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,592545.0


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5931 entries, 0 to 5931
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   title                           5931 non-null   object 
 1   prefecture                      5931 non-null   object 
 2   review_count                    5931 non-null   float64
 3   checkin_month                   5931 non-null   int32  
 4   checkin_day                     5931 non-null   int32  
 5   checkout_month                  5931 non-null   int32  
 6   checkout_day                    5931 non-null   int32  
 7   restaurant                      5931 non-null   int64  
 8   pool                            5931 non-null   int64  
 9   airport_shuttle                 5931 non-null   int64  
 10  beach                           5931 non-null   int64  
 11  parking                         5931 non-null   int64  
 12  24-hour_front_desk              5931 no

In [43]:
df.isna().sum()

title                             0
prefecture                        0
review_count                      0
checkin_month                     0
checkin_day                       0
checkout_month                    0
checkout_day                      0
restaurant                        0
pool                              0
airport_shuttle                   0
beach                             0
parking                           0
24-hour_front_desk                0
laundry                           0
garden                            0
heating                           0
daily_housekeeping                0
facilities_for_disabled_guests    0
family_rooms                      0
tea/coffee_maker_in_all_rooms     0
bar                               0
spa                               0
fitness_center                    0
room_service                      0
baggage_storage                   0
air_conditioning                  0
terrace                           0
hot_spring_bath             

In [44]:
# Save data to csv
df.to_csv('cleaned_dataset.csv', index=False)
print("Cleaning complete. File saved as 'cleaned_dataset.csv'")

Cleaning complete. File saved as 'cleaned_dataset.csv'
