In [1]:
import pandas as pd
import numpy as np

In [2]:
hotel_df = pd.read_csv('./data/google_hotel_data_raw_v2.csv')

In [3]:
hotel_df.head()

Unnamed: 0,Hotel_Name,Hotel_Rating,City,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Hotel_Price
0,Crowne Plaza Kochi,4.6,kochi,5-star hotel,Free breakfast,Free Wi-Fi,Free parking,Outdoor pool,Hot tub,Air conditioning,Fitness center,Spa,"₹8,854"
1,Trident Hotel Cochin,4.5,kochi,5-star hotel,Free breakfast,Wi-Fi,Free parking,Outdoor pool,Air conditioning,Fitness center,Spa,Restaurant,"₹6,441"
2,The Galaxy Suites,3.8,kochi,Apartment,Sleeps 10,Free parking,Free Wi-Fi,No air conditioning,No airport shuttle,No beach access,No elevator,No fireplace,₹831
3,The Renai cochin,4.2,kochi,4-star hotel,Free breakfast,Free Wi-Fi,Free parking,Outdoor pool,Air conditioning,Fitness center,Spa,Bar,"₹2,768"
4,Ramada by Wyndham Kochi,4.5,kochi,5-star hotel,Breakfast,Free Wi-Fi,Free parking,Outdoor pool,Air conditioning,Fitness center,Spa,Bar,"₹8,938"


In [4]:
hotel_df.isna().sum()

Hotel_Name        0
Hotel_Rating     50
City              0
Feature_1        72
Feature_2        78
Feature_3        78
Feature_4        82
Feature_5        88
Feature_6       109
Feature_7       140
Feature_8       200
Feature_9       293
Hotel_Price      71
dtype: int64

In [5]:
# Remove the rows with missing hotel price values

hotel_df = hotel_df.dropna(subset=['Hotel_Price'])

In [6]:
# Hotel Price is a string (with rupee sign and comma), converting it to float

hotel_df['Hotel_Price'] = hotel_df['Hotel_Price'].str.replace('₹', '').str.replace(',', '').astype(float)
hotel_df.head()

Unnamed: 0,Hotel_Name,Hotel_Rating,City,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Hotel_Price
0,Crowne Plaza Kochi,4.6,kochi,5-star hotel,Free breakfast,Free Wi-Fi,Free parking,Outdoor pool,Hot tub,Air conditioning,Fitness center,Spa,8854.0
1,Trident Hotel Cochin,4.5,kochi,5-star hotel,Free breakfast,Wi-Fi,Free parking,Outdoor pool,Air conditioning,Fitness center,Spa,Restaurant,6441.0
2,The Galaxy Suites,3.8,kochi,Apartment,Sleeps 10,Free parking,Free Wi-Fi,No air conditioning,No airport shuttle,No beach access,No elevator,No fireplace,831.0
3,The Renai cochin,4.2,kochi,4-star hotel,Free breakfast,Free Wi-Fi,Free parking,Outdoor pool,Air conditioning,Fitness center,Spa,Bar,2768.0
4,Ramada by Wyndham Kochi,4.5,kochi,5-star hotel,Breakfast,Free Wi-Fi,Free parking,Outdoor pool,Air conditioning,Fitness center,Spa,Bar,8938.0


In [7]:
hotel_features = ['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9']

hotel_df[hotel_features] = hotel_df[hotel_features].fillna(0)

In [8]:
# removing the NAs from ratings column

hotel_df_remove_rating = hotel_df.dropna(subset=['Hotel_Rating'])
print(hotel_df_remove_rating.shape)
print(hotel_df_remove_rating.isna().sum())

(1101, 13)
Hotel_Name      0
Hotel_Rating    0
City            0
Feature_1       0
Feature_2       0
Feature_3       0
Feature_4       0
Feature_5       0
Feature_6       0
Feature_7       0
Feature_8       0
Feature_9       0
Hotel_Price     0
dtype: int64


In [9]:
features = set()

for x in hotel_features:
    features.update(hotel_df_remove_rating[x].unique())

for feature in features:
    print(feature)

0
Sleeps 12
2 bathrooms
Balcony
Air conditioning
1 bed
No airport shuttle
5-star hotel
Fireplace
Wi-Fi
Kitchen in rooms
Apartment
Smoke-free property
Spa
Sleeps 6
Beach access
Kid-friendly
Bar
Pools
Sleeps 4
Parking
Sleeps 8
Sleeps 10
Sleeps 2
1 bathroom
3-star hotel
Sleeps 13
Smoke-free
Fitness center
No fitness center
Crib
Sleeps 17
Paid parking
House
Wi-Fi ($)
4-star hotel
Parking ($)
Sleeps 9
12 bedrooms
6 bedrooms
3 bathrooms
Outdoor pool
5 bathrooms
28 bedrooms
Kitchen in some rooms
Pool
4 bedrooms
Restaurant
12 bathrooms
Not pet-friendly
No air conditioning
Hot tub
Not wheelchair accessible
1 bedroom
Sleeps 24
Sleeps 3
Villa
Free parking
Sleeps 7
No elevator
1-star hotel
No ironing board
2 bedrooms
Heating
Free Wi-Fi
No crib
Kitchen
Pet-friendly
7 bedrooms
4 bathrooms
Not smoke-free
Golf
Sleeps 11
7 bathrooms
Sleeps 5
No beach access
No fireplace
2 beds
Free breakfast
Sleeps 28
Accessible
Breakfast
Breakfast ($)
Bungalow
Full-service laundry
Elevator
Business center
Wheelchair a

In [10]:
# cleaning the feature about 'Pools'

# List of values to replace
to_replace = ['Pools', 'Outdoor pool', 'Indoor pool']

replacement = 'Pool'

for feature in hotel_features:
    hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)


In [11]:
# cleaning the feature about 'parking'

# List of values to replace
to_replace = ['Parking ($)', 'Parking']

replacement = 'Paid parking'

for feature in hotel_features:
    hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)


In [12]:
# cleaning the feature about 'kitchen'

# List of values to replace
to_replace = ['Kitchen in rooms', 'Kitchen in some rooms']

replacement = 'Kitchen'

for feature in hotel_features:
    hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)


In [13]:
# cleaning the feature about 'smoking'

# List of values to replace
to_replace = ['Smoke-free property']

replacement = 'Smoke-free'

for feature in hotel_features:
    hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)


In [14]:
# cleaning the feature about 'breakfast'

# List of values to replace
to_replace = ['Breakfast ($)']

replacement = 'Breakfast'

for feature in hotel_features:
    hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)


In [15]:
# cleaning the feature about 'wifi'

# List of values to replace
to_replace = ['Wi-Fi ($)']

replacement = 'Wi-Fi'

for feature in hotel_features:
    hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)


In [23]:
# Dataset for kaggle
hotel_df_remove_rating.to_csv('./data/google_hotel_data_clean_v2.csv', index=False)