In [1]:
import pandas as pd
import numpy as np

In [2]:
hotel_df = pd.read_csv('./data/google_hotel_data_raw.csv')

In [3]:
hotel_df.head()

Unnamed: 0,Hotel_Name,Hotel_Rating,City,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Hotel_Price
0,Trident Hotel Cochin,4.4,kochi,5-star hotel,Free breakfast,Wi-Fi,Free parking,Outdoor pool,Air conditioning,Fitness center,Spa,Restaurant,"₹7,246"
1,The Gateway Hotel Marine Drive Ernakulam,4.3,kochi,5-star hotel,Free breakfast,Free Wi-Fi,Free parking,Outdoor pool,Air conditioning,Fitness center,Spa,Restaurant,"₹9,440"
2,Ramada by Wyndham Kochi,4.5,kochi,5-star hotel,Breakfast,Free Wi-Fi,Free parking,Outdoor pool,Air conditioning,Fitness center,Spa,Bar,"₹8,964"
3,The Renai cochin,4.2,kochi,4-star hotel,Free breakfast,Free Wi-Fi,Free parking,Outdoor pool,Air conditioning,Fitness center,Spa,Bar,"₹2,768"
4,SpringField Billets Hotel,4.2,kochi,3-star hotel,Breakfast ($),Free Wi-Fi,Free parking,Kitchen in some rooms,Airport shuttle,Full-service laundry,Business center,Kid-friendly,"₹1,340"


In [4]:
hotel_df.isna().sum()

Hotel_Name        0
Hotel_Rating     44
City              0
Feature_1        67
Feature_2        70
Feature_3        71
Feature_4        76
Feature_5        82
Feature_6       109
Feature_7       146
Feature_8       199
Feature_9       282
Hotel_Price      79
dtype: int64

In [5]:
# Remove the rows with missing hotel price values

hotel_df = hotel_df.dropna(subset=['Hotel_Price'])

In [6]:
# Hotel Price is a string (with rupee sign and comma), converting it to float

hotel_df['Hotel_Price'] = hotel_df['Hotel_Price'].str.replace('₹', '').str.replace(',', '').astype(float)
hotel_df.head()

Unnamed: 0,Hotel_Name,Hotel_Rating,City,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Hotel_Price
0,Trident Hotel Cochin,4.4,kochi,5-star hotel,Free breakfast,Wi-Fi,Free parking,Outdoor pool,Air conditioning,Fitness center,Spa,Restaurant,7246.0
1,The Gateway Hotel Marine Drive Ernakulam,4.3,kochi,5-star hotel,Free breakfast,Free Wi-Fi,Free parking,Outdoor pool,Air conditioning,Fitness center,Spa,Restaurant,9440.0
2,Ramada by Wyndham Kochi,4.5,kochi,5-star hotel,Breakfast,Free Wi-Fi,Free parking,Outdoor pool,Air conditioning,Fitness center,Spa,Bar,8964.0
3,The Renai cochin,4.2,kochi,4-star hotel,Free breakfast,Free Wi-Fi,Free parking,Outdoor pool,Air conditioning,Fitness center,Spa,Bar,2768.0
4,SpringField Billets Hotel,4.2,kochi,3-star hotel,Breakfast ($),Free Wi-Fi,Free parking,Kitchen in some rooms,Airport shuttle,Full-service laundry,Business center,Kid-friendly,1340.0


In [7]:
hotel_features = ['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9']

hotel_df[hotel_features] = hotel_df[hotel_features].fillna(0)

In [8]:
# removing the NAs from ratings column

hotel_df_remove_rating = hotel_df.dropna(subset=['Hotel_Rating'])
print(hotel_df_remove_rating.shape)
print(hotel_df_remove_rating.isna().sum())

(1073, 13)
Hotel_Name      0
Hotel_Rating    0
City            0
Feature_1       0
Feature_2       0
Feature_3       0
Feature_4       0
Feature_5       0
Feature_6       0
Feature_7       0
Feature_8       0
Feature_9       0
Hotel_Price     0
dtype: int64


In [9]:
features = set()

for x in hotel_features:
    features.update(hotel_df_remove_rating[x].unique())

for feature in features:
    print(feature)

0
Not wheelchair accessible
5 bathrooms
6 bedrooms
No ironing board
Not pet-friendly
Elevator
1 bedroom
35 sq m
Beach access
Sleeps 3
2 bedrooms
Sleeps 20
Smoke-free property
Sleeps 7
5-star hotel
Breakfast ($)
Free parking
3 sq m
Kitchen in some rooms
Room service
No beach access
Restaurant
3 bedrooms
No air conditioning
Sleeps 16
House
2 bathrooms
Bungalow
Sleeps 5
Wheelchair accessible
4 bathrooms
Parking
2-star hotel
Accessible
Hot tub
1 bed
Kitchen in rooms
7 bathrooms
Sleeps 12
Pet-friendly
4 bedrooms
Kitchen
Villa
Parking ($)
28 bedrooms
Free breakfast
No airport shuttle
Free Wi-Fi
Sleeps 10
Breakfast
Full-service laundry
Air conditioning
Cable TV
Fitness center
Sleeps 9
Business center
Indoor pool
Paid parking
3-star hotel
Bar
Sleeps 6
Sleeps 17
Sleeps 11
8 bedrooms
Sleeps 2
1 bathroom
No elevator
No outdoor grill
1-star hotel
No fireplace
Kid-friendly
Wi-Fi
4 sq m
Sleeps 4
Spa
Airport shuttle
No fitness center
Sleeps 8
Outdoor pool
Fireplace
9 bathrooms
Apartment
46 sq m
Not s

In [10]:
# cleaning the feature about 'Pools'

# List of values to replace
to_replace = ['Pools', 'Outdoor pool', 'Indoor pool']

replacement = 'Pool'

for feature in hotel_features:
    hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)


In [11]:
# cleaning the feature about 'parking'

# List of values to replace
to_replace = ['Parking ($)', 'Parking']

replacement = 'Paid parking'

for feature in hotel_features:
    hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)


In [12]:
# cleaning the feature about 'kitchen'

# List of values to replace
to_replace = ['Kitchen in rooms', 'Kitchen in some rooms']

replacement = 'Kitchen'

for feature in hotel_features:
    hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)


In [13]:
# cleaning the feature about 'smoking'

# List of values to replace
to_replace = ['Smoke-free property']

replacement = 'Smoke-free'

for feature in hotel_features:
    hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)


In [14]:
# cleaning the feature about 'breakfast'

# List of values to replace
to_replace = ['Breakfast ($)']

replacement = 'Breakfast'

for feature in hotel_features:
    hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)


In [15]:
# cleaning the feature about 'wifi'

# List of values to replace
to_replace = ['Wi-Fi ($)']

replacement = 'Wi-Fi'

for feature in hotel_features:
    hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotel_df_remove_rating[feature] = hotel_df_remove_rating[feature].replace(to_replace, replacement)


In [None]:
hotel_df_remove_rating.to_csv('./data/google_hotel_data_clean_v1.csv', index=False)