In [None]:
import pandas as pd
import re

# Load the raw hotel listing; the file is decoded as ISO-8859-1.
hotels_df = pd.read_csv('Hotels.csv', encoding='ISO-8859-1')

print("Columns before dropping null columns:")
print(hotels_df.columns.tolist())

# Discard 'room_type' and 'bed_type'. errors='ignore' skips whichever of the
# two is not present, so the cell still runs on a file that lacks them.
columns_to_drop = ['room_type', 'bed_type']
hotels_df = hotels_df.drop(columns=columns_to_drop, errors='ignore')

# Confirm which columns remain, then preview the data.
print("\nColumns after dropping specified columns:")
print(hotels_df.columns.tolist())

print("\nFirst few rows after dropping columns:")
print(hotels_df.head())

In [None]:
# Work on a string view of Reviews_count so the text label can be stripped.
reviews_text = hotels_df['Reviews_count'].astype(str)

# Remove the label that follows the number. Matching "Review" as well as
# "Reviews", in any letter case, keeps a value such as "1 Review" from staying
# non-numeric (the numeric conversion that follows would turn it into NaN).
reviews_text = reviews_text.str.replace(r'\s*reviews?\b', '', case=False, regex=True)

# Drop thousands separators ("1,234" -> "1234") and surrounding whitespace for
# the same reason.
# NOTE(review): assumes a comma in this column is only ever a thousands
# separator — confirm against Hotels.csv.
reviews_text = reviews_text.str.replace(',', '', regex=False).str.strip()

# The column stays text here; it is converted to a number in a later step.
hotels_df['Reviews_count'] = reviews_text

# Verify the changes
print(hotels_df[['Reviews_count']].head())


In [None]:
# Parse Reviews_count as numbers; entries that cannot be parsed become NaN.
reviews_numeric = pd.to_numeric(hotels_df['Reviews_count'], errors='coerce')

# Replace those NaNs with 0 and store the column as integers.
hotels_df['Reviews_count'] = reviews_numeric.fillna(0).astype(int)

print(hotels_df[['Reviews_count']].head())

print()

In [None]:
# Treat the rate as text so the currency formatting can be stripped.
rate_text = hotels_df['AverageRoomRateUSD'].astype(str)

# Remove the dollar sign and thousands separators, then surrounding whitespace.
# Without the comma removal a value such as "$1,250" fails numeric parsing and
# is replaced by 0 below.
# NOTE(review): assumes a comma in this column is a thousands separator, not a
# decimal mark — confirm against Hotels.csv.
for symbol in ('$', ','):
    rate_text = rate_text.str.replace(symbol, '', regex=False)
rate_text = rate_text.str.strip()

# Parse to a number; anything still non-numeric becomes NaN and is set to 0.
rate_numeric = pd.to_numeric(rate_text, errors='coerce').fillna(0)

# Store as integer. astype(int) discards any fractional part rather than
# rounding, so cents are dropped.
hotels_df['AverageRoomRateUSD'] = rate_numeric.astype(int)


print(hotels_df[['AverageRoomRateUSD']].head())



In [None]:
# Strip the literal text 'km' from DistanceToAirport (after casting to string)
# and parse what is left; unparseable entries become NaN.
airport_text = hotels_df['DistanceToAirport'].astype(str).str.replace('km', '', regex=False)
hotels_df['DistanceToAirport'] = pd.to_numeric(airport_text, errors='coerce')

# distance_to_center is parsed as-is, with the same NaN-on-failure behaviour.
hotels_df['distance_to_center'] = pd.to_numeric(hotels_df['distance_to_center'], errors='coerce')

# Missing or unparseable distances in either column are replaced with 0.
distance_columns = ['DistanceToAirport', 'distance_to_center']
hotels_df[distance_columns] = hotels_df[distance_columns].fillna(0)


print(hotels_df[distance_columns].head())





In [None]:


# List the current columns so the presence of 'HotelAmenities' can be checked by eye.
print("Columns in DataFrame:", hotels_df.columns)

# Amenity labels to look for in each hotel's amenities text.
frequent_amenities = [
    'Free Wi-Fi',
    'Spa',
    'Fitness Center',
    'Restaurant',
    'Bar',
    'Conference Facilities',
    'Non-Smoking Rooms',
    'Sauna',
    'Business Center',
    'Outdoor Pool',
    'Gym',
    'Indoor swimming pool',
]

# Function to normalize amenity strings
def normalize_amenities(amenities):
    """Return an amenities string lower-cased with whitespace collapsed.

    Args:
        amenities: Raw amenities text. Non-string values (for example the
            float NaN pandas uses for a missing cell, or None) are accepted.

    Returns:
        The lower-cased text with every run of whitespace replaced by a
        single space and leading/trailing whitespace removed, or an empty
        string when ``amenities`` is not a string.
    """
    # A missing cell arrives as a float NaN, which has no .lower(); treat any
    # non-string input as "no amenities listed" instead of raising.
    if not isinstance(amenities, str):
        return ''
    return re.sub(r'\s+', ' ', amenities.lower()).strip()

# Checking if 'HotelAmenities' exists before proceeding
if 'HotelAmenities' in hotels_df.columns:
    # Step 1: Normalize the amenities text. Missing cells are replaced with an
    # empty string first so normalize_amenities is never handed a float NaN.
    hotels_df['NormalizedAmenities'] = (
        hotels_df['HotelAmenities'].fillna('').apply(normalize_amenities)
    )

    # Step 2: One 0/1 column per amenity, set to 1 when the lower-cased label
    # occurs as a literal substring of the normalized text.
    # NOTE(review): plain substring matching means a short label such as 'bar'
    # also matches inside longer words (e.g. "minibar") — confirm this is
    # acceptable for the data.
    for amenity in frequent_amenities:
        hotels_df[amenity] = (
            hotels_df['NormalizedAmenities']
            .str.contains(amenity.lower(), regex=False)
            .astype(int)
        )

    # Step 3: Drop the raw and the intermediate text columns.
    hotels_df = hotels_df.drop(columns=['HotelAmenities', 'NormalizedAmenities'])

    # Preview the result rather than printing every row.
    print(hotels_df.head())
else:
    print("The 'HotelAmenities' column does not exist in the DataFrame.")


In [10]:
# Cast all of the columns below in a single astype call; every column not
# named in the mapping keeps its current dtype.
hotels_df = hotels_df.astype({
    # HotelID and HotelStarRating are stored as text.
    'HotelID': str,
    'HotelStarRating': str,
    # Counts and rates are stored as integers.
    'Reviews_count': int,
    'AverageRoomRateUSD': int,
    # assumes LastRenovationDate already holds plain year numbers — TODO confirm
    'LastRenovationDate': int,
    # Casting to int drops any fractional part of the distance.
    'DistanceToAirport': int,
})


In [None]:
# Remove the CheckInTime and CheckOutTime columns from the frame.
check_time_columns = ['CheckInTime', 'CheckOutTime']
hotels_df = hotels_df.drop(columns=check_time_columns)

# Show the remaining columns to confirm the removal.
print(hotels_df.columns)


In [None]:

# Show the dtype of every column in the frame.
column_dtypes = hotels_df.dtypes
print(column_dtypes)

In [12]:
# Write the frame to CSV; index=False leaves the row index out of the file.
cleaned_csv_path = 'Cleaned_Hotels_data.csv'
hotels_df.to_csv(cleaned_csv_path, index=False)