In [172]:
import pandas as pd
import numpy as np

In [173]:
# Load the raw CSV (path is relative to the notebook's working directory).
df = pd.read_csv("BookingScraperAM.csv")

### Split features and target (Price AMD).

In [174]:
# `y` is the target column; `X` is every other column except the ones listed
# below, which are left out of the feature set.
# NOTE(review): "Price AMD" is not converted to a numeric dtype here; `y` keeps
# whatever dtype read_csv inferred - confirm that is numeric.
y = df["Price AMD"]
X = df.drop(
    columns=[
        "Price AMD",
        "Languages spoken",
        "Name Of Hotel",
        "Outdoor swimming pool",
        "Spa",
        "Activities",
    ]
)

### Handle missing values.

In [175]:
# Impute gaps column by column: a missing "Rating" gets the column mean, the
# text amenity columns get the "Doesn't have" placeholder.
# NOTE(review): "Activities" is one of the columns dropped when X was built,
# so its entry below appears to match nothing - confirm and remove.
X = X.fillna(
    {
        "Rating": X["Rating"].mean(),
        "Activities": "Doesn't have",
        "Parking": "Doesn't have",
        "Wellness": "Doesn't have",
    }
)

In [176]:
# X.Location - one hot encoder
# X.Breakfast - label encoder
# X.Room - LLM encoder
# X.Outdoors - LLM encoder
# X.Ski - LLM
# X['Food & Drink'] - LLM
# X['Internet'] - one hot encoder, mapping
# X['Parking'] - LLM
# X['Wellness'] - LLM


In [177]:
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
# # X = np.array(ct.fit_transform(X))

### Encoding Location

In [178]:
# One-hot encode "Location", then cast every bool-typed column to int so the
# indicator columns hold 0/1 instead of False/True.
X = pd.get_dummies(X, columns=["Location"])
X = X.astype({name: int for name, kind in X.dtypes.items() if kind == "bool"})

### Encoding Breakfast

In [179]:
# Binary flag: 0 when the value is exactly 'Not Included', 1 for anything else.
X["Breakfast"] = X["Breakfast"].ne("Not Included").astype("int64")

### Encoding Room

In [180]:
# Strip the literal word "Room" from every room description (plain substring
# match, not a regex).
X["Room"] = X["Room"].str.replace("Room", "", regex=False)

In [181]:
def splitter(arr, item = ' ') -> pd.DataFrame:
    """Multi-hot encode separator-delimited labels, one output row per input row.

    Parameters
    ----------
    arr : iterable of str, pandas.Series or pandas.DataFrame
        * Series / list: each element is one row's text; it is cut on ``item``.
        * DataFrame: read as a 0/1 indicator table (for example the result of
          an earlier ``splitter`` call). For every row, the names of the
          columns holding a truthy value are that row's labels, and each of
          those names is cut on ``item``.
        Missing entries (``None`` / NaN) contribute no tokens.
    item : str, default ' '
        Separator handed to ``str.split``.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 table with one column per distinct whitespace-stripped
        token, columns in sorted order (the empty token ``''`` is kept). It
        has as many rows as ``arr`` and reuses the index of a Series or
        DataFrame input, so it lines up when concatenated column-wise with
        the frame the input came from.

    Raises
    ------
    AttributeError
        If a non-missing entry (or a column name of a DataFrame input) is not
        a string.
    """
    if isinstance(arr, pd.DataFrame):
        # Iterating a DataFrame walks its column names, not its rows, which
        # would produce one output row per *column*. Recover every row's
        # labels from the columns that are switched on instead.
        index = arr.index
        names = list(arr.columns)
        switched_on = arr.to_numpy().astype(bool)
        labels_per_row = [
            [name for name, flag in zip(names, flags) if flag]
            for flags in switched_on
        ]
    else:
        index = arr.index if isinstance(arr, pd.Series) else None
        labels_per_row = [
            [] if pd.api.types.is_scalar(value) and pd.isna(value) else [value]
            for value in arr
        ]

    # Cut every label on the separator and trim the pieces; a set per row
    # because only presence matters, not multiplicity.
    tokens_per_row = [
        {piece.strip() for label in labels for piece in label.split(item)}
        for labels in labels_per_row
    ]
    classes = sorted(set().union(*tokens_per_row))
    column_of = {token: position for position, token in enumerate(classes)}

    # Built with numpy directly so the helper needs nothing beyond the
    # notebook's top-level imports.
    encoded = np.zeros((len(tokens_per_row), len(classes)), dtype=int)
    for row, tokens in enumerate(tokens_per_row):
        for token in tokens:
            encoded[row, column_of[token]] = 1
    return pd.DataFrame(encoded, columns=classes, index=index)

In [182]:
# Tokenise the room descriptions on single spaces, then discard the filler
# tokens listed below.
encoded_data = splitter(X["Room"], " ").drop(
    columns=["", "(2", "+", "-", "1", "and", "in", "or", "room,", "with", "without"]
)

In [183]:
# Replace the raw "Room" text column with its token columns; every NaN left
# anywhere in X after the column-wise concat becomes 0.
X = pd.concat([X.drop(columns=["Room"]), encoded_data], axis=1).fillna(0)

### Encoding Outdoors

In [184]:
# Two splitter passes over "Outdoors": first on ' | ', then on single spaces,
# followed by dropping two filler tokens.
# NOTE(review): the second pass hands splitter a DataFrame, not a Series -
# confirm it encodes that input row by row.
# NOTE(review): 'have' is not dropped here, unlike the Ski, Food & Drink,
# Parking and Wellness cells - confirm whether that is intentional.
encoded_data1 = splitter(splitter(X["Outdoors"], " | "), " ").drop(
    columns=["Doesn't", "Outdoor"]
)

In [185]:
# Replace the raw "Outdoors" text column with its token columns; every NaN
# left anywhere in X after the column-wise concat becomes 0.
X = pd.concat([X.drop(columns=["Outdoors"]), encoded_data1], axis=1).fillna(0)

### Encoding Ski

In [186]:
# Show the distinct raw values of the Ski column.
X.Ski.unique()

array(['Ski-to-door access | Ski equipment hire on site | Ski storage',
       "Doesn't have",
       'Ski pass vendor | Ski equipment hire on site | Ski storage (Additional charge)',
       'Ski storage', 'Ski school (Additional charge)',
       'Ski pass vendor | Ski equipment hire on site',
       'Ski equipment hire on site | Ski storage',
       'Ski-to-door access | Ski equipment hire on site | Ski school (Additional charge) | Ski storage',
       'Ski-to-door access | Ski storage',
       'Ski pass vendor | Ski equipment hire on site | Ski school (Additional charge) | Ski storage',
       'Ski equipment hire on site | Ski school | Ski storage',
       'Ski equipment hire on site | Ski storage (Additional charge)',
       'Ski-to-door access | Ski pass vendor | Ski equipment hire on site | Ski school (Additional charge) | Ski storage'],
      dtype=object)

In [187]:
# Two splitter passes over "Ski": first on ' | ', then on single spaces,
# followed by dropping the filler tokens listed below.
# NOTE(review): the second pass hands splitter a DataFrame, not a Series -
# confirm it encodes that input row by row.
encoded_data2 = splitter(splitter(X["Ski"], " | "), " ").drop(
    columns=["Doesn't", "(Additional", "have", "vendor"]
)

In [188]:
# Replace the raw "Ski" text column with its token columns; every NaN left
# anywhere in X after the column-wise concat becomes 0.
X = pd.concat([X.drop(columns=["Ski"]), encoded_data2], axis=1).fillna(0)

### Encoding Food & Drink

In [189]:
# Four splitter passes over "Food & Drink": ' | ', then ' ', then '(' and ')'
# to peel off parentheses, followed by dropping the filler tokens listed below.
# NOTE(review): every pass after the first hands splitter a DataFrame, not a
# Series - confirm it encodes that input row by row.
encoded_data3 = splitter(X["Food & Drink"], " | ")
encoded_data3 = splitter(splitter(splitter(encoded_data3, " "), "("), ")")
encoded_data3 = encoded_data3.drop(
    columns=["", "Doesn't", "have", "in", "on", "the"]
)

In [190]:
# Show the distinct token column names of encoded_data3.
encoded_data3.columns.unique()

Index(['Additional', 'Bar', 'Breakfast', 'Coffee', 'Fruits', 'Grocery', 'Kid',
       'Kid-friendly', 'Minibar', 'Packed', 'Restaurant', 'Room', 'Snack',
       'Special', 'Tea/Coffee', 'Vending', 'Wine/champagne', 'bar', 'buffet',
       'charge', 'deliveries', 'diet', 'drinks', 'house', 'lunches', 'machine',
       'maker', 'meals', 'menus', 'request', 'room', 'service', 'site',
       'snacks'],
      dtype='object')

In [191]:
# Replace the raw "Food & Drink" text column with its token columns; every NaN
# left anywhere in X after the column-wise concat becomes 0.
X = pd.concat([X.drop(columns=["Food & Drink"]), encoded_data3], axis=1).fillna(0)

### Encoding Internet

In [192]:
# Show the distinct raw values of the Internet column.
X.Internet.unique()

array(['WiFi is available in public areas and is free of charge.',
       'WiFi is available in all areas and is free of charge.',
       'WiFi is available in the hotel rooms and is free of charge.',
       'WiFi is available in some hotel rooms and is free of charge.',
       'No internet access available.',
       'WiFi is available in the hotel rooms and charges are applicable.'],
      dtype=object)

In [193]:
# data = ['No internet access available.', 'WiFi is available in the hotel rooms and charges are applicable.', 
#         'WiFi is available in public areas and is free of charge.', 'WiFi is available in some hotel rooms and is free of charge.',
#         'WiFi is available in the hotel rooms and is free of charge.', 'WiFi is available in all areas and is free of charge.']  # Your list

# Integer-code the Internet descriptions: each distinct text gets the position
# at which it first appears in the column (0, 1, 2, ...).
# NOTE(review): the codes therefore follow row order, not any ranking of the
# WiFi options - confirm that an arbitrary ordering is acceptable downstream.
unique_labels = [*dict.fromkeys(X["Internet"])]
label_mapping = dict(zip(unique_labels, range(len(unique_labels))))

X["Internet"] = X["Internet"].map(label_mapping)

### Encoding Parking

In [205]:
# Show the distinct raw values of the Parking column.
X.Parking.unique()

array(['Accessible parking | Parking garage',
       'Accessible parking | Parking garage | Secured parking',
       'Accessible parking | Street parking', "Doesn't have",
       'Parking garage', 'Street parking | Secured parking',
       'Street parking',
       'Accessible parking | Street parking | Secured parking',
       'Electric vehicle charging station | Street parking',
       'Accessible parking',
       'Electric vehicle charging station | Street parking | Secured parking',
       'Electric vehicle charging station | Secured parking',
       'Secured parking', 'Parking garage | Secured parking',
       'Electric vehicle charging station | Parking garage | Secured parking',
       'Accessible parking | Electric vehicle charging station',
       'Accessible parking | Electric vehicle charging station | Street parking | Secured parking',
       'Accessible parking | Parking garage | Street parking | Secured parking',
       'Accessible parking | Electric vehicle charging stati

In [208]:
# Two splitter passes over "Parking": first on ' | ', then on single spaces,
# followed by dropping the filler tokens listed below.
# NOTE(review): the second pass hands splitter a DataFrame, not a Series -
# confirm it encodes that input row by row.
encoded_data4 = splitter(splitter(X["Parking"], " | "), " ").drop(
    columns=["Doesn't", "have", "Parking", "parking", "vehicle"]
)

In [209]:
# Show the token column names of encoded_data4.
encoded_data4.columns

Index(['Accessible', 'Electric', 'Secured', 'Street', 'charging', 'garage',
       'station'],
      dtype='object')

In [210]:
# Replace the raw "Parking" text column with its token columns; every NaN left
# anywhere in X after the column-wise concat becomes 0.
X = pd.concat([X.drop(columns=["Parking"]), encoded_data4], axis=1).fillna(0)

### Encoding Wellness

In [213]:
# Show the distinct raw values of the Wellness column.
X.Wellness.unique()

array(['Sun loungers or beach chairs | Hot tub/Jacuzzi (Additional charge) | Spa and wellness centre (Additional charge) | Fitness centre | Sauna',
       "Doesn't have",
       'Fitness/spa locker rooms | Personal trainer | Fitness classes | Fitness | Sun loungers or beach chairs | Hammam (Additional charge) | Fitness centre | Sauna (Additional charge)',
       'Full body massage | Hand massage | Head massage | Couples massage | Foot massage | Neck massage | Back massage | Massage | Sauna (Additional charge)',
       'Fitness/spa locker rooms | Personal trainer | Fitness classes | Yoga classes | Fitness | Full body massage (Additional charge) | Hand massage (Additional charge) | Head massage (Additional charge) | Couples massage (Additional charge) | Foot massage (Additional charge) | Neck massage (Additional charge) | Back massage (Additional charge) | Spa/wellness packages | Foot bath | Spa lounge/relaxation area | Steam room | Spa facilities | Light therapy | Body wrap | Body scrub

In [223]:
# Four splitter passes over "Wellness": ' | ', then ' ', then '(' and ')' to
# peel off parentheses, followed by dropping the filler tokens listed below.
# NOTE(review): every pass after the first hands splitter a DataFrame, not a
# Series - confirm it encodes that input row by row.
encoded_data5 = splitter(X["Wellness"], " | ")
encoded_data5 = splitter(splitter(splitter(encoded_data5, " "), "("), ")")
encoded_data5 = encoded_data5.drop(
    columns=["", "Doesn't", "have", "and", "chair", "area", "or", "up"]
)

In [224]:
# Show the distinct token column names of encoded_data5.
encoded_data5.columns.unique()

Index(['Additional', 'Back', 'Bath', 'Beauty', 'Body', 'Couples', 'Facial',
       'Fitness', 'Fitness/spa', 'Foot', 'Full', 'Hair', 'Hammam', 'Hand',
       'Head', 'Hot', 'Light', 'Make', 'Manicure', 'Massage', 'Neck',
       'Open-air', 'Pedicure', 'Personal', 'Public', 'Sauna', 'Services',
       'Solarium', 'Spa', 'Spa/wellness', 'Steam', 'Sun', 'Temporarily',
       'Water', 'Waxing', 'Yoga', 'bath', 'beach', 'body', 'centre', 'chairs',
       'charge', 'classes', 'closed', 'colouring', 'cut', 'facilities',
       'locker', 'lounge/relaxation', 'loungers', 'massage', 'packages',
       'room', 'rooms', 'scrub', 'services', 'slide', 'spring', 'styling',
       'therapy', 'trainer', 'treatments', 'tub/Jacuzzi', 'umbrellas',
       'wellness', 'wrap'],
      dtype='object')

In [225]:
# Replace the raw "Wellness" text column with its token columns; every NaN
# left anywhere in X after the column-wise concat becomes 0.
X = pd.concat([X.drop(columns=["Wellness"]), encoded_data5], axis=1).fillna(0)

In [228]:
# Keep ratings to two decimal places.
X["Rating"] = X["Rating"].round(2)

In [232]:
# Write the encoded feature table to disk without the row index.
X.to_csv("Preprocessed_Hotel_data.csv", index=False)