### Feature Engineering

In [249]:
import pandas as pd

In [250]:
df = pd.read_csv('../data/processed/Airbnb_Data.csv')

In [251]:
df.columns

Index(['log_price', 'property_type', 'room_type', 'amenities', 'accommodates',
       'bathrooms', 'bed_type', 'cancellation_policy', 'cleaning_fee', 'city',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'instant_bookable', 'latitude', 'longitude', 'number_of_reviews',
       'review_scores_rating', 'bedrooms', 'beds'],
      dtype='object')

In [252]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   log_price               74111 non-null  float64
 1   property_type           74111 non-null  object 
 2   room_type               74111 non-null  object 
 3   amenities               74111 non-null  object 
 4   accommodates            74111 non-null  int64  
 5   bathrooms               74111 non-null  float64
 6   bed_type                74111 non-null  object 
 7   cancellation_policy     74111 non-null  object 
 8   cleaning_fee            74111 non-null  bool   
 9   city                    74111 non-null  object 
 10  host_has_profile_pic    74111 non-null  object 
 11  host_identity_verified  74111 non-null  object 
 12  host_response_rate      74111 non-null  float64
 13  instant_bookable        74111 non-null  bool   
 14  latitude                74111 non-null

## One-Hot Encoding 

In [253]:
df = pd.get_dummies(df, columns=['property_type'], drop_first=True)

In [254]:
df = pd.get_dummies(df, columns=['room_type'], drop_first=True)

In [255]:
df = pd.get_dummies(df, columns=['bed_type'], drop_first=True)

In [256]:
df = pd.get_dummies(df, columns=['cancellation_policy'], drop_first=True)

In [257]:
df = pd.get_dummies(df, columns=['city'], drop_first=True)

In [258]:
df.columns

Index(['log_price', 'amenities', 'accommodates', 'bathrooms', 'cleaning_fee',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'instant_bookable', 'latitude', 'longitude', 'number_of_reviews',
       'review_scores_rating', 'bedrooms', 'beds',
       'property_type_Bed & Breakfast', 'property_type_Boat',
       'property_type_Boutique hotel', 'property_type_Bungalow',
       'property_type_Cabin', 'property_type_Camper/RV',
       'property_type_Casa particular', 'property_type_Castle',
       'property_type_Cave', 'property_type_Chalet',
       'property_type_Condominium', 'property_type_Dorm',
       'property_type_Earth House', 'property_type_Guest suite',
       'property_type_Guesthouse', 'property_type_Hostel',
       'property_type_House', 'property_type_Hut', 'property_type_In-law',
       'property_type_Island', 'property_type_Lighthouse',
       'property_type_Loft', 'property_type_Other',
       'property_type_Parking Space', 'property_ty

## Label Enconding

In [259]:
# Pasar a 0 y 1 
df['host_has_profile_pic'] = df['host_has_profile_pic'].map({'t': 1, 'f': 0, '-1': -1})

In [260]:
df['host_identity_verified'] = df['host_identity_verified'].map({'t': 1, 'f': 0, '-1': -1})

In [261]:
df['cleaning_fee'] = df['cleaning_fee'].map({True: 1, False: 0})

In [262]:
df['instant_bookable'] = df['instant_bookable'].map({True: 1, False: 0})

In [263]:
print(df['host_has_profile_pic'].value_counts())
print(df['host_identity_verified'].value_counts())
print(df['cleaning_fee'].value_counts())
print(df['instant_bookable'].value_counts())

host_has_profile_pic
 1    73697
 0      226
-1      188
Name: count, dtype: int64
host_identity_verified
 1    49748
 0    24175
-1      188
Name: count, dtype: int64
cleaning_fee
1    54403
0    19708
Name: count, dtype: int64
instant_bookable
0    54660
1    19451
Name: count, dtype: int64


## Extracción de datos

In [264]:
import re
from sklearn.preprocessing import MultiLabelBinarizer
def limpiar_amenities(amenities):
    if pd.isna(amenities) or amenities.strip() == "":
        return ""
    
    # Eliminar llaves y comillas
    amenities = re.sub(r'[{}"]', '', amenities)

    # Reemplazar valores extraños
    amenities = re.sub(r'translation missing: en\.(hosting_amenity_\d+)', r'\1', amenities)

    # Eliminar dobles comas
    amenities = re.sub(r',\s*,', ',', amenities).strip(',')

    # Eliminar espacios extra
    amenities = re.sub(r'\s+', ' ', amenities)

    # Eliminar valores vacios
    amenities = ','.join([amenity for amenity in amenities.split(',') if amenity.strip() != ""])

    return amenities

In [265]:
df['clean_amenities'] = df['amenities'].apply(limpiar_amenities)

In [266]:
df['clean_amenities'].unique()

array(['Wireless Internet,Air conditioning,Kitchen,Heating,Family/kid friendly,Essentials,Hair dryer,Iron,hosting_amenity_50',
       'Wireless Internet,Air conditioning,Kitchen,Heating,Family/kid friendly,Washer,Dryer,Smoke detector,Fire extinguisher,Essentials,Shampoo,Hangers,Hair dryer,Iron,hosting_amenity_50',
       'TV,Cable TV,Wireless Internet,Air conditioning,Kitchen,Breakfast,Buzzer/wireless intercom,Heating,Family/kid friendly,Smoke detector,Carbon monoxide detector,Fire extinguisher,Essentials,Shampoo,Hangers,Hair dryer,Iron,Laptop friendly workspace,hosting_amenity_50',
       ...,
       'TV,Cable TV,Internet,Wireless Internet,Kitchen,Free parking on premises,Heating,Family/kid friendly,Washer,Dryer,Smoke detector,Essentials,Shampoo,Hangers,Hair dryer,hosting_amenity_50',
       'TV,Internet,Wireless Internet,Air conditioning,Kitchen,Gym,Elevator,Buzzer/wireless intercom,Heating,Family/kid friendly,Washer,Dryer,Smoke detector,Carbon monoxide detector,First aid kit,Essenti

In [267]:
# Contar las amenities
df["amenities_count"] = df["amenities"].apply(lambda x: len(x.split(',')))

In [268]:
# Convertir las cadenas de texto en listas de amenities
df['clean_amenities'] = df['clean_amenities'].apply(lambda x: x.split(','))


In [269]:
# Inicializar MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Aplicar el One-Hot Encoding
amenities_encoded = pd.DataFrame(mlb.fit_transform(df['clean_amenities']), columns=mlb.classes_)
amenities_encoded = amenities_encoded.loc[:, amenities_encoded.columns != '']
amenities_encoded.columns = amenities_encoded.columns.str.strip()

# Combinar los valores de ambas columnas, tomando el valor de una si está presente
amenities_encoded['Wide clearance to shower and toilet'] = amenities_encoded['Wide clearance to shower and toilet'].fillna(amenities_encoded['Wide clearance to shower & toilet'])

# Eliminar la columna redundante
amenities_encoded = amenities_encoded.drop(columns=['Wide clearance to shower & toilet'])

# Concatenar el DataFrame original con las nuevas columnas One-Hot encoded
#df = pd.concat([df, amenities_encoded], axis=1)

In [270]:
amenities_encoded.shape

(74111, 129)

In [276]:
amenities_encoded.columns

Index(['smooth pathway to front door', '24-hour check-in',
       'Accessible-height bed', 'Accessible-height toilet', 'Air conditioning',
       'Air purifier', 'BBQ grill', 'Baby bath', 'Baby monitor',
       'Babysitter recommendations',
       ...
       'Wheelchair accessible', 'Wide clearance to bed',
       'Wide clearance to shower and toilet', 'Wide doorway', 'Wide entryway',
       'Wide hallway clearance', 'Window guards', 'Wireless Internet',
       'hosting_amenity_49', 'hosting_amenity_50'],
      dtype='object', length=129)

In [310]:
# Suma de los amenities 'Accessible-height bed', 'Accessible-height toilet', 'Fixed grab bars for shower & toilet', 'Grab-rails for shower and toilet', 'Roll-in shower with chair', 'Wheelchair accessible', 'Wide clearance to bed', 'Wide clearance to shower and toilet', 'Wide doorway', 'Wide entryway', 'Step-free access', 'Wide hallway clearance'
df['accessible_amenities'] = amenities_encoded[['Accessible-height bed', 'Accessible-height toilet', 'Fixed grab bars for shower & toilet', 'Grab-rails for shower and toilet', 'Roll-in shower with chair', 'Wheelchair accessible', 'Wide clearance to bed', 'Wide clearance to shower and toilet', 'Wide doorway', 'Wide entryway', 'Step-free access', 'Wide hallway clearance']].sum(axis=1)
amenities = {'accessible_amenities': ['Accessible-height bed', 'Accessible-height toilet', 'Fixed grab bars for shower & toilet', 'Grab-rails for shower and toilet', 'Roll-in shower with chair', 'Wheelchair accessible', 'Wide clearance to bed', 'Wide clearance to shower and toilet', 'Wide doorway', 'Wide entryway', 'Step-free access', 'Wide hallway clearance']}

In [311]:
df['accessible_amenities'].value_counts()

accessible_amenities
0    68288
1     5082
2      201
3      135
4      118
5      100
6       92
7       71
8       22
9        2
Name: count, dtype: int64

In [312]:
# suma de las amenities 'Bath towel', 'Bathtub', 'Bathtub with shower chair', 'Body soap', 'Shampoo', 'Hand or paper towel', 'Hand soap', 'Toilet paper', 'Fireplace guards'
df['bathroom_amenities'] = amenities_encoded[['Bath towel', 'Bathtub', 'Bathtub with shower chair', 'Body soap', 'Shampoo', 'Hand or paper towel', 'Hand soap', 'Toilet paper', 'Fireplace guards']].sum(axis=1)
amenities['bathroom_amenities'] = ['Bath towel', 'Bathtub', 'Bathtub with shower chair', 'Body soap', 'Shampoo', 'Hand or paper towel', 'Hand soap', 'Toilet paper', 'Fireplace guards']

In [285]:
df['bathroom_amenities'].value_counts()

bathroom_amenities
1    47088
0    23901
2     2975
3      144
4        2
6        1
Name: count, dtype: int64

In [313]:
# suma de amenities 'Air conditioning', 'Heating', 'Air purifier'
df['climate_amenities'] = amenities_encoded[['Air conditioning', 'Heating', 'Air purifier']].sum(axis=1)
amenities['climate_amenities'] = ['Air conditioning', 'Heating', 'Air purifier']

In [289]:
df['climate_amenities'].value_counts()

climate_amenities
2    52468
1    17328
0     4304
3       11
Name: count, dtype: int64

In [314]:
# Suma de 'Cat(s)', 'Dog(s)', 'Other pet(s)', 'Pets allowed', 'Pets live on this property'
df['pet_amenities'] = amenities_encoded[['Cat(s)', 'Dog(s)', 'Other pet(s)', 'Pets allowed', 'Pets live on this property']].sum(axis=1)
amenities['pet_amenities'] = ['Cat(s)', 'Dog(s)', 'Other pet(s)', 'Pets allowed', 'Pets live on this property']

In [291]:
df['pet_amenities'].value_counts()

pet_amenities
0    56391
1     9404
2     5700
3     2171
4      389
5       56
Name: count, dtype: int64

In [315]:
# suma amenities 'Essentials', 'Internet', 'Wireless Internet', 'Smoke detector', 'Fire extinguisher', 'Shampoo', 'Iron', 'Hair dryer', 'Shower', 'Luggage dropoff allowed', 'Free parking on premises', 'Free parking on street', 'Cable TV', 'Washer / Dryer'
df['basic_amenities'] = amenities_encoded[['Essentials', 'Internet', 'Wireless Internet', 'Smoke detector', 'Fire extinguisher', 'Shampoo', 'Iron', 'Hair dryer', 'Luggage dropoff allowed', 'Free parking on premises', 'Free parking on street', 'Cable TV', 'Washer / Dryer']].sum(axis=1)
amenities['basic_amenities'] = ['Essentials', 'Internet', 'Wireless Internet', 'Smoke detector', 'Fire extinguisher', 'Shampoo', 'Iron', 'Hair dryer', 'Luggage dropoff allowed', 'Free parking on premises', 'Free parking on street', 'Cable TV', 'Washer / Dryer']

In [302]:
df['basic_amenities'].value_counts()

basic_amenities
7     12280
6     11825
8     11122
5     10289
9      8021
4      7969
3      5019
10     3388
2      2359
0      1099
1       670
11       70
Name: count, dtype: int64

In [316]:
# suma amenities 'Carbon monoxide detector', 'First aid kit', 'Fireplace guards', 'Smart lock', 'Keypad', 'Lock on bedroom door', 'Smoke detector', 'Safety card', 'Self Check-In'
df['safety_amenities'] = amenities_encoded[['Carbon monoxide detector', 'First aid kit', 'Fireplace guards', 'Smart lock', 'Keypad', 'Lock on bedroom door', 'Smoke detector', 'Safety card', 'Self Check-In']].sum(axis=1)
amenities['safety_amenities'] = ['Carbon monoxide detector', 'First aid kit', 'Fireplace guards', 'Smart lock', 'Keypad', 'Lock on bedroom door', 'Smoke detector', 'Safety card', 'Self Check-In']

In [304]:
df['safety_amenities'].value_counts()

safety_amenities
2    20042
3    16926
4    10958
0    10224
1     8981
5     5086
6     1526
7      348
8       20
Name: count, dtype: int64

In [317]:
# suma amenities 'BBQ grill', 'Coffee maker', 'Cooking basics', 'Dishwasher', 'Microwave', 'Oven', 'Stove', 'Refrigerator', 'Dishes and silverware', 'Iron', 'Hot water kettle'
df['kitchen_amenities'] = amenities_encoded[['BBQ grill', 'Coffee maker', 'Cooking basics', 'Dishwasher', 'Microwave', 'Oven', 'Stove', 'Refrigerator', 'Dishes and silverware', 'Iron', 'Hot water kettle']].sum(axis=1)
amenities['kitchen_amenities'] = ['BBQ grill', 'Coffee maker', 'Cooking basics', 'Dishwasher', 'Microwave', 'Oven', 'Stove', 'Refrigerator', 'Dishes and silverware', 'Iron', 'Hot water kettle']

In [297]:
df['kitchen_amenities'].value_counts()

kitchen_amenities
1     37886
0     31492
9      1342
8      1091
7       806
6       445
10      318
5       287
4       231
3       112
2        92
11        9
Name: count, dtype: int64

In [318]:
# suma amenities 'Garden or backyard', 'Patio or balcony', 'Beachfront', 'Waterfront', 'Lake access', 'Ski in/Ski out'
df['outdoor_amenities'] = amenities_encoded[['Garden or backyard', 'Patio or balcony', 'Beachfront', 'Waterfront', 'Lake access', 'Ski in/Ski out']].sum(axis=1)
amenities['outdoor_amenities'] = ['Garden or backyard', 'Patio or balcony', 'Beachfront', 'Waterfront', 'Lake access', 'Ski in/Ski out']

In [299]:
df['outdoor_amenities'].value_counts()

outdoor_amenities
0    72236
1     1262
2      581
3       25
4        6
5        1
Name: count, dtype: int64

In [319]:
# suma amenities 'TV', 'Cable TV', 'Game console', 'Smartlock', 'Pocket wifi', 'Laptop friendly workspace', 'Buzzer/wireless intercom', 'Internet'
df['entertainment_amenities'] = amenities_encoded[['TV', 'Cable TV', 'Game console', 'Smartlock', 'Pocket wifi', 'Laptop friendly workspace', 'Buzzer/wireless intercom', 'Internet']].sum(axis=1)
amenities['entertainment_amenities'] = ['TV', 'Cable TV', 'Game console', 'Smartlock', 'Pocket wifi', 'Laptop friendly workspace', 'Buzzer/wireless intercom', 'Internet']

In [301]:
df['entertainment_amenities'].value_counts()

entertainment_amenities
2    19312
3    17277
4    14073
1    13505
0     5343
5     4413
6      183
7        5
Name: count, dtype: int64

In [336]:
# suma amenities 'Family/kid friendly', 'Host greets you', 'Luggage dropoff allowed', 'Long term stays allowed', 'Pets live on this property', 'Paid parking off premises', 'Private bathroom', 'Private entrance', 'Private living room'
df['family_amenities'] = amenities_encoded[['Family/kid friendly', 'Host greets you', 'Luggage dropoff allowed', 'Long term stays allowed', 'Pets live on this property', 'Paid parking off premises', 'Private bathroom', 'Private entrance', 'Private living room']].sum(axis=1)
amenities['family_amenities'] = ['Family/kid friendly', 'Host greets you', 'Luggage dropoff allowed', 'Long term stays allowed', 'Pets live on this property', 'Paid parking off premises', 'Private bathroom', 'Private entrance', 'Private living room']

In [337]:
# Mira cuantos valores hay en el diccionario amenities
total_amenities = 0
amenities_list = []
for key in amenities:
    total_amenities += len(amenities[key])
    amenities_list += amenities[key]
    print(f'{key}: {len(amenities[key])}')
print(f'Total: {total_amenities}')

accessible_amenities: 12
bathroom_amenities: 9
climate_amenities: 3
pet_amenities: 5
basic_amenities: 13
safety_amenities: 9
kitchen_amenities: 11
outdoor_amenities: 6
entertainment_amenities: 8
family_amenities: 9
Total: 85


In [339]:
len(amenities_list)

85

In [340]:
amenities_columns = [
    "smooth pathway to front door", "24-hour check-in", "Accessible-height bed", "Accessible-height toilet",
    "Air conditioning", "Air purifier", "BBQ grill", "Baby bath", "Baby monitor",
    "Babysitter recommendations", "Bath towel", "Bathtub", "Bathtub with shower chair",
    "Beach essentials", "Beachfront", "Bed linens", "Body soap", "Breakfast",
    "Buzzer/wireless intercom", "Cable TV", "Carbon monoxide detector", "Cat(s)",
    "Changing table", "Children’s books and toys", "Children’s dinnerware",
    "Cleaning before checkout", "Coffee maker", "Cooking basics", "Crib",
    "Disabled parking spot", "Dishes and silverware", "Dishwasher", "Dog(s)", "Doorman",
    "Doorman Entry", "Dryer", "EV charger", "Elevator", "Elevator in building",
    "Essentials", "Ethernet connection", "Extra pillows and blankets",
    "Family/kid friendly", "Fire extinguisher", "Fireplace guards", "Firm matress",
    "Firm mattress", "First aid kit", "Fixed grab bars for shower & toilet", "Flat",
    "Flat smooth pathway to front door", "Free parking on premises",
    "Free parking on street", "Game console", "Garden or backyard",
    "Grab-rails for shower and toilet", "Ground floor access", "Gym", "Hair dryer",
    "Hand or paper towel", "Hand soap", "Handheld shower head", "Hangers", "Heating",
    "High chair", "Host greets you", "Hot tub", "Hot water", "Hot water kettle",
    "Indoor fireplace", "Internet", "Iron", "Keypad", "Kitchen", "Lake access",
    "Laptop friendly workspace", "Lock on bedroom door", "Lockbox",
    "Long term stays allowed", "Luggage dropoff allowed", "Microwave", "Other",
    "Other pet(s)", "Outlet covers", "Oven", "Pack ’n Play/travel crib",
    "Paid parking off premises", "Path to entrance lit at night", "Patio or balcony",
    "Pets allowed", "Pets live on this property", "Pocket wifi", "Pool",
    "Private bathroom", "Private entrance", "Private living room", "Refrigerator",
    "Roll-in shower with chair", "Room-darkening shades", "Safety card",
    "Self Check-In", "Shampoo", "Single level home", "Ski in/Ski out", "Smart lock",
    "Smartlock", "Smoke detector", "Smoking allowed", "Stair gates",
    "Step-free access", "Stove", "Suitable for events", "TV", "Table corner guards",
    "Toilet paper", "Washer", "Washer / Dryer", "Waterfront",
    "Well-lit path to entrance", "Wheelchair accessible", "Wide clearance to bed",
    "Wide clearance to shower and toilet", "Wide doorway", "Wide entryway",
    "Wide hallway clearance", "Window guards", "Wireless Internet",
    "hosting_amenity_49", "hosting_amenity_50"
]

In [341]:
print(len(amenities_columns))
print(len(set(amenities_columns)))
print(len(amenities_list))

129
129
85


In [344]:
# compara amenities_list y amenities_columns y dame una lista de los que no estan en amenities_columns
missing_amenities = list(set(amenities_columns) - set(amenities_list))

In [346]:
missing_amenities

['Outlet covers',
 'Stair gates',
 'Window guards',
 'Dryer',
 'Baby monitor',
 'Firm matress',
 'Elevator',
 'Other',
 'Disabled parking spot',
 'Indoor fireplace',
 'Smoking allowed',
 'Pack ’n Play/travel crib',
 'Room-darkening shades',
 '24-hour check-in',
 'Extra pillows and blankets',
 'Hangers',
 'Doorman Entry',
 'Flat',
 'Flat smooth pathway to front door',
 'Path to entrance lit at night',
 'Hot tub',
 'Pool',
 'Well-lit path to entrance',
 'Baby bath',
 'Gym',
 'Washer',
 'hosting_amenity_50',
 'Elevator in building',
 'Changing table',
 'Handheld shower head',
 'Children’s dinnerware',
 'Crib',
 'Cleaning before checkout',
 'Table corner guards',
 'Ethernet connection',
 'Single level home',
 'Suitable for events',
 'Doorman',
 'Hot water',
 'High chair',
 'smooth pathway to front door',
 'Children’s books and toys',
 'Firm mattress',
 'Kitchen',
 'Babysitter recommendations',
 'Ground floor access',
 'Bed linens',
 'Beach essentials',
 'EV charger',
 'Lockbox',
 'hosting_