In [13]:
import pandas as pd
import re
import numpy as np
import json
from sklearn.preprocessing import OneHotEncoder

In [15]:
# load the data
def load_data():
    df = pd.read_csv("listings.csv.gz", compression='gzip')
    return df
df = load_data()
print(df)

                       id                                       listing_url  \
0                  164448               https://www.airbnb.com/rooms/164448   
1                  220851               https://www.airbnb.com/rooms/220851   
2                  238411               https://www.airbnb.com/rooms/238411   
3                  242188               https://www.airbnb.com/rooms/242188   
4                  273906               https://www.airbnb.com/rooms/273906   
...                   ...                                               ...   
5218  1321121257587188985  https://www.airbnb.com/rooms/1321121257587188985   
5219  1321592072491619841  https://www.airbnb.com/rooms/1321592072491619841   
5220  1321811206140917325  https://www.airbnb.com/rooms/1321811206140917325   
5221  1321863150641245338  https://www.airbnb.com/rooms/1321863150641245338   
5222  1321949733901820355  https://www.airbnb.com/rooms/1321949733901820355   

           scrape_id last_scraped       source  \
0

In [16]:
# data cleaning
def clean_data(df):
    
    df_clean = df.copy()
    # convert price $ to numeric
    df["price"] = df["price"].replace("[\$,]", "", regex=True).astype(float)
    
    # handle missing values for numerical columns
    numeric_cols = ["bathrooms", "bedrooms", "beds", "accommodates", "minimum_nights"]
    for col in numeric_cols:
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].fillna(df_clean[col].median())

    # process amenities column (convert to list)
    df_clean["amenities_list"] = df_clean["amenities"].apply(parse_amenities)

    common_amenities = [
        "Wifi", "Kitchen", "Heating", "Air conditioning", "Washer", "Dryer", 
        "TV", "Hair dryer", "Iron", "Smoke alarm", "Fire extinguisher", 
        "Dishwasher", "Refrigerator", "Microwave", "Oven", "Stove", "Coffee maker",
        "Hot water", "Elevator", "Free parking"
    ]

    for amenity in common_amenities:
        df_clean[f"has_{amenity.lower().replace('', '_')}"] = df_clean["amenities_list"].apply(lambda x: 1 if amenity in x else 0)

    # extract room features
    df_clean["is_entire_home"] = df_clean["room_type"].apply(lambda x:1 if x == "Entire home/apt" else 0)
    df_clean["is_private_room"] = df_clean["room_type"].apply(lambda x:1 if x == "Private room" else 0)

    if "name" in df_clean.columns:
        df_clean["title_word_count"] = df_clean["name"].fillna("").apply(lambda x:len(str(x).split()))
        df_clean["title_length"] = df_clean["name"].fillna("").apply(len)
    
    if "description" in df_clean.columns:
        df_clean["description_word_count"] = df_clean["description"].fillna("").apply(lambda x: len(str(x).split()))
    

    return df_clean


In [17]:
def parse_amenities(amenities):
    if pd.isna(amenities):
        return []
    
    try:
        cleaned_str = amenities.replace("\\", "")
        if cleaned_str.startswith("[") and cleaned_str.endswith("]"):
            return json.loads(clean_data)
    
    except:
        pass

    try:
        items = re.findall(r'"([^"]*)', amenities)
        return items
    except:
        return []
    

In [22]:
df_clean = clean_data(df)
df_clean["description"]

0       I am renting out a nice double room on the top...
1                                                Welcome!
2                                                     NaN
3       I am renting out a nice single room on the top...
4                                                     NaN
                              ...                        
5218    Cozy and fresh three with two bedrooms, open p...
5219    This unique accommodation of 22 sqm, has large...
5220    Green unique area with the first pedestrian st...
5221    Bring the whole family to this amazing place w...
5222    Newly built second floor in 2020 of 32 sqm in ...
Name: description, Length: 5223, dtype: object

In [7]:
#print(df_clean)
#df_clean["amenities"]
enc = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
all_amenities = set(amenity for sublist in df_clean['amenities'] for amenity in sublist)
all_amenities_list = np.array(list(all_amenities)[0:4]).reshape(-1, 1)
enc.fit_transform(all_amenities_list)
# print(enc.get_feature_names_out())
#df_clean["amenities"]
enc.transform([["Hair dryer"]]).toarray()
all_amenities_list 

array([['&'],
       ['3'],
       ['/'],
       ['2']], dtype='<U1')