In [3]:
import pandas as pd
import re
import numpy as np
import json
from sklearn.preprocessing import OneHotEncoder

In [4]:
# load the data
def load_data():
    df = pd.read_csv("listings.csv.gz", compression='gzip')
    return df
df = load_data()
print(df)

                       id                                       listing_url  \
0                  164448               https://www.airbnb.com/rooms/164448   
1                  220851               https://www.airbnb.com/rooms/220851   
2                  238411               https://www.airbnb.com/rooms/238411   
3                  242188               https://www.airbnb.com/rooms/242188   
4                  273906               https://www.airbnb.com/rooms/273906   
...                   ...                                               ...   
5218  1321121257587188985  https://www.airbnb.com/rooms/1321121257587188985   
5219  1321592072491619841  https://www.airbnb.com/rooms/1321592072491619841   
5220  1321811206140917325  https://www.airbnb.com/rooms/1321811206140917325   
5221  1321863150641245338  https://www.airbnb.com/rooms/1321863150641245338   
5222  1321949733901820355  https://www.airbnb.com/rooms/1321949733901820355   

           scrape_id last_scraped       source  \
0

In [None]:
# data cleaning
def clean_data(df):
    
    df_clean = df.copy()
    # convert price $ to numeric
    df["price"] = df["price"].replace("[\$,]", "", regex=True).astype(float)
    
    # handle missing values for numerical columns
    numeric_cols = ["bathrooms", "bedrooms", "beds", "accommodates", "minimum_nights"]
    for col in numeric_cols:
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].fillna(df_clean[col].median())

    # process amenities column (convert to list)
    df_clean["amenities_list"] = df_clean["amenities"].apply(parse_amenities)

    common_amenities = [
        "Wifi", "Kitchen", "Heating", "Air conditioning", "Washer", "Dryer", 
        "TV", "Hair dryer", "Iron", "Smoke alarm", "Fire extinguisher", 
        "Dishwasher", "Refrigerator", "Microwave", "Oven", "Stove", "Coffee maker",
        "Hot water", "Elevator", "Free parking"
    ]

    for amenity in common_amenities:
        df_clean[f"has_{amenity.lower().replace('', '_')}"] = df_clean["amenities_list"].apply(lambda x: 1 if amenity in x else 0)

    # extract room features
    df_clean["is_entire_home"] = df_clean["room_type"].apply(lambda x:1 if x == "Entire home/apt" else 0)
    df_clean["is_private_room"] = df_clean["room_type"].apply(lambda x:1 if x == "Private room" else 0)

    if "name" in df_clean.columns:
        df_clean["title_word_count"] = df_clean["name"].fillna("").apply(lambda x:len(str(x).split()))
        df_clean["title_length"] = df_clean["name"].fillna("").apply(len)
    
    if "description" in df_clean.columns:
        df_clean["description_word_count"] = df_clean["description"].fillna("").apply(lambda x: len(str(x).split()))
    
    # add review score features
    review_score_cols = [
        "review_scores_rating", "review_scores_accuracy", "review_scores_cleanliness",
        "review_scores_checkin", "review_scores_communication", "review_scores_location", 
        "review_scores_value"
    ]

    for col in review_score_cols:
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].fillna(df_clean[col].median())

    # superhost feature
    if "host_is_superhost" in df_clean.columns:
        df_clean["host_is_superhost_num"] = df_clean["host_is_superhost"].apply(lambda x: 1 if x == "t" else 0)

    # check if instant bookable
    if "instant_bookable" in df_clean.columns:
        df_clean["instant_bookable_num"] = df_clean["instant_bookable"].apply(lambda x: 1 if x == "t" else 0)

    # add review count and frequency features
    if "number_of_reviews" in df_clean.columns:
        df_clean["number_of_reviews"] = df_clean["number_of_reviews"].fillna(0)
        df_clean["has_reviews"] = df_clean["number_of_reviews"].apply(lambda x: 1 if x > 0 else 0)

    if "reviews_per_month" in df_clean.columns:
        df_clean["reviews_per_monthn"] = df_clean["reviews_per_month"].fillna(0)

    return df_clean

In [None]:
def parse_amenities(amenities):
    if pd.isna(amenities):
        return []
    
    try:
        items = re.findall(r'"([^"]*)', amenities)
        return items
    except:
        try:
            cleaned_str = amenities.replace("\\", "")
            if cleaned_str.startswith("[") and cleaned_str.endswith("]"):
                return json.loads(clean_data)
    
        except:
            return []
    
    return []

In [8]:
# show the clean data
df_clean = clean_data(df)

TypeError: '>' not supported between instances of 'str' and 'int'

In [7]:
#print(df_clean)
#df_clean["amenities"]
enc = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
all_amenities = set(amenity for sublist in df_clean['amenities'] for amenity in sublist)
all_amenities_list = np.array(list(all_amenities)[0:4]).reshape(-1, 1)
enc.fit_transform(all_amenities_list)
# print(enc.get_feature_names_out())
#df_clean["amenities"]
enc.transform([["Hair dryer"]]).toarray()
all_amenities_list 

array([['&'],
       ['3'],
       ['/'],
       ['2']], dtype='<U1')