In [28]:
# importing packages...
import pandas as pd
import numpy as np

In [29]:
# Read the raw Austin Airbnb listings CSV into a DataFrame.
raw_listings_path = "airbnb_listings_austin-1.csv"
airbnb_df_original = pd.read_csv(raw_listings_path)

In [30]:
# Show a couple of entries (index labels 757 and 758; .loc label slices
# include both endpoints) to get a gist of the raw data.
airbnb_df_original.loc[757:758, :]

Unnamed: 0,id,listing_url,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,...,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy
757,5423057,https://www.airbnb.com/rooms/5423057,SXSW Apartment Rental,Perfect one bedroom/one bath rental for SXSW (...,This space is a great place to combine the acc...,Perfect one bedroom/one bath rental for SXSW (...,none,The neighborhood is quiet and conveniently loc...,"If you have any specific questions, by all mea...","Bus stops are very close by. Uber, Lyft, and e...",...,0,,,,,,,,f,flexible
758,8799926,https://www.airbnb.com/rooms/8799926,Updated Home - Minutes to Downtown,"We live 4 minutes from downtown, 15 minutes fr...",,"We live 4 minutes from downtown, 15 minutes fr...",none,,,,...,0,,,,,,,,f,flexible


In [31]:
# Columns removed before modelling. Each entry carries the analyst's reason
# for excluding it; the reasons are judgement calls from inspecting the data.
unused_columns = [
    'id',                      # identifier tag only; not a predictor of price or booked
    'listing_url',             # URL link only; not a predictor of price or booked
    'name',                    # name of the apartment
    'summary',                 # free-text summary; nothing useful here
    'space',                   # free-text description
    'description',             # free-text description
    'experiences_offered',     # every row has 'none'
    'neighborhood_overview',   # free-text description
    'notes',                   # free-text description
    'transit',                 # free-text description
    'host_id',                 # host identifier tag; not a predictor
    'host_name',               # name of the host; not a predictor
    'host_location',           # where the host lives; not a predictor
    'host_about',              # free-text description
    'host_has_profile_pic',    # all hosts have a profile picture
    'amenities',               # free-text description
    'square_feet',             # very few observations have this measure
    'host_response_time',      # lots of n/a values
    'city',                    # mostly Austin; neighbourhood is judged the better predictor
    'host_since',              # possibly useful, but judged less helpful than the "superhost" tag
                               # (experience doesn't always mean a better host)
    'weekly_price',            # only some listings have one; every listing has a nightly price, so that is used
    'host_identity_verified',  # judged less helpful than the "superhost" tag
]

# .drop returns a new frame, so airbnb_df_original is left untouched.
airbnb_df = airbnb_df_original.drop(columns=unused_columns)

In [32]:
# Show the same two entries (labels 757 and 758) from the reduced frame.
airbnb_df.loc[757:758, :]

Unnamed: 0,host_response_rate,host_is_superhost,host_listings_count,neighbourhood,property_type,room_type,accommodates,bathrooms,bedrooms,beds,...,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy
757,,f,1.0,Parker Lane,Apartment,Entire home/apt,6,1.0,1.0,1.0,...,0,,,,,,,,f,flexible
758,,f,1.0,Parker Lane,House,Entire home/apt,6,1.5,4.0,3.0,...,0,,,,,,,,f,flexible


In [33]:
# Money columns stored as text (containing '$' and ',') that must be
# converted to float before they can be used numerically.
columns_to_clean = ['price', 'security_deposit', 'cleaning_fee', 'extra_people']

# Loop through each column to clean and convert
for column in columns_to_clean:
    # Re-running this cell used to raise AttributeError: after the first run
    # the column is already float and has no .str accessor. Skipping numeric
    # columns makes the cell safe to execute more than once.
    if pd.api.types.is_numeric_dtype(airbnb_df[column]):
        continue
    airbnb_df[column] = (
        airbnb_df[column]
        .str.replace(',', '', regex=False)  # Remove commas
        .str.replace('$', '', regex=False)  # Remove dollar signs (literal match, not a regex anchor)
        .astype(float)  # Convert to float; missing values stay NaN
    )

In [34]:
# Clean host response rate (drop the '%' symbol) and convert to float.
# The dtype guard makes the cell safe to re-run: once the column is already
# numeric, the .str accessor would raise AttributeError, so the conversion
# is skipped in that case.
if not pd.api.types.is_numeric_dtype(airbnb_df['host_response_rate']):
    airbnb_df['host_response_rate'] = (
        airbnb_df['host_response_rate']
        .str.replace('%', '', regex=False)  # Remove the '%' symbol
        .astype(float)  # Convert to float; missing values stay NaN
    )

In [35]:
# A missing cleaning fee or security deposit is assumed to mean that none is
# charged, so NaN in these two columns is replaced with zero.
for fee_column in ('cleaning_fee', 'security_deposit'):
    airbnb_df[fee_column] = airbnb_df[fee_column].fillna(0)

In [36]:
# Remove rows with a null in any of these columns -- per the original note,
# only a very small number of rows are affected.
required_columns = ['host_is_superhost', 'bathrooms', 'bedrooms', 'beds', 'neighbourhood']
airbnb_df = airbnb_df.dropna(subset=required_columns)

In [37]:
# Show each column's data type, e.g. to confirm that the cleaned money and
# percentage columns are now float64.
airbnb_df.dtypes

host_response_rate             float64
host_is_superhost               object
host_listings_count            float64
neighbourhood                   object
property_type                   object
room_type                       object
accommodates                     int64
bathrooms                      float64
bedrooms                       float64
beds                           float64
bed_type                        object
price                          float64
security_deposit               float64
cleaning_fee                   float64
guests_included                  int64
extra_people                   float64
minimum_nights                   int64
has_availability                object
availability_30                  int64
availability_60                  int64
availability_90                  int64
availability_365                 int64
number_of_reviews                int64
review_scores_rating           float64
review_scores_accuracy         float64
review_scores_cleanliness

In [38]:
# Count the missing values in every column.
missing_value_counts = airbnb_df.isna().sum()

# Express each count as a percentage of the number of records
# (i.e., the length of the frame).
missing_value_percentages = missing_value_counts.div(len(airbnb_df)).mul(100)
print(missing_value_percentages)

host_response_rate             28.688697
host_is_superhost               0.000000
host_listings_count             0.000000
neighbourhood                   0.000000
property_type                   0.000000
room_type                       0.000000
accommodates                    0.000000
bathrooms                       0.000000
bedrooms                        0.000000
beds                            0.000000
bed_type                        0.000000
price                           0.000000
security_deposit                0.000000
cleaning_fee                    0.000000
guests_included                 0.000000
extra_people                    0.000000
minimum_nights                  0.000000
has_availability                0.000000
availability_30                 0.000000
availability_60                 0.000000
availability_90                 0.000000
availability_365                0.000000
number_of_reviews               0.000000
review_scores_rating           33.592928
review_scores_ac

In [39]:
# Host response rate and the review scores are assumed not to affect the
# price a host sets, so they are dropped for the price regression model.
non_price_columns = [
    'host_response_rate',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]
airbnb_df = airbnb_df.drop(columns=non_price_columns)

In [40]:
# Re-check missing values after the column drop: per-column null counts first.
missing_value_counts = airbnb_df.isna().sum()

# Then divide by the number of records (the frame's length) and scale to
# a percentage.
record_count = len(airbnb_df)
missing_value_percentages = missing_value_counts / record_count * 100
print(missing_value_percentages)

host_is_superhost      0.0
host_listings_count    0.0
neighbourhood          0.0
property_type          0.0
room_type              0.0
accommodates           0.0
bathrooms              0.0
bedrooms               0.0
beds                   0.0
bed_type               0.0
price                  0.0
security_deposit       0.0
cleaning_fee           0.0
guests_included        0.0
extra_people           0.0
minimum_nights         0.0
has_availability       0.0
availability_30        0.0
availability_60        0.0
availability_90        0.0
availability_365       0.0
number_of_reviews      0.0
instant_bookable       0.0
cancellation_policy    0.0
dtype: float64


In [41]:
# Write the cleaned DataFrame to CSV for the first question (predictors of
# price); index=False leaves the row index out of the file.
clean_csv_path = 'airbnb_data_clean.csv'
airbnb_df.to_csv(clean_csv_path, index=False)