In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [4]:
bnb_listings = pd.read_csv('data/listings.csv')

#Clean up price column
bnb_listings['price'].fillna(0, inplace=True)

#Function to convert string money to float money
def money_str_to_float(x):
    if type(x) == str:
        x = x.replace('$','')
        x = x.replace(',','')
        return float(x)
    elif type(x) == int:
        return float(x)

In [5]:
bnb_listings['neighborhood_overview'][1]

'Queen Anne is a wonderful, truly functional village.  You can walk everywhere... for coffee, for groceries, for a night out, for breathtaking views of the Puget Sound.'

In [6]:
bnb_listings_master = bnb_listings[['id', #DO NOT DELETE. USEFUL FOR REFERENCING THE LIST OF COLUMNS
 'listing_url',
 'scrape_id',
 'last_scraped',
 'name',
 'summary',
 'space',
 'description',
 'experiences_offered',
 'neighborhood_overview',
 'notes',
 'transit',
 'thumbnail_url',
 'medium_url',
 'picture_url',
 'xl_picture_url',
 'host_id',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'city',
 'state',
 'zipcode',
 'market',
 'smart_location',
 'country_code',
 'country',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',
 'square_feet',
 'price',
 'weekly_price',
 'monthly_price',
 'security_deposit',
 'cleaning_fee',
 'guests_included',
 'extra_people',
 'minimum_nights',
 'maximum_nights',
 'calendar_updated',
 'has_availability',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'calendar_last_scraped',
 'number_of_reviews',
 'first_review',
 'last_review',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'requires_license',
 'license',
 'jurisdiction_names',
 'instant_bookable',
 'cancellation_policy',
 'require_guest_profile_picture',
 'require_guest_phone_verification',
 'calculated_host_listings_count',
 'reviews_per_month']]

In [4]:
bnb_listings_abbrev = bnb_listings[['id',
 'host_listings_count',
 'host_total_listings_count',
 'street',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'city',
 'state',
 'zipcode',
 'market',
 'smart_location',
 'country_code',
 'country',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bed_type',
 'amenities',
 'price',
 'guests_included',
 'extra_people',
 'minimum_nights',
 'maximum_nights',
 'has_availability',
 'availability_30',
 'number_of_reviews',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'reviews_per_month']]

In [5]:
#Clean the DataFrame
bnb_listings_abbrev.dropna(inplace=True)
bnb_listings_abbrev['price'] = bnb_listings_abbrev['price'].apply(lambda x: money_str_to_float(x))
bnb_listings_abbrev['earnings_per_mo'] = bnb_listings_abbrev['price'] * (30 - bnb_listings_abbrev['availability_30']) #

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bnb_listings_abbrev['price'] = bnb_listings_abbrev['price'].apply(lambda x: money_str_to_float(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bnb_listings_abbrev['earnings_per_mo'] = bnb_listings_abbrev['price'] * (30 - bnb_listings_abbrev['availability_30'])


In [6]:
def generate_collinearity_pairs(x):

    # save absolute value of correlation matrix as a data frame
    # converts all values to absolute value
    # stacks the row:column pairs into a multindex
    # reset the index to set the multindex to seperate columns
    # sort values. 0 is the column automatically generated by the stacking

    df=x.corr().abs().stack().reset_index().sort_values(0, ascending=False)

    # zip the variable name columns (Which were only named level_0 and level_1 by default) in a new column named "pairs"
    df['pairs'] = list(zip(df.level_0, df.level_1))

    # set index to pairs
    df.set_index(['pairs'], inplace = True)

    #d rop level columns
    df.drop(columns=['level_1', 'level_0'], inplace = True)

    # rename correlation column as cc rather than 0
    df.columns = ['cc']

    # drop duplicates. This could be dangerous if you have variables perfectly correlated with variables other than themselves.
    # for the sake of exercise, kept it in.
    df.drop_duplicates(inplace=True)

    return df

In [7]:
df = generate_collinearity_pairs(bnb_listings_abbrev)

#drop a variable from each highly collinear pair
df[(df.cc>.75) & (df.cc <1)]

Unnamed: 0_level_0,cc
pairs,Unnamed: 1_level_1


In [8]:
#Ran once to update earnings_data.csv
#bnb_listings_abbrev.to_csv('data/earnings_data.csv')

In [26]:
synthetic_mortgages_df = bnb_listings_master[[
 'bedrooms',
 'bathrooms',
 'zipcode',
 'host_listings_count',
 'neighbourhood_cleansed',
 'zipcode',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'price',
 'guests_included',
 'minimum_nights',
 'maximum_nights',
 'availability_30',
 'number_of_reviews',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_communication',
 'review_scores_location',
 'reviews_per_month']];

In [27]:
def encode_cat(df_series):
    lb_maker = LabelEncoder()
    cat_encoded = lb_maker.fit_transform(df_series)
    return cat_encoded

In [28]:
encode_list = ['property_type','room_type','neighbourhood_cleansed']

for item in encode_list:
    synthetic_mortgages_df[item] = pd.Categorical(encode_cat(synthetic_mortgages_df[item]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  synthetic_mortgages_df[item] = pd.Categorical(encode_cat(synthetic_mortgages_df[item]))


In [32]:
synthetic_mortgages_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [35]:
synthetic_mortgages_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3136 entries, 0 to 3814
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   bedrooms                     3136 non-null   float64 
 1   bathrooms                    3136 non-null   float64 
 2   zipcode                      3136 non-null   object  
 3   host_listings_count          3136 non-null   float64 
 4   neighbourhood_cleansed       3136 non-null   category
 5   zipcode                      3136 non-null   object  
 6   latitude                     3136 non-null   float64 
 7   longitude                    3136 non-null   float64 
 8   property_type                3136 non-null   category
 9   room_type                    3136 non-null   category
 10  accommodates                 3136 non-null   int64   
 11  price                        3136 non-null   object  
 12  guests_included              3136 non-null   int64   
 13  min

In [36]:
df_test = pd.read_csv('data/earnings_data.csv')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3152 entries, 0 to 3151
Data columns (total 39 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    3152 non-null   int64  
 1   id                            3152 non-null   int64  
 2   host_listings_count           3152 non-null   float64
 3   host_total_listings_count     3152 non-null   float64
 4   street                        3152 non-null   object 
 5   neighbourhood_cleansed        3152 non-null   object 
 6   neighbourhood_group_cleansed  3152 non-null   object 
 7   city                          3152 non-null   object 
 8   state                         3152 non-null   object 
 9   zipcode                       3152 non-null   object 
 10  market                        3152 non-null   object 
 11  smart_location                3152 non-null   object 
 12  country_code                  3152 non-null   object 
 13  cou