# Setup and Utilities

In [2]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.preprocessing
import scipy

import ast
import collections

import matplotlib as plt
import seaborn as sb

In [3]:
def load_data(state='ca', city='los-angeles',
              data_root="../../project_data/inside_airbnb/united-states"):
    """Load the listings file for one city into a pandas DataFrame.

    Parameters
    ----------
    state : str
        Name of the state sub-directory under ``data_root``.
    city : str
        Name of the city sub-directory under the state directory.
    data_root : str
        Directory that holds the ``<state>/<city>/listings.csv`` tree.
        The default is the location this notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        The contents of ``listings.csv`` as parsed by ``pd.read_csv``.
    """
    datasource = f"{data_root}/{state}/{city}/listings.csv"
    return pd.read_csv(datasource)

def clean_and_drop(df, drop_price=False, drop_amenities=True):
    """Drop columns the models do not use and convert string columns to numbers.

    The input frame is not modified; all work happens on the frame returned
    by the initial ``drop``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings frame. It must contain every column named below.
    drop_price : bool
        When True, the ``price`` column is removed from the returned frame.
    drop_amenities : bool
        When True, the ``amenities`` column is removed from the returned frame.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The cleaned frame and the numeric ``price`` series. The price series
        is returned regardless of ``drop_price``.
    """
    unused_cols = [
        'id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
        'neighborhood_overview', 'neighbourhood_group_cleansed', 'picture_url', 'host_id', 'host_url',
        'host_name', 'host_since', 'host_location', 'host_about', 'host_thumbnail_url', 'host_picture_url',
        'host_neighbourhood', 'neighbourhood', 'bathrooms',
        'calendar_last_scraped', 'first_review', 'last_review', 'license', 'host_verifications', 'calendar_updated',
    ]
    df = df.drop(unused_cols, axis="columns")

    # Percentage strings such as "86%" become fractions such as 0.86.
    for rate_col in ('host_response_rate', 'host_acceptance_rate'):
        df[rate_col] = df[rate_col].str.rstrip('%').astype('float') / 100.0

    # regex=False makes '$' a literal dollar sign. As a regular expression it
    # is an end-of-string anchor, so relying on the library default is
    # version dependent and emits a FutureWarning on older pandas.
    df['price'] = (
        df['price']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype('float')
    )
    price = df['price']

    if drop_price:
        df = df.drop('price', axis="columns")
    if drop_amenities:
        df = df.drop('amenities', axis="columns")

    # 't' / 'f' flags become 1 / 0; any other value (including missing) maps to NaN.
    flag_cols = ('host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
                 'has_availability', 'instant_bookable')
    for flag_col in flag_cols:
        df[flag_col] = df[flag_col].map({'t': 1, 'f': 0})

    return df, price

def get_amenities_set(df):
    """Return the set of all distinct amenities listed in ``df.amenities``.

    Each non-missing entry of the ``amenities`` column is parsed with
    ``ast.literal_eval`` and its items are added to the result. Missing
    entries are skipped, so a listing without amenities contributes nothing.
    """
    amenities = set()
    for raw in df.amenities:
        if pd.isnull(raw):
            continue
        # Update in place instead of building a new set for every row.
        amenities.update(ast.literal_eval(raw))
    return amenities

def get_amenities_dict(df, cutoff=25):
    """Count how often each amenity occurs across the listings in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``amenities`` column. Each non-missing entry is parsed
        with ``ast.literal_eval``; missing entries are skipped.
    cutoff : int
        Minimum number of occurrences an amenity needs to be kept.

    Returns
    -------
    dict
        ``{amenity: number of occurrences}`` for every amenity whose count is
        at least ``cutoff``. Names are counted exactly as written
        (case-sensitive).
    """
    counts = collections.Counter()
    for raw in df.amenities:
        if pd.isnull(raw):
            continue
        counts.update(ast.literal_eval(raw))
    return {name: n for name, n in counts.items() if n >= cutoff}

def create_and_populate_amenity_cols(df, cutoff=25):
    """Return a copy of ``df`` with one 0/1 indicator column per common amenity.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``amenities`` column of list literals stored as strings.
    cutoff : int
        Passed to ``get_amenities_dict``; only amenities that occur at least
        this many times get a column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by one integer
        column per kept amenity. Column names are the lower-cased amenity
        names. A cell is 1 when the listing has that amenity and 0 otherwise;
        listings with a missing ``amenities`` value get 0 everywhere.
    """
    # Lower-case the names and drop duplicates created by lower-casing, so two
    # spellings of one amenity cannot produce two identically named columns.
    amenities = list(dict.fromkeys(name.lower() for name in get_amenities_dict(df, cutoff)))
    column_of = {name: j for j, name in enumerate(amenities)}

    # Build the full indicator matrix up front: one row per listing. This
    # replaces a single-row initialiser, which does not match the frame's
    # index, and avoids writing into the frame row by row.
    flags = np.zeros((len(df), len(amenities)), dtype=int)
    for i, raw in enumerate(df['amenities']):
        if pd.isnull(raw):
            continue
        for item in ast.literal_eval(raw):
            j = column_of.get(item.lower())
            if j is not None:
                flags[i, j] = 1

    indicator_df = pd.DataFrame(flags, index=df.index, columns=amenities)
    return pd.concat([df, indicator_df], axis=1)

## Data Encoding

In [6]:
# Load the Los Angeles listings and clean them: `price` is split off into its
# own series and removed from the frame, while `amenities` is kept.
df, price = clean_and_drop(
    load_data(state='ca', city='los-angeles'),
    drop_price=True,
    drop_amenities=False,
)
# Amenity indicator columns are currently switched off. To enable them use:
# a_df = create_and_populate_amenity_cols(df)
a_df = df

  df['price'] = df['price'].str.replace('$', '').str.replace(',', '').astype('float')


In [15]:
# Replace each categorical column with one indicator column per category.
categorical_cols = [
    "host_response_time",
    "neighbourhood_cleansed",
    "property_type",
    "bathrooms_text",
    "room_type",
]
a_df = pd.get_dummies(data=a_df, columns=categorical_cols)


In [13]:
# Checking amenities encoding: display the encoded frame for a visual check.

# Optional snapshot / CSV export of the encoded frame (kept disabled):
# a_df_copy = a_df.copy()
# a_df.to_csv('./amenities_encoded_test.csv', errors='surrogatepass')
a_df

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,room_type,...,bathrooms_text_7.5 baths,bathrooms_text_8 baths,bathrooms_text_8 shared baths,bathrooms_text_8.5 baths,bathrooms_text_8.5 shared baths,bathrooms_text_9 baths,bathrooms_text_9.5 baths,bathrooms_text_Half-bath,bathrooms_text_Private half-bath,bathrooms_text_Shared half-bath
0,1.00,0.86,0.0,1.0,1.0,1.0,1.0,34.106320,-118.223610,Private room,...,0,0,0,0,0,0,0,0,0,0
1,1.00,0.93,1.0,8.0,8.0,1.0,1.0,34.095740,-118.277880,Entire home/apt,...,0,0,0,0,0,0,0,0,0,0
2,1.00,0.81,1.0,8.0,8.0,1.0,1.0,33.987500,-118.432000,Private room,...,0,0,0,0,0,0,0,0,0,0
3,1.00,0.81,1.0,8.0,8.0,1.0,1.0,33.987500,-118.432000,Private room,...,0,0,0,0,0,0,0,0,0,0
4,1.00,,0.0,1.0,1.0,1.0,1.0,33.983010,-118.386070,Entire home/apt,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42036,,,0.0,0.0,0.0,1.0,1.0,33.764591,-117.992373,Private room,...,0,0,0,0,0,0,0,0,0,0
42037,0.93,0.93,0.0,44.0,44.0,1.0,1.0,33.742780,-117.942232,Private room,...,0,0,0,0,0,0,0,0,0,0
42038,0.93,0.93,0.0,44.0,44.0,1.0,1.0,33.744660,-117.941931,Private room,...,0,0,0,0,0,0,0,0,0,0
42039,0.98,0.98,0.0,0.0,0.0,1.0,1.0,33.743250,-117.993990,Shared room,...,0,0,0,0,0,0,0,0,0,0


In [310]:
# One-hot-encode data for passing into models.
# fit_transform() both fits the encoder and encodes the frame, so a separate
# fit() call beforehand would only repeat the same work.
# NOTE(review): every column of `df` is encoded here, and `df` (not `a_df`) is
# the input - confirm both are intended.
encoder = sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore')
df_onehot = encoder.fit_transform(df).toarray()

# Model Operations

In [311]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn import linear_model

In [1]:
# k-nearest-neighbours regressor with default settings, fitted on the encoded
# features against `price`. fit() returns the estimator itself.
knn_reg_model = KNeighborsRegressor().fit(df_onehot, price)

# Ridge regression (alpha=0.5) fitted on the same features and target.
ridge_reg_model = linear_model.Ridge(alpha=0.5)
ridge_reg_model.fit(df_onehot, price)

NameError: name 'KNeighborsRegressor' is not defined

In [76]:
# Predict for the single encoded row at position 96, reshaped to one row by
# n columns before it is handed to predict().
knn_reg_model.predict(df_onehot[96].reshape(1, -1))

array([648.])