In [13]:
import pandas as pd
import math
import numpy as np

In [4]:
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
import geopy.distance

def create_all_location_features(listings_filepath, onehot_filepath, min_price=30, max_price=500):
    #Read in the dataset
    la = pd.read_csv('listings.csv')
    
    # === Preprocessing ===
    la['price'] = la['price'].map(lambda x: float(x.replace('$','').replace(',','')))
    
    #Get rid of extreme price outliers
    la = la[(la['price'] > min_price) & (la['price'] < max_price)]
    
    la = create_neighborhood_avg_price(la)
    
    la = create_cluster_avg_price(la)
    
    la = create_nearby_avg_price(la)
    
    la = create_distance_to_pois(la)
    
    df_onehot = merge_with_onehot(la,onehot_filepath,min_price,max_price)
    
    return df_onehot
    
    
def create_neighborhood_avg_price(la):
    la_locations = la[['longitude','latitude']]
    
    neighborhood_averages = {}
    neighbourhood_counts = {}

    for neighborhood in la['neighbourhood_cleansed'].unique():
        neighborhood_averages[neighborhood] = la[la['neighbourhood_cleansed'] == neighborhood]['price'].mean()
        neighbourhood_counts[neighborhood] = len(la[la['neighbourhood_cleansed'] == neighborhood])

    la['neighbourhood_average_price'] = la['neighbourhood_cleansed'].map(neighborhood_averages)
    la['num_in_neighbourhood'] = la['neighbourhood_cleansed'].map(neighbourhood_counts)
    
    return la


def create_cluster_avg_price(la):
    la_locations = la[['longitude','latitude']]
    
    clustering = DBSCAN(eps=.01).fit(la_locations)
    la['cluster_number'] = clustering.labels_

    #Get average price of each cluster
    cluster_averages = {}
    for i in la['cluster_number'].unique():
        cluster_averages[i] = la[la['cluster_number']==i]['price'].mean()

    la['cluster_average_price'] = la['cluster_number'].map(cluster_averages)
    
    #include number in cluster too
    cluster_counts = {}
    for cluster in la['cluster_number'].unique():
        cluster_counts[cluster] = len(la[la['cluster_number'] == cluster])

    la['num_in_cluster'] = la['cluster_number'].map(cluster_counts)
    
    la = la.drop(['cluster_number'],axis='columns')

    return la

def create_nearby_avg_price(la):
    la_locations = la[['longitude','latitude']]
    
    knn = NearestNeighbors(n_neighbors=10)
    knn.fit(la_locations)

    distances, indices = knn.kneighbors(la_locations)

    #Could tweak this to only include properties within a certain distance of each other
    avg_nearby_prices = [0] * len(la_locations)
    for i in range(len(indices)):
        closest_points = indices[i,1:]
        avg_nearby_prices[i] = la.iloc[closest_points]['price'].mean()

    la['nearby_average_price'] = avg_nearby_prices
    
    return la

import geopy.distance

def create_distance_to_pois(la):
    points_of_interest = [(34.011535661131354, -118.50198735702507),
                      (34.047548066330336, -118.5633340820288),
                      (34.071312584445586, -118.35770894822),
                      (34.07931102198564, -118.47415412068081),
                      (34.0675483525067, -118.36083308699335),
                      (34.10471281374351, -118.34294029255402),
                      (34.11858649819086, -118.29835031188001),
                      (34.05648995479435, -118.24864810412235),
                      (34.045352,-118.245733),
                      (la['latitude'].mean(), la['longitude'].mean())]
    
    for i in range(len(points_of_interest)):
        column_name = 'dist_to_poi_' + str(i)
        poi_coord = points_of_interest[i]

        distances_to_poi = [0] * len(la)

        j = 0
        for lat,long in zip(la['latitude'],la['longitude']):
            distances_to_poi[j] = geopy.distance.geodesic(poi_coord,(lat,long))
            j+=1

        la[column_name] = [float(str(x).replace(' km','')) for x in distances_to_poi]
    
    return la

def merge_with_onehot(la,onehot_filepath,min_price,max_price):
    df_onehot = pd.read_csv('la_onehot.csv')
    df_onehot = df_onehot[(df_onehot['price'] > min_price) & (df_onehot['price'] < max_price)]

    df_onehot = df_onehot.merge(la[['id','neighbourhood_average_price','num_in_neighbourhood','cluster_average_price','num_in_cluster',
                                'nearby_average_price','dist_to_poi_0','dist_to_poi_1','dist_to_poi_2','dist_to_poi_3','dist_to_poi_4','dist_to_poi_5',
                               'dist_to_poi_6','dist_to_poi_7','dist_to_poi_8','dist_to_poi_9']],on='id')

    df_onehot = df_onehot.drop(['Unnamed: 0','id','description','host_since','neighbourhood','amenities','bathrooms'],axis="columns").dropna()

    price = df_onehot['price']
    df_onehot = df_onehot.drop(['price'],axis="columns")

    return df_onehot,price

In [5]:
df_onehot,price = create_all_location_features('listings.csv','la_onehot.csv')

In [44]:
df_onehot = df_onehot.dropna()

In [45]:
df_onehot

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,accommodates,...,dist_to_poi_0,dist_to_poi_1,dist_to_poi_2,dist_to_poi_3,dist_to_poi_4,dist_to_poi_5,dist_to_poi_6,dist_to_poi_7,dist_to_poi_8,dist_to_poi_9
0,1.00,0.86,0.0,1.0,1.0,1.0,1.0,34.106320,-118.223610,3,...,27.767471,32.027507,12.970705,23.314546,13.374713,11.012096,7.028714,5.990985,7.064366,11.889370
1,1.00,0.93,1.0,8.0,8.0,1.0,1.0,34.095740,-118.277880,4,...,22.701555,26.886221,7.850200,18.205560,8.270265,6.085457,3.160661,5.122048,6.328118,11.198290
2,1.00,0.81,1.0,8.0,8.0,1.0,1.0,33.987500,-118.432000,1,...,6.993902,13.839096,11.554331,10.902631,11.047144,15.383921,19.070690,18.583346,18.362691,17.834319
3,1.00,0.81,1.0,8.0,8.0,1.0,1.0,33.987500,-118.432000,1,...,6.993902,13.839096,11.554331,10.902631,11.047144,15.383921,19.070690,18.583346,18.362691,17.834319
5,0.89,0.84,0.0,7.0,7.0,1.0,1.0,33.919420,-118.209130,2,...,28.930862,35.681915,21.732454,30.232949,21.596771,23.983511,23.579027,15.636405,14.372282,9.365548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36467,0.96,0.94,1.0,5.0,5.0,1.0,1.0,33.705660,-117.887191,9,...,66.239934,73.152206,59.492997,68.305257,59.422613,61.122138,59.528431,51.307933,50.200168,46.140221
36469,1.00,1.00,1.0,1.0,1.0,1.0,0.0,33.791789,-117.881969,6,...,62.311130,69.098824,53.815431,63.361471,53.814163,54.959945,52.870186,44.851410,43.847107,40.337790
36477,1.00,0.96,1.0,7.0,7.0,1.0,1.0,33.695716,-117.956723,1,...,61.427829,68.359688,55.782875,64.042786,55.665969,57.742188,56.553998,48.278641,47.107028,42.708104
36479,1.00,1.00,0.0,0.0,0.0,1.0,1.0,33.807586,-118.008746,4,...,50.917549,57.775174,43.549324,52.531635,43.486269,45.171002,43.663108,35.415130,34.288740,30.182802


## Accuracy testing functions

In [16]:
def get_price_accuracy(price_pred,price_test,interval_halfwidth):
    in_range = [1 if (price_pred[i] >= (price_test.values[i] - interval_halfwidth)) and (price_pred[i] <= (price_test.values[i] + interval_halfwidth)) else 0 for i in range(len(price_test.values))]
    return sum(in_range) / len(in_range)

In [17]:
def get_pct_overpriced(price_pred,price_test,interval_halfwidth):
    in_range = [1 if (pred_prices[i] < (price_test.values[i] - interval_halfwidth)) else 0 for i in range(len(price_test.values))]
    return sum(in_range) / len(in_range)

In [18]:
def get_pct_underpriced(price_pred,price_test,interval_halfwidth):
    in_range = [1 if (pred_prices[i] > (price_test.values[i] + interval_halfwidth)) else 0 for i in range(len(price_test.values))]
    return sum(in_range) / len(in_range)

## Train-Test Split

In [7]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [46]:
indexes = pd.Series(df_onehot.index)
X_train, X_test, y_train, y_test = train_test_split(indexes, indexes, test_size=0.20, random_state=42)

In [47]:
listings_train = df_onehot.loc[X_train.values]
price_train = price.loc[y_train.values]

listings_test = df_onehot.loc[X_test.values]
price_test = price.loc[y_test.values]

print("Train Size: ", len(X_train), 'Test Size:', len(X_test))

Train Size:  16726 Test Size: 4182


## Note

NOTE - this notebook is provided as a cleaned example of how the geographically weighted models were tested
In our experiments we tested many more hyperparameter configurations while applying different preprocessing to the 
dataset, which led to improved results in the final product

## Geographically weighted linear regression

In [50]:
bandwidth = 107
for poi_num in range(0,10):
    w = np.exp(-.5 * (listings_train['dist_to_poi_'+str(poi_num)] / bandwidth)**2)
    
    lm = linear_model.LinearRegression()
    lm.fit(listings_train,price_train,w)
    
    pred_prices = lm.predict(listings_test)
    
    #print('==== POI #', poi_num)
    error = mean_absolute_error(pred_prices,price_test)
    #print("Linear Regressor:\nError:",error)
    percentage_correct = get_price_accuracy(pred_prices,price_test,25)
    #print('Linear Regressor % Correct: ',percentage_correct,'\n')

## Geographically weighted random forest

In [30]:
def train_weighted_rf(listings_train,price_train,listings_test,price_test):
    print('Baseline: ')
    rf_model = RandomForestClassifier(max_depth = 8)
    rf_model.fit(listings_train, price_train)
    
    pred_prices = rf_model.predict(listings_test)
    #error = pred_price - target_price
    error = mean_absolute_error(pred_prices,price_test)
    print("RF:\nError:",error)
    percentage_correct = get_price_accuracy(pred_prices,price_test,25)
    print('RF % Correct: ',percentage_correct,'\n')
    
    
    
    # === Weighted Random Forest ===
    #weighted_lin_reg_model = linear_model.LinearRegression()
    for i in range(0,10):
        weighted_rf_model = RandomForestClassifier(max_depth = 8)

        #Weighting
        weighted_listings_train = listings_train.copy(deep=True)
        weighted_listings_test = listings_test.copy(deep=True)
        for column in listings_train.columns:
            weighted_listings_train[column] = listings_train[column] * listings_train['dist_to_poi_'+str(i)]
            weighted_listings_test[column] = listings_test[column] * listings_test['dist_to_poi_'+str(i)]

        weighted_rf_model.fit(weighted_listings_train, price_train)

        ########### 
        pred_prices = weighted_rf_model.predict(weighted_listings_test)
        #error = pred_price - target_price
        error = mean_absolute_error(pred_prices,price_test)
        print('Poi number ',i)
        print("Error:",error)
        percentage_correct = get_price_accuracy(pred_prices,price_test,25)
        print('% Correct: ',percentage_correct,'\n\n\n')

In [32]:
train_weighted_rf(listings_train,price_train,listings_test,price_test)

## Training separate models for separate clusters

In [33]:
def train_separate_models(listings_train,price_train,listings_test,price_test,num_clusters):
    #Clustering
    kmeans = KMeans(n_clusters = num_clusters).fit(listings_train[['latitude','longitude']])
    
    cluster_model_mappings = {}
    
    #Train a model for members of each cluster
    for cluster in set(kmeans.labels_):
        cluster_points = listings_train[kmeans.labels_==cluster]
        cluster_prices = price_train[kmeans.labels_==cluster]
        
        #current_cluster_model = RandomForestClassifier(max_depth=50)
        #current_cluster_model = KNeighborsRegressor()
        #current_cluster_model = linear_model.Ridge(alpha=0.5)
        current_cluster_model = linear_model.LinearRegression()
        
        current_cluster_model.fit(cluster_points,cluster_prices)
        cluster_model_mappings[cluster] = current_cluster_model
    
    #Test
    predicted_clusters = kmeans.predict(listings_test[['latitude','longitude']])
    price_predictions = [0] * len(price_test)

    #Need to do this for all member points of a cluster, not one point at a time
    for cluster in set(predicted_clusters):
        cluster_points = listings_test[predicted_clusters==cluster]
        cluster_indices = listings_test[predicted_clusters==cluster].index
        predicted_price = cluster_model_mappings[predicted_clusters[cluster]].predict(cluster_points)
#         print('Cluster indices', list(cluster_indices))
#         print('Predicted prices',predicted_price)
        for i in cluster_points.index:
            price_predictions[list(price_test.index).index(i)] = predicted_price[list(cluster_indices).index(i)]
#             print('Price prediction index',list(price_test.index).index(i))
#             print('Predicted price index',list(cluster_indices).index(i))
#         print('\n\n\n')
#         print(list(price_predictions))
            
            
    error = mean_absolute_error(price_predictions,price_test)
    print("Random Forest:\nError:",error)
    percentage_correct = get_price_accuracy(price_predictions,price_test,25)
    print('Random Forest % Correct: ',percentage_correct,'\n')
    
    return error,percentage_correct

In [35]:
train_separate_models(listings_train,price_train,listings_test,price_test,num_clusters=3)

Random Forest:
Error: 8991.375206510516
Random Forest % Correct:  0.38570062171209946 



(8991.375206510516, 0.38570062171209946)

In [38]:
cluster_nums = []
errors = []
pcts_correct = []
for i in range(1,15):
    current_error, current_pct_correct = train_separate_models(
                                            listings_train,
                                            price_train,listings_test,
                                            price_test,
                                            num_clusters=i)
    cluster_nums.append(i)
    errors.append(current_error)
    pcts_correct.append(current_pct_correct)

## Random forest runtime analysis

In [40]:
from time import perf_counter, sleep
depths = []
errors = []
pcts = []

for i in range(8,25):
    # Random Forest

    start = perf_counter()
    # Some Code
    rf_model = RandomForestClassifier(max_depth = i)
    rf_model.fit(listings_train, price_train)
    end = perf_counter()
    
    pred_prices = rf_model.predict(listings_test)
    error = mean_absolute_error(pred_prices,price_test)
    pct_correct = get_price_accuracy(pred_prices,price_test,25)
    
    depths.append(i)
    errors.append(error)
    pcts.append(pct_correct)

    print('Depth: ',i)
    print(f"Time taken to execute code : {end-start}")
    print('Error: ',error)
    print('Pct Correct: ',pct_correct)
    print('===========\n\n\n')