In [2]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.preprocessing
import scipy
from tqdm import tqdm

import multiprocessing

import ast
import collections
import os
import shutil
import pickle

import matplotlib as plt
import seaborn as sb

from IPython.display import clear_output

## Helper functions: share of listings that are correctly priced, overpriced, and underpriced

In [3]:
def get_pct_correct_price(price_pred,price_test,interval_halfwidth):
    """Return the fraction of predictions that fall within a band around the actual price.

    A prediction counts as correct when
    ``actual - interval_halfwidth <= predicted <= actual + interval_halfwidth``
    (both bounds inclusive).

    Parameters
    ----------
    price_pred : array-like
        Predicted prices, aligned positionally with ``price_test``.
    price_test : array-like
        Actual prices (a pandas Series, NumPy array, or plain sequence).
    interval_halfwidth : float
        Half-width of the tolerance band, in the same units as the prices.

    Returns
    -------
    float
        Fraction in ``[0, 1]``; ``0.0`` when there are no samples.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.
    """
    # Positional NumPy views: works for Series, ndarray and lists alike and
    # avoids any dependence on a pandas index.
    predicted = np.asarray(price_pred, dtype=float).ravel()
    actual = np.asarray(price_test, dtype=float).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            f"price_pred has {predicted.size} values but price_test has {actual.size}"
        )
    if actual.size == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    within_band = (predicted >= actual - interval_halfwidth) & (predicted <= actual + interval_halfwidth)
    return int(within_band.sum()) / within_band.size

In [4]:
def get_pct_overpriced(price_pred,price_test,interval_halfwidth):
    """Return the fraction of listings whose actual price is above the tolerance band.

    A listing counts as overpriced when the prediction is lower than the
    actual price by more than ``interval_halfwidth``, i.e.
    ``predicted < actual - interval_halfwidth``.

    Parameters
    ----------
    price_pred : array-like
        Predicted prices, aligned positionally with ``price_test``.
    price_test : array-like
        Actual prices (a pandas Series, NumPy array, or plain sequence).
    interval_halfwidth : float
        Half-width of the tolerance band, in the same units as the prices.

    Returns
    -------
    float
        Fraction in ``[0, 1]``; ``0.0`` when there are no samples.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.
    """
    # Use the price_pred argument: the previous version read the notebook
    # global `pred_prices`, silently ignoring whatever the caller passed in.
    predicted = np.asarray(price_pred, dtype=float).ravel()
    actual = np.asarray(price_test, dtype=float).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            f"price_pred has {predicted.size} values but price_test has {actual.size}"
        )
    if actual.size == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    below_band = predicted < (actual - interval_halfwidth)
    return int(below_band.sum()) / below_band.size

In [5]:
def get_pct_underpriced(price_pred,price_test,interval_halfwidth):
    """Return the fraction of listings whose actual price is below the tolerance band.

    A listing counts as underpriced when the prediction is higher than the
    actual price by more than ``interval_halfwidth``, i.e.
    ``predicted > actual + interval_halfwidth``.

    Parameters
    ----------
    price_pred : array-like
        Predicted prices, aligned positionally with ``price_test``.
    price_test : array-like
        Actual prices (a pandas Series, NumPy array, or plain sequence).
    interval_halfwidth : float
        Half-width of the tolerance band, in the same units as the prices.

    Returns
    -------
    float
        Fraction in ``[0, 1]``; ``0.0`` when there are no samples.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.
    """
    # Use the price_pred argument: the previous version read the notebook
    # global `pred_prices`, silently ignoring whatever the caller passed in.
    predicted = np.asarray(price_pred, dtype=float).ravel()
    actual = np.asarray(price_test, dtype=float).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            f"price_pred has {predicted.size} values but price_test has {actual.size}"
        )
    if actual.size == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    above_band = predicted > (actual + interval_halfwidth)
    return int(above_band.sum()) / above_band.size

# Predict Price

In [6]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error

In [7]:
# Mount Google Drive inside the Colab VM at /content/drive.
# NOTE(review): this import only exists on Google Colab; the cell will fail elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Create the local scratch directory tree.
# `!cd` runs in a throwaway subshell and never changes the kernel's working
# directory, so the intended /content root is spelled out explicitly instead.
# exist_ok=True keeps the cell re-runnable (plain `mkdir` errors once the
# directories exist), and makedirs creates all intermediate levels in one call.
os.makedirs('/content/project_data/inside_airbnb/united-states/', exist_ok=True)

In [9]:
# Which trained model to load (state / city it was saved under).
model_state, model_city = "dc", "washington-dc"
model_dir = f"/content/drive/MyDrive/CS 6220/Project/encoded_csvs/united-states/{model_state}/{model_city}"

# Which dataset to score with that model; may point at a different city than the model.
data_state, data_city = "dc", "washington-dc"
data_dir = f"/content/drive/MyDrive/CS 6220/Project/encoded_csvs/united-states/{data_state}/{data_city}"

In [10]:
# Encoded listings for the selected data city.
df_onehot = pd.read_csv(os.path.join(data_dir, f'{data_city}-NO-NA-with-locations.csv'))

# Load the pickled regression model inside a context manager so the file
# handle is closed deterministically (a bare `pickle.load(open(...))` leaks it).
# NOTE(security): unpickling executes arbitrary code; only load model files
# from a source you trust.
with open(os.path.join(model_dir, f'{model_city}-linear-reg-model.sav'), 'rb') as model_file:
    model = pickle.load(model_file)

In [11]:
# Number of listings (rows) in the loaded dataset.
print(df_onehot.shape[0])

2805


In [12]:
# Actual listing prices (the regression target), kept aside for evaluation.
price = df_onehot.price

# Remove the identifier / free-text columns before casting to float32.
# DataFrame.drop raises KeyError when any listed column is absent; in that case
# fall back to the unmodified frame, exactly as before. Catching only KeyError
# (instead of a bare `except:`) lets genuine failures such as a non-numeric
# column or a KeyboardInterrupt surface instead of being swallowed.
try:
    model_df_onehot = df_onehot.drop(['neighbourhood', 'neighbourhood_cleansed', 'id'], axis="columns").astype('float32')
except KeyError:
    model_df_onehot = df_onehot.astype('float32')

# The target must not be part of the feature matrix.
model_df_onehot = model_df_onehot.drop(['price'], axis='columns')

pred_prices = model.predict(model_df_onehot)
lr_error = mean_absolute_error(price, pred_prices)
print("Linear Regression Mean Absolute Error:", lr_error)
print("")

Linear Regression Mean Absolute Error: 39.13802118148396



In [13]:
# Half-width of the band around the actual price within which a prediction
# is considered "correct" (same units as the price column).
margin_of_error = 25

pct_correct = get_pct_correct_price(pred_prices, price, margin_of_error)
pct_overpriced = get_pct_overpriced(pred_prices, price, margin_of_error)
pct_underpriced = get_pct_underpriced(pred_prices, price, margin_of_error)

# Use the '%' format type (multiplies by 100 and rounds to one decimal).
# The previous `str(x * 100)[:4]` slice truncated instead of rounding and
# produced malformed text for values below 10 ("5.12"), exactly 100 ("100.")
# or very small values rendered in scientific notation ("1e-0").
print("% Correct Price: ", f"{pct_correct:.1%}")
print("% Overpriced: ", f"{pct_overpriced:.1%}")
print("% Underpriced: ", f"{pct_underpriced:.1%}")

% Correct Price:  43.4%
% Overpriced:  25.5%
% Underpriced:  30.9%


In [32]:
# Pair every fitted coefficient with its feature column, ordered from the
# largest absolute coefficient to the smallest.
col_coefs = sorted(
    zip(model.coef_, model_df_onehot.columns),
    key=lambda coef_and_name: abs(coef_and_name[0]),
    reverse=True,
)
print(col_coefs)

[(-172.31558, 'longitude'), (122.7282, 'room_type_Hotel room'), (-92.99076, 'room_type_Shared room'), (50.976917, 'bathrooms_text_4.5 baths'), (48.515163, 'bathrooms_text_3.5 baths'), (-48.224617, 'bathrooms_text_1 bath'), (43.147682, 'latitude'), (-36.928337, 'bathrooms_text_3 shared baths'), (-36.883865, 'room_type_Private room'), (36.008446, 'bathrooms_text_2.5 baths'), (27.911556, 'bedrooms'), (-24.875507, 'bathrooms_text_1 shared bath'), (24.231443, 'bathrooms_text_3 baths'), (22.175215, 'bathrooms_text_4 baths'), (-19.395374, 'bathrooms_text_2 shared baths'), (-18.90445, 'bathrooms_text_1.5 shared baths'), (-13.988963, 'hot water'), (-11.989054, 'bathrooms_text_1 private bath'), (-11.811121, 'smoke alarm'), (-11.10816, 'essentials'), (10.831406, 'host_listings_count'), (-10.79493, 'host_total_listings_count'), (-10.728805, 'bathrooms_text_2.5 shared baths'), (10.337104, 'hair dryer'), (10.012029, 'stove'), (9.41461, 'dishwasher'), (9.289784, 'bed linens'), (8.971242, 'review_scor

In [33]:
# Feature names only, in the same order as col_coefs.
col_order = [feature_name for _, feature_name in col_coefs]
print(col_order)

['longitude', 'room_type_Hotel room', 'room_type_Shared room', 'bathrooms_text_4.5 baths', 'bathrooms_text_3.5 baths', 'bathrooms_text_1 bath', 'latitude', 'bathrooms_text_3 shared baths', 'room_type_Private room', 'bathrooms_text_2.5 baths', 'bedrooms', 'bathrooms_text_1 shared bath', 'bathrooms_text_3 baths', 'bathrooms_text_4 baths', 'bathrooms_text_2 shared baths', 'bathrooms_text_1.5 shared baths', 'hot water', 'bathrooms_text_1 private bath', 'smoke alarm', 'essentials', 'host_listings_count', 'host_total_listings_count', 'bathrooms_text_2.5 shared baths', 'hair dryer', 'stove', 'dishwasher', 'bed linens', 'review_scores_value', 'cooking basics', 'accommodates', 'hangers', 'washer', 'bathrooms_text_3.5 shared baths', 'microwave', 'refrigerator', 'host_acceptance_rate', 'room_type_Entire home/apt', 'shampoo', 'oven', 'bathrooms_text_0 shared baths', 'carbon monoxide alarm', 'kitchen', 'tv', 'wifi', 'dishes and silverware', 'air conditioning', 'free street parking', 'review_scores_

## Finding an Overpriced and an Underpriced Property

In [20]:
# Show the feature row labelled 100 as a one-row DataFrame.
model_df_onehot.loc[[100]]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,host_acceptance_rate,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bedrooms,beds,...,dryer,cleaning products,air conditioning,private entrance,tv,neighbourhood_average_price,num_in_neighbourhood,cluster_average_price,num_in_cluster,nearby_average_price
100,100.0,178.0,1.0,4.0,4.0,38.90641,-77.012772,3.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,150.274231,361.0,158.434814,5952.0,133.111115


In [28]:
import random

# random.randint includes BOTH endpoints, so an upper bound of len(...) could
# select a label one past the last row; randrange excludes the upper bound.
random1 = random.randrange(len(model_df_onehot))
# Pin the row so the example below is reproducible; comment this line out to
# inspect a randomly drawn listing instead.
random1 = 1535
random1_data = pd.DataFrame([model_df_onehot.loc[random1]])
random1_real_price = price.loc[random1]

random1_pred_price = model.predict(random1_data)[0]
# Negative difference: the listing charges more than the model predicts.
random1_diff = random1_pred_price - random1_real_price

print("Pred Price:", random1_pred_price)
print("Real Price:", random1_real_price)
print("Price Difference:", random1_diff)
print("Overpriced" if random1_diff < 0 else "Underpriced")

Pred Price: 314.74512
Real Price: 375.0
Price Difference: -60.2548828125
Overpriced


In [36]:
# Arrange the selected listing's columns according to col_order, save the
# single row to CSV, and display it.
random1_reordered = random1_data.reindex(col_order, axis="columns")
random1_reordered.to_csv('random1.csv')
random1_reordered

Unnamed: 0,longitude,room_type_Hotel room,room_type_Shared room,bathrooms_text_4.5 baths,bathrooms_text_3.5 baths,bathrooms_text_1 bath,latitude,bathrooms_text_3 shared baths,room_type_Private room,bathrooms_text_2.5 baths,...,bathrooms_text_5.5 baths,bathrooms_text_8 shared baths,bathrooms_text_11 shared baths,bathrooms_text_6.5 baths,bathrooms_text_5 baths,bathrooms_text_Private half-bath,bathrooms_text_4.5 shared baths,bathrooms_text_Half-bath,bathrooms_text_Shared half-bath,maximum_nights_avg_ntm
1535,-77.01033,0.0,0.0,0.0,0.0,0.0,38.917351,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1125.0


In [63]:
# random.randint includes BOTH endpoints, so an upper bound of len(...) could
# select a label one past the last row; randrange excludes the upper bound.
random2 = random.randrange(len(model_df_onehot))
# Pin the row so the example below is reproducible; comment this line out to
# inspect a randomly drawn listing instead.
random2 = 1676
random2_data = pd.DataFrame([model_df_onehot.loc[random2]])
random2_real_price = price.loc[random2]

random2_pred_price = model.predict(random2_data)[0]
# Positive difference: the model predicts more than the listing charges.
random2_diff = random2_pred_price - random2_real_price

print("Pred Price:", random2_pred_price)
print("Real Price:", random2_real_price)
print("Price Difference:", random2_diff)
print("Overpriced" if random2_diff < 0 else "Underpriced")

Pred Price: 277.2744
Real Price: 209.0
Price Difference: 68.2744140625
Underpriced


In [64]:
# Arrange the selected listing's columns according to col_order, save the
# single row to CSV, and display it.
random2_reordered = random2_data.reindex(col_order, axis="columns")
random2_reordered.to_csv('random2.csv')
random2_reordered

Unnamed: 0,longitude,room_type_Hotel room,room_type_Shared room,bathrooms_text_4.5 baths,bathrooms_text_3.5 baths,bathrooms_text_1 bath,latitude,bathrooms_text_3 shared baths,room_type_Private room,bathrooms_text_2.5 baths,...,bathrooms_text_5.5 baths,bathrooms_text_8 shared baths,bathrooms_text_11 shared baths,bathrooms_text_6.5 baths,bathrooms_text_5 baths,bathrooms_text_Private half-bath,bathrooms_text_4.5 shared baths,bathrooms_text_Half-bath,bathrooms_text_Shared half-bath,maximum_nights_avg_ntm
1676,-76.994766,0.0,0.0,0.0,0.0,0.0,38.893799,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1125.0
