In [1]:
from sklearn.metrics import mean_squared_error
from math import sqrt # rms = sqrt(mean_squared_error(y_true, y_predicted))
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
import pickle

# Read the two input CSVs: the REW dataset and the per-listing lat/long file.
rew = pd.read_csv("../data/REW_dataset.csv")
rew_latlong = pd.read_csv("../data/rew_data/property_listing_latlong.csv")

# Column names of the REW dataset, in file order.
cols = list(rew.columns)


def _columns_between(first, last):
    """Return the names in `cols` from `first` through `last`, both inclusive."""
    start = cols.index(first)
    stop = cols.index(last) + 1
    return cols[start:stop]


# Each group below is taken as the contiguous run of columns between two
# named boundary columns (this relies on the CSV keeping that column order).
type_list = _columns_between('House', 'Multifamily')

area_list = _columns_between('Whalley', 'Pitt Meadows Rural')

postal_list = _columns_between('V3S', 'V0Y')

features_list = _columns_between('Drapes/window Coverings', 'Vacuum Blt. In')

In [2]:
# Columns used from here on: price, the listing identifier, the numeric
# attributes, and the house-type columns collected in type_list.
req_cols = ['price', 'listing_id', 'bed', 'bath', 'area_sqft', 'fireplaces']
req_cols.extend(type_list)

rew = rew.loc[:, req_cols]

# Preview of the data used for recommendation
rew.head(2)

Unnamed: 0,price,listing_id,bed,bath,area_sqft,fireplaces,House,Apt/Condo,Townhouse,Land/Lot,Duplex,Mfd/Mobile Home,Multifamily
0,399800,R2333259,2,2.0,1208.0,1.0,0,0,1,0,0,0,0
1,2550000,R2277753,4,6.0,3312.0,1.0,1,0,0,0,0,0,0


In [3]:
# Input data creation: the attributes of the property to find matches for.
samp_price = 399800
samp_bed = 3
samp_bath = 2
samp_area_sqft = 1300
samp_fireplaces = 1

# use a drop down - type of house
samp_ptype = "House"

# A type that is not one of the type columns would be silently discarded by
# the column selection below, leaving a query row with every type flag at 0.
if samp_ptype not in type_list:
    raise ValueError("samp_ptype must be one of %s, got %r" % (type_list, samp_ptype))

sample_row = {"price": samp_price, "bed": samp_bed, "bath": samp_bath,
              "area_sqft": samp_area_sqft, "fireplaces": samp_fireplaces}

# One flag per house type: 1 for the requested type, 0 for every other type.
type_cols = set(type_list) - {samp_ptype}
sample_row.update(dict.fromkeys(type_cols, 0))
sample_row[samp_ptype] = 1

# Order the columns from type_list instead of a hard-coded list so the sample
# row always lines up with the type columns selected from the dataset.
sample_df = pd.DataFrame(sample_row, index=[0])[
    ['price', 'bed', 'bath', 'area_sqft', 'fireplaces'] + type_list
]

# Input DF
sample_df

Unnamed: 0,price,bed,bath,area_sqft,fireplaces,House,Apt/Condo,Townhouse,Land/Lot,Duplex,Mfd/Mobile Home,Multifamily
0,399800,3,2,1300,1,1,0,0,0,0,0,0


In [4]:
# Listing features indexed by listing_id, so the id is kept as the row label
# and only numeric columns take part in the scaling below.
sim_data = rew[
    ['price', 'listing_id', 'bed', 'bath', 'area_sqft', 'fireplaces'] + type_list
].set_index('listing_id')

# Put the sample row on top of all records so that it is scaled with the
# same per-column min and max as the listings.
sim_numeric = pd.concat([sample_df, sim_data], axis=0)

# Min-max normalisation per column.
col_min = sim_numeric.min()
col_range = sim_numeric.max() - col_min
# A column holding a single value has a range of 0, and 0/0 would fill it
# with NaN; dividing by 1 instead maps such a column to 0 everywhere.
col_range = col_range.replace(0, 1)
sim_normal = (sim_numeric - col_min) / col_range

# Normalized sample (first row)
normal_sample = sim_normal.iloc[0, :]

# Normalized listings (everything after the sample row)
sim_normal = sim_normal.iloc[1:, :]

In [5]:
# Preview only the first rows (the sample row followed by the first listings)
# rather than rendering the whole frame.
sim_numeric.head(10)

Unnamed: 0,price,bed,bath,area_sqft,fireplaces,House,Apt/Condo,Townhouse,Land/Lot,Duplex,Mfd/Mobile Home,Multifamily
0,399800,3,2.0,1300.0,1.0,1,0,0,0,0,0,0
R2333259,399800,2,2.0,1208.0,1.0,0,0,1,0,0,0,0
R2277753,2550000,4,6.0,3312.0,1.0,1,0,0,0,0,0,0
R2331580,398800,2,2.0,1015.0,1.0,0,1,0,0,0,0,0
R2334962,1098000,2,2.0,913.0,0.0,0,1,0,0,0,0,0
R2343106,1499000,3,3.0,1676.0,1.0,0,0,1,0,0,0,0
R2343523,999000,6,6.0,2766.0,1.0,1,0,0,0,0,0,0
R2333419,1638000,3,2.0,1374.0,1.0,0,1,0,0,0,0,0
R2339559,755000,2,2.0,980.0,1.0,0,1,0,0,0,0,0
R2340646,569900,3,3.0,1542.0,1.0,0,0,1,0,0,0,0


In [21]:
# (rows, columns) of the combined frame: the sample row plus all listings
print(sim_numeric.shape)

(14931, 12)


In [9]:
from sklearn.neighbors import NearestNeighbors

# Build the neighbour search over the normalized listings; each query
# returns the 30 closest rows by Euclidean distance.
nbrs = NearestNeighbors(
    n_neighbors=30,
    algorithm='auto',
    metric='euclidean',
)
nbrs.fit(sim_normal)

In [None]:
# Save the fitted model to the same path the loading cell reads from, so a
# top-to-bottom run loads the model that was just trained instead of
# whatever file happens to be in ../data.
# pickle is already imported in the first cell.
with open('../data/knn_model.pkl', 'wb') as model_file:
    pickle.dump(nbrs, model_file)

In [5]:
# Restore the previously saved neighbour model from disk.
# NOTE: pickle.load executes code embedded in the file, so only load a
# model file that you produced yourself.
with open('../data/knn_model.pkl', mode='rb') as model_file:
    loaded_model_knn = pickle.load(model_file)



In [6]:
# Query the loaded model with the normalized sample row. kneighbors returns
# a (distances, indices) pair with one row per query point.
vals = loaded_model_knn.kneighbors([normal_sample])

# Keep the neighbour positions for the single query row.
neighbor_positions = vals[1]
indices = [position for position in neighbor_positions[0]]

In [7]:
# Row positions of the nearest neighbours returned by the model
indices

[12431,
 7632,
 11981,
 1840,
 2358,
 6070,
 2239,
 132,
 2851,
 11577,
 2497,
 7031,
 10252,
 697,
 13193,
 3841,
 7099,
 8614,
 1234,
 5306,
 13188,
 14541,
 9819,
 2105,
 11040,
 12819,
 3332,
 9748,
 8170,
 13307]

In [9]:
# KNN similar records: look the neighbours up by row position and preview
# the first five of them.
sim_data.iloc[indices].head()

Unnamed: 0_level_0,price,bed,bath,area_sqft,fireplaces,House,Apt/Condo,Townhouse,Land/Lot,Duplex,Mfd/Mobile Home,Multifamily
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
R2322768,435000,3,2.0,1352.0,1.0,1,0,0,0,0,0,0
R2347609,599998,3,2.0,1350.0,1.0,1,0,0,0,0,0,0
R2329021,689500,3,2.0,1270.0,1.0,1,0,0,0,0,0,0
R2342852,688900,3,2.0,1254.0,1.0,1,0,0,0,0,0,0
R2338978,719000,3,2.0,1325.0,1.0,1,0,0,0,0,0,0


In [8]:
# All listings selected by the neighbour search, looked up by row position
sim_data.iloc[indices]

Unnamed: 0_level_0,price,bed,bath,area_sqft,fireplaces,House,Apt/Condo,Townhouse,Land/Lot,Duplex,Mfd/Mobile Home,Multifamily
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
R2322768,435000,3,2.0,1352.0,1.0,1,0,0,0,0,0,0
R2347609,599998,3,2.0,1350.0,1.0,1,0,0,0,0,0,0
R2329021,689500,3,2.0,1270.0,1.0,1,0,0,0,0,0,0
R2342852,688900,3,2.0,1254.0,1.0,1,0,0,0,0,0,0
R2338978,719000,3,2.0,1325.0,1.0,1,0,0,0,0,0,0
R2341114,728800,3,2.0,1266.0,1.0,1,0,0,0,0,0,0
R2333063,730000,3,2.0,1405.0,1.0,1,0,0,0,0,0,0
R2339997,729000,3,2.0,1420.0,1.0,1,0,0,0,0,0,0
R2348231,749900,3,2.0,1420.0,1.0,1,0,0,0,0,0,0
R2319752,799000,3,2.0,1308.0,1.0,1,0,0,0,0,0,0


In [18]:
# Move listing_id from the index back into a regular column. The guard makes
# the cell safe to re-run: resetting an already-reset frame would insert a
# spurious "index" column.
if sim_data.index.name == 'listing_id':
    sim_data = sim_data.reset_index()

In [19]:
# Preview only the first rows rather than rendering the whole frame.
sim_data.head(10)

Unnamed: 0,listing_id,price,bed,bath,area_sqft,fireplaces,House,Apt/Condo,Townhouse,Land/Lot,Duplex,Mfd/Mobile Home,Multifamily
0,R2333259,399800,2,2.0,1208.0,1.0,0,0,1,0,0,0,0
1,R2277753,2550000,4,6.0,3312.0,1.0,1,0,0,0,0,0,0
2,R2331580,398800,2,2.0,1015.0,1.0,0,1,0,0,0,0,0
3,R2334962,1098000,2,2.0,913.0,0.0,0,1,0,0,0,0,0
4,R2343106,1499000,3,3.0,1676.0,1.0,0,0,1,0,0,0,0
5,R2343523,999000,6,6.0,2766.0,1.0,1,0,0,0,0,0,0
6,R2333419,1638000,3,2.0,1374.0,1.0,0,1,0,0,0,0,0
7,R2339559,755000,2,2.0,980.0,1.0,0,1,0,0,0,0,0
8,R2340646,569900,3,3.0,1542.0,1.0,0,0,1,0,0,0,0
9,R2342344,958000,2,2.0,815.0,0.0,0,1,0,0,0,0,0


In [10]:
# required listing Ids
# listing_id is a column once the index has been reset, and the index before
# that. Reading it from wherever it currently lives gives the ids in both
# cases, rather than plain row numbers after the reset.
if 'listing_id' in sim_data.columns:
    req_list_ids = sim_data['listing_id'].values
else:
    req_list_ids = sim_data.index.values
print(req_list_ids)

['R2333259' 'R2277753' 'R2331580' ... 'R2335334' 'R2295410' 'R2338949']


In [12]:
# Attach the lat/long details to the neighbour listings by listing_id.
# Columns present in both frames come back with _x (lat/long file) and
# _y (neighbour rows) suffixes.
neighbor_rows = sim_data.iloc[indices, :]
result = rew_latlong.merge(neighbor_rows, on="listing_id")

In [15]:
# Summary of the merged frame: column names, non-null counts and dtypes
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 0 to 29
Data columns (total 30 columns):
price_x             30 non-null object
postal_code         30 non-null object
street_address      30 non-null object
listing_id          30 non-null object
bed_x               30 non-null int64
bath_x              30 non-null float64
area_sqft_x         30 non-null float64
type                30 non-null object
age                 28 non-null object
taxes               0 non-null float64
subarea             30 non-null object
style               30 non-null object
features            22 non-null object
amenities           18 non-null object
fireplaces_x        30 non-null float64
property_address    30 non-null object
lat                 30 non-null float64
lon                 30 non-null float64
price_y             30 non-null int64
bed_y               30 non-null int64
bath_y              30 non-null float64
area_sqft_y         30 non-null float64
fireplaces_y        30 non-null flo

In [21]:
# Keep just the listing id and its coordinates.
latlon = result.loc[:, ['listing_id', 'lat', 'lon']]

In [22]:
# Listing id with latitude and longitude for each merged row
latlon

Unnamed: 0,listing_id,lat,lon
0,R2339997,49.288825,-123.111121
1,R2347177,49.121556,-122.857322
2,R2333543,49.261767,-123.161946
3,R2342852,49.227795,-122.580927
4,R2327470,49.270942,-122.748315
5,R2333063,49.267393,-122.792314
6,R2338978,49.304258,-123.144252
7,R2348455,49.288825,-123.111121
8,R2348231,49.304258,-123.144252
9,R2345744,49.201333,-122.802818


In [23]:
# Plain nested list of [listing_id, lat, lon], one entry per row
[list(row) for row in latlon.values]

[['R2339997', 49.28882479999999, -123.1111209],
 ['R2347177', 49.1215556, -122.85732240000002],
 ['R2333543', 49.2617671, -123.1619461],
 ['R2342852', 49.2277948, -122.5809272],
 ['R2327470', 49.2709417, -122.74831470000001],
 ['R2333063', 49.2673935, -122.79231419999999],
 ['R2338978', 49.30425839999999, -123.14425220000001],
 ['R2348455', 49.28882479999999, -123.1111209],
 ['R2348231', 49.30425839999999, -123.14425220000001],
 ['R2345744', 49.20133329999999, -122.8028183],
 ['R2330529', 49.28882479999999, -123.1111209],
 ['R2343051', 49.28882479999999, -123.1111209],
 ['R2341114', 49.28858049999999, -122.7675284],
 ['R2332921', 49.1348044, -122.86761899999999],
 ['R2341836', 49.28882479999999, -123.1111209],
 ['R2347609', 49.1324639, -122.86331840000001],
 ['R2339132', 49.28882479999999, -123.1111209],
 ['R2336814', 49.28882479999999, -123.1111209],
 ['R2335385', 49.28882479999999, -123.1111209],
 ['R2331907', 49.28882479999999, -123.1111209],
 ['R2341890', 49.1760485, -122.8861567],