# Predictions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from k_mean_cluster_util import assign_centroids
from sklearn.linear_model import SGDRegressor

In [2]:
df = pd.read_csv('./kc_house_data.csv')
df.set_index('id', inplace=True)
df = df[~df.index.duplicated(keep='first')]
df = df.drop(['date'], axis=1)
prices = df['price']
df = df.drop('price', axis=1)
df['price'] = prices
df.head()

Unnamed: 0_level_0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
7129300520,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,221900.0
6414100192,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,538000.0
5631500400,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,180000.0
2487200875,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,604000.0
1954400510,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,510000.0


In [3]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bedrooms,21436.0,3.371571,0.929205,0.0,3.0,3.0,4.0,33.0
bathrooms,21436.0,2.117349,0.769913,0.0,1.75,2.25,2.5,8.0
sqft_living,21436.0,2082.704936,919.146469,290.0,1430.0,1920.0,2550.0,13540.0
sqft_lot,21436.0,15135.637852,41538.620606,520.0,5040.0,7614.0,10696.25,1651359.0
floors,21436.0,1.496198,0.540388,1.0,1.0,1.5,2.0,3.5
waterfront,21436.0,0.007604,0.086871,0.0,0.0,0.0,0.0,1.0
view,21436.0,0.235118,0.767092,0.0,0.0,0.0,0.0,4.0
condition,21436.0,3.410384,0.650235,1.0,3.0,3.0,4.0,5.0
grade,21436.0,7.661737,1.174256,1.0,7.0,7.0,8.0,13.0
sqft_above,21436.0,1790.96044,829.026491,290.0,1200.0,1560.0,2220.0,9410.0


In [4]:
def max_price(df, mp):
    return df[df['price'] <= mp]

def min_bedrooms(df, mb):
    return df[df['bedrooms'] >= mb]

def min_bathrooms(df, mb):
    return df[df['bathrooms'] >= mb]

def min_sqft_living(df, ms):
    return df[df['sqft_living'] >= ms]

def min_floors(df, mf):
    return df[df['floors'] >= mf]

def min_year_renovated(df, myr):
    return df[df['yr_renovated'] >= myr]

def min_lat(df, ml):
    return df[df['lat'] >= ml]

def max_lat(df, ml):
    return df[df['lat'] <= ml]

def min_long(df, ml):
    return df[df['long'] >= ml]

def max_long(df, ml):
    return df[df['long'] <= ml]

In [5]:
coefficients = np.array([ 2.95826076e+02,  1.05354857e+03,  2.64951733e+03,  6.39403846e+02,
                        9.90098381e+02,  1.03389743e+03,  1.34733542e+03,  4.34647043e+03,
                        5.98598209e+02, -1.45318435e+03,  4.93630110e+02,  4.20734260e+03,
                        5.87624109e+01, -4.20810267e+04,  1.13511616e+05,  4.61644477e+04,
                        1.10154058e+05,  1.00671271e+05, -2.57383826e+04,  6.37085098e+04,
                        2.59101446e+04,  8.15426837e+04])

intercept = np.array([477587.54409523])

centroids = np.array([[ 4.75343388e+01, -1.22218856e+02,  3.52609415e+05],
                   [ 4.74142630e+01, -1.22235429e+02,  1.96769369e+05],
                   [ 4.76174581e+01, -1.22215019e+02,  1.01796959e+06],
                   [ 4.76152447e+01, -1.22206690e+02,  5.03714999e+05],
                   [ 4.76244488e+01, -1.22195004e+02,  8.71409893e+05],
                   [ 4.76185047e+01, -1.22186274e+02,  7.58020669e+05],
                   [ 4.74511331e+01, -1.22220358e+02,  2.77490028e+05],
                   [ 4.76151207e+01, -1.22208065e+02,  5.76740093e+05],
                   [ 4.76008047e+01, -1.22218290e+02,  4.28626376e+05],
                   [ 4.76213542e+01, -1.22201022e+02,  6.60373193e+05]])

mu = np.array([ 3.33060336e+00,  2.05666184e+00,  1.97976990e+03,  1.47979336e+04,
                1.47819781e+00,  1.71706542e-01,  3.40795708e+00,  7.53544426e+00,
                2.67357574e+02,  1.97102051e+03,  7.61931686e+01,  4.75566330e+01,
               -1.22212570e+02,  9.12114158e-02,  3.19024548e-02,  1.13444656e-01,
                5.06251343e-02,  7.08874986e-02,  1.58650130e-01,  9.58921835e-02,
                1.46825165e-01,  8.16037953e-02])

sigma2 = np.array([[8.40753734e-01, 5.09700418e-01, 6.01082938e+05, 1.63552781e+09,
                   2.89048374e-01, 4.04463887e-01, 4.16067839e-01, 1.07746780e+00,
                   1.67958125e+05, 8.49030884e+02, 1.46240641e+05, 1.99611578e-02,
                   2.02456526e-02, 8.28919038e-02, 3.08846850e-02, 1.00574784e-01,
                   4.80622351e-02, 6.58623725e-02, 1.33479938e-01, 8.66968483e-02,
                   1.25267491e-01, 7.49443844e-02]])

try:
    import joblib
    coefficients, intercept, centroids, mu, sigma2 = joblib.load('trained_data.data')
    print('Loaded Data!')
except:
    print(f'Did not find trained data, so recommencing with hardcoded data!')

Loaded Data!


In [6]:
def get_cluster(lat, long, price, centroids):
    k, _ = centroids.shape
    idx = assign_centroids(np.array([[lat, long, price]]), centroids)
    one_hot = np.zeros(k - 1)
    if idx[0] != 0: one_hot[idx[0] - 1] = 1
    return one_hot

In [7]:
def assemble(house, centroids):
    temp = house.drop(labels=['price', 'sqft_living15', 'sqft_lot15', 'sqft_above', 'waterfront', 'zipcode'])
    X = temp.values
    one_hot = get_cluster(temp['lat'], temp['long'], house.price, centroids) 
    X = np.concatenate([X, one_hot])
    return X

def z_normalize(X, mu, sigma2):
    return (X - mu) / np.sqrt(sigma2)

def predict(X, coefficients, intercept):
    return np.dot(coefficients, X) + intercept

In [8]:
def ask_about(msg):
    print(f'{msg} (Y, N)>>', end=' ')
    res = input().lower()
    return True if res == 'y' else False

In [9]:
if ask_about("Filter by price?"):
    m_price = int(input('Maximum price: '))
    df = max_price(df, m_price)
    print(f'{df.shape[0]} houses have been found!')

if ask_about("Filter by number of bedrooms?"):
    m_bed = int(input('Min bedrooms: '))
    df = min_bedrooms(df, m_bed)
    print(f'{df.shape[0]} houses have been found!')

if ask_about("Filter by number of bathrooms?"):
    m_bath = int(input('Min bathrooms: '))
    df = min_bathrooms(df, m_bath)
    print(f'{df.shape[0]} houses have been found!')
    
if ask_about("Filter by sqft living?"):
    m_sqft_living = int(input('Min sqft living: '))
    df = min_sqft_living(df, m_sqft_living)
    print(f'{df.shape[0]} houses have been found!')
    
if ask_about("Filter by number of floors?"):
    m_sqft_living = int(input('Min sqft living: '))
    df = min_sqft_living(df, m_sqft_living)
    print(f'{df.shape[0]} houses have been found!')
    
if ask_about("Filter by year of renovated?"):
    yr = int(input('Min year of renovation (0 if no renovation): '))
    df = min_year_renovated(df, yr)
    print(f'{df.shape[0]} houses have been found!')
    
if ask_about("Filter by lat and long?"):
    min_lat = int(input('Min lat: '))
    min_long = int(input('Min long: '))
    max_lat = int(input('Max lat: '))
    max_long = int(input('Max long: '))
    df = min_lat(df, min_lat)
    df = max_lat(df, max_lat)
    df = min_long(df, min_long)
    df = max_long(df, max_long)
    print(f'{df.shape[0]} houses have been found!')

predicted = np.zeros(df.shape[0])
for i in range(df.shape[0]):
    X = assemble(df.iloc[i], centroids)
    predicted[i] = predict(z_normalize(X, mu, sigma2), coefficients, intercept)[0]

predicted = predicted.astype(np.int64)
df['predicted'] = predicted

Filter by price? (Y, N)>> y
Maximum price: 2000000
21238 houses have been found!
Filter by number of bedrooms? (Y, N)>> y
Min bedrooms: 6
308 houses have been found!
Filter by number of bathrooms? (Y, N)>> y
Min bathrooms: 5
15 houses have been found!
Filter by sqft living? (Y, N)>> n
Filter by number of floors? (Y, N)>> y
Min sqft living: 4
15 houses have been found!
Filter by year of renovated? (Y, N)>> y
Min year of renovation (0 if no renovation): 0
15 houses have been found!
Filter by lat and long? (Y, N)>> n


In [10]:
df

Unnamed: 0_level_0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price,predicted
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3225079035,6,5.0,6050,230652,2.0,0,3,3,11,6050,0,2001,0,98024,47.6033,-121.943,4210,233971,1600000.0,1042966
9126101740,8,5.0,2800,2580,2.0,0,0,3,8,1880,920,1997,0,98122,47.6086,-122.303,1800,2580,490000.0,512802
3185600055,6,5.0,3440,4500,2.0,0,0,3,8,3280,160,2007,0,98055,47.4871,-122.219,1400,5500,495000.0,509154
9175600025,7,6.75,7480,41664,2.0,0,2,3,11,5080,2400,1953,0,98166,47.4643,-122.368,2810,33190,800000.0,794119
9297300740,6,5.25,3600,3960,2.0,0,0,3,7,2400,1200,1971,0,98126,47.5656,-122.372,1450,4600,643500.0,663787
3886901795,6,5.0,2850,6600,2.0,0,0,3,7,2850,0,1994,0,98033,47.6813,-122.187,1870,9900,655000.0,661514
424049043,9,7.5,4050,6504,2.0,0,0,3,7,4050,0,1996,0,98144,47.5923,-122.301,1448,3866,450000.0,442757
627300145,10,5.25,4590,10920,1.0,0,2,3,9,2500,2090,2008,0,98004,47.5861,-122.113,2730,10400,1148000.0,1026579
6744700427,7,5.75,3700,7647,2.0,0,1,3,8,3700,0,1948,1984,98155,47.7393,-122.289,2510,7479,540000.0,525826
7116500920,6,5.25,2860,5682,2.0,0,0,3,7,2860,0,1978,0,98002,47.303,-122.221,1390,5956,300000.0,283078
