In [92]:
import pandas as pd
df = pd.read_csv('data.csv')


In [93]:
def convert_to_sqm(sqft):
    return round((sqft/10.764),2)
metrics_to_convert = ['sqft_living','sqft_above','sqft_basement']
for metric in metrics_to_convert:
    df[metric] = df[metric].apply(convert_to_sqm)

df['zip'] = df.statezip.apply(lambda address: address.split(' ')[1])


In [94]:
df.drop(columns = ['waterfront','view','country','sqft_lot','city','statezip','date','street','sqft_above'], inplace=True)

In [95]:
df.loc[ df.price == 0, "price" ] = df[df.price != 0].mean()[0]
df.loc[ df.bedrooms == 0, "bedrooms" ] = df[df.bedrooms != 0].mean()[1].round(0)
df.loc[ df.bathrooms == 0, "bathrooms" ] = df[df.bathrooms != 0].mean()[2].round(0)
df['price_per_sqm'] = df.price/df.sqft_living

In [96]:
df[['bedrooms','bathrooms','floors','price']] = df[['bedrooms','bathrooms','floors','price']].astype(int)
df.head(63)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,condition,sqft_basement,yr_built,yr_renovated,zip,price_per_sqm
0,313000,3,1,124.49,1,3,0.00,1955,2005,98133,2514.258173
1,2384000,5,2,339.09,2,5,26.01,1921,0,98119,7030.581851
2,342000,3,2,179.30,1,4,0.00,1966,0,98042,1907.417736
3,420000,3,2,185.80,1,4,92.90,1963,0,98008,2260.495156
4,550000,4,2,180.23,1,4,74.32,1976,1992,98052,3051.656217
...,...,...,...,...,...,...,...,...,...,...,...
58,385000,3,3,122.63,2,3,26.01,2008,0,98199,3139.525402
59,295000,2,2,151.43,2,3,32.52,2009,0,98106,1948.094829
60,555000,4,2,307.51,2,3,0.00,2012,1912,98065,1804.819355
61,459990,3,2,248.98,2,3,0.00,2013,1923,98045,1847.497791


In [56]:
X = df.copy()
X.drop(columns = ['price'], inplace = True)
y = df.price
X

Unnamed: 0,bedrooms,bathrooms,sqft_living,floors,condition,sqft_basement,yr_built,yr_renovated,zip,price_per_sqm
0,3,1,124.49,1,3,0.00,1955,2005,98133,2514.258173
1,5,2,339.09,2,5,26.01,1921,0,98119,7030.581851
2,3,2,179.30,1,4,0.00,1966,0,98042,1907.417736
3,3,2,185.80,1,4,92.90,1963,0,98008,2260.495156
4,4,2,180.23,1,4,74.32,1976,1992,98052,3051.656217
...,...,...,...,...,...,...,...,...,...,...
4595,3,1,140.28,1,4,0.00,1954,1979,98133,2196.796882
4596,3,2,135.64,2,3,0.00,1983,2009,98007,3939.349258
4597,3,2,279.64,2,3,0.00,2009,0,98059,1490.860273
4598,4,2,194.17,1,3,94.76,1974,0,98178,1047.535665


## Building ML model

In [57]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error

In [58]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [65]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
lr_clf = LinearRegression(fit_intercept=True, normalize=False)
lr_clf.fit(X_train,y_train)
prediction = lr_clf.predict(X_test)
lr_clf.score(X_test,y_test)
mean_absolute_error(prediction,y_test)

79271.07827747856

In [69]:
from sklearn.linear_model import Lasso
ls = Lasso(max_iter = 5000)
ls.fit(X_train,y_train)
prediction = ls.predict(X_test)
mean_absolute_error(prediction,y_test)

79270.87374209265

In [71]:
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(max_iter=5000,random_state = 42,learning_rate_init = 0.001,learning_rate = 'invscaling', hidden_layer_sizes =  500,activation = 'relu')
mlp.fit(X_train,y_train)
prediction = mlp.predict(X_test)
mean_absolute_error(prediction,y_test)

68261.52908385948

In [70]:
from sklearn.linear_model import ElasticNet
en = ElasticNet(max_iter=4000)
en.fit(X_train,y_train)
prediction = en.predict(X_test)
mean_absolute_error(prediction,y_test)

79067.9532773304

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from numpy import linspace

model = MLPRegressor(random_state = 42)


param_grid = {
    "max_iter" : [4000,5000,6000],
    "hidden_layer_sizes": [300,500,1000],
    "activation" : ["identity", "logistic", "tanh", "relu"],
    "learning_rate": ["constant", "invscaling", "adaptive"],
    "learning_rate_init" : np.linspace(0.001,0.01,3)
    
   
}

RS = RandomizedSearchCV(model, param_grid, scoring =  "neg_mean_absolute_error",cv= 3)


RS.fit(X_test,y_test)

RS.best_params_



In [103]:
import numpy as np
def predict_price(bedrooms,bathrooms,sqft_living,floors,condition,sqft_basement,yr_built,yr_renovated,zip_code):
                x = np.zeros(len(X.columns))
                x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7],x[8] =  bedrooms,bathrooms,sqft_living,floors,condition,sqft_basement,yr_built,zip_code,sqft_living/(bedrooms+bathrooms)
                x = pd.DataFrame(x).values.reshape(1,-1)
                return mlp.predict(x)
predict_price(1,1,56,1,4,0,1998,0,98059)

array([485074.9162695])

In [105]:
import pickle
with open('home_prices_model.pickle','wb') as f:
    pickle.dump(mlp,f)

In [106]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))