<a href="https://colab.research.google.com/github/AmirhosseinSalamirad/House-Price-Valuation/blob/main/HousePrice_NewApproach_emadHamed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# House Price Valuation From Visual and Textual Features
This code is developed to test the methodology presented in *Ahmed, E., & Moustafa, M. (2016). House price estimation from visual and textual features. arXiv preprint arXiv:1609.08399.*

In [None]:
import json, re, os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import neighbors
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, label_binarize,MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Install pinned OpenCV builds (base package and the contrib package) with pip.
# NOTE(review): presumably pinned to 3.4.2.16 because the commented-out SURF
# extraction cell further down calls cv2.xfeatures2d.SURF_create() — confirm
# that this pin is still required, since no active cell imports cv2.
!sudo python3 -m pip install opencv-python==3.4.2.16
!sudo python3 -m pip install opencv-contrib-python==3.4.2.16



# Testing old paper approach

In [None]:
!git clone https://github.com/emanhamed/Houses-dataset.git

Cloning into 'Houses-dataset'...
remote: Enumerating objects: 2166, done.[K
remote: Total 2166 (delta 0), reused 0 (delta 0), pack-reused 2166[K
Receiving objects: 100% (2166/2166), 176.26 MiB | 24.02 MiB/s, done.
Resolving deltas: 100% (20/20), done.


In [None]:
# import cv2
# import os
# import numpy as np

# rootdir = "/content/Houses-dataset/Houses Dataset/"
# surf = cv2.xfeatures2d.SURF_create()
# final_results = []

# img_types = ['bathroom', 'bedroom', 'frontal', 'kitchen']
# for img_idx in range(1, 21):#536):
# 	temp_results = []
# 	for img_type in img_types:
# 		image = cv2.imread(rootdir + str(img_idx) + '_' + img_type + '.jpg')
# 		image = image.astype('uint8')
# 		gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# 		(kps, descs) = surf.detectAndCompute(gray, None)
# 		temp_results.append(descs[:4, :].flatten())
# 	final_results.append(temp_results)
# 	print(img_idx)

In [None]:
# final_results = np.array(final_results)
# final_results = final_results.reshape(20,-1)
# final_results.shape

In [None]:
# Load the two CSV inputs from the working directory and show their shapes.
# NOTE(review): presumably SURF_features.csv holds image descriptors exported
# by the commented-out extraction cell above — confirm its provenance.
house_data = pd.read_csv('HousesInfo.csv')
SURF_features = pd.read_csv('SURF_features.csv')
print(house_data.shape, SURF_features.shape, sep='\n')

In [None]:
# Place the SURF columns and the tabular columns side by side (rows are aligned
# on the index). house_data goes last so that its final column remains the last
# column of `a`, which train_model treats as the target.
# `axis` must be passed by keyword: the positional form was deprecated and
# raises TypeError on pandas 2.x.
a = pd.concat([SURF_features, house_data], axis=1)
a.shape

In [None]:
def train_model(dataset, base_model, params=None, verbose=0):
    """Fit ``base_model`` on a train/test split of ``dataset`` and print metrics.

    Every column except the last is used as a feature; the last column is the
    regression target. 33% of the rows are held out (``random_state=42``).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns followed by the target in the final column.
    base_model : estimator
        Unfitted scikit-learn regressor; it becomes the ``'regressor'`` step
        of the pipeline, so grid keys must be prefixed with ``regressor__``.
    params : dict, optional
        Parameter grid for ``GridSearchCV``. When falsy, the pipeline is fitted
        once with the model's current settings.
    verbose : int, default 0
        Verbosity forwarded to ``GridSearchCV``.

    Returns
    -------
    The fitted ``GridSearchCV`` when ``params`` is given, otherwise the fitted
    ``Pipeline``.
    """
    # Scaling scaffold for the tabular columns. It is built but currently NOT
    # part of the pipeline (the 'preprocessor' step below is commented out).
    numeric_features = ['num_bedroom', 'num_bathroom', 'house_area', 'zipcode']
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler()),
        # ('scaler', RobustScaler()),
        # ('scaler', MinMaxScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
        ])

    # Prediction pipeline: (optional preprocessing) + regressor.
    pipe = Pipeline(steps=[
        # ('preprocessor', preprocessor),
        ('regressor', base_model)],
        verbose=False)

    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    print('X_train.shape:\t{}'.format(X_train.shape))
    print('X_test.shape:\t{}'.format(X_test.shape))

    if params:
        # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24; leaving
        # it out keeps the same (unweighted) fold averaging on 0.22+.
        model = GridSearchCV(pipe, params, cv=3, n_jobs=-1, verbose=verbose)
        model.fit(X_train, y_train)
        print("model score: {:.3}".format(model.score(X_test, y_test)))
        print('Best Params:{}'.format(model.best_params_))
        print('Best Score:{:.3}'.format(model.best_score_))
        # Cross-validate the tuned pipeline, not the untuned `pipe`, so the CV
        # figure describes the same model as the MSE/MAE printed below.
        cv_estimator = model.best_estimator_
        cv_label = 'cv_scores'
    else:
        model = pipe
        model.fit(X_train, y_train)
        cv_estimator = pipe
        cv_label = 'cv_score'

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print('MSE = \t{:.4}'.format(mse))
    mae = mean_absolute_error(y_test, y_pred)
    print('MAE = \t{:.4}'.format(mae))

    # 5-fold CV on the held-out split only. An explicit MSE scorer is used in
    # both branches: the default scorer is R^2, which would contradict the
    # "CV MSE" label printed below.
    cv_score = cross_val_score(cv_estimator, X_test, y_test,
                               scoring=make_scorer(mean_squared_error), cv=5)
    print('{} = \t{}'.format(cv_label, cv_score))
    print("CV MSE =\t{:.4} (+/- {:.3})".format(cv_score.mean(), cv_score.std() * 2))

    return model


In [None]:
%%time
regressor = neighbors.KNeighborsRegressor()
# params = None
params = {'regressor__n_neighbors': list(range(15,25)), 'regressor__weights': ['uniform', 'distance']}
# Best Params:{'regressor__n_neighbors': 24, 'regressor__weights': 'distance'}
train_model(a, regressor, params)

In [None]:
%%time
regressor = GradientBoostingRegressor()
# regressor = GradientBoostingRegressor(
#         learning_rate = 0.1,
#         max_depth = 6,
#         max_features = 0.3,
#         min_samples_leaf = 9,
#         n_estimators = 100
#     )
# params = None
# Best Params:{'regressor__learning_rate': 0.1, 'regressor__max_depth': 6, 'regressor__max_features': 0.3, 'regressor__min_samples_leaf': 3, 'regressor__n_estimators': 100}
params = {
            'regressor__n_estimators':[100],
            'regressor__learning_rate': [0.1, 0.05, 0.02, 0.01],
            'regressor__max_depth':[4,6],
            'regressor__min_samples_leaf':[3,5,9,17],
            'regressor__max_features':[1.0,0.3,0.1]
        }

train_model(a, regressor, params)

In [None]:
# X = house_data.iloc[:, :-1]
# Rescale the target (the last column of `a`) with MinMaxScaler.
y = a.iloc[:, -1]

target_scaler = MinMaxScaler()
# The scaler needs a 2-D (n_samples, 1) array. Indexing a Series with
# [:, np.newaxis] is multi-dimensional indexing, which modern pandas rejects,
# so convert to NumPy before adding the axis.
q = target_scaler.fit_transform(y.to_numpy()[:, np.newaxis])
# Write the scaled values back as a 1-D column.
# NOTE(review): assumes the last column of `a` is named 'sold_price'; if not,
# this adds a new trailing column instead of replacing the target — confirm.
a['sold_price'] = q.ravel()

In [None]:
# Summary statistics of the 'sold_price' column of `a`.
a.loc[:, 'sold_price'].describe()

In [None]:
# from scipy import stats
# import numpy as np

# q = house_data
# print('house_data.shape before removal: {}'.format(house_data.shape))

# for i in range(15):
#     z = np.abs(stats.zscore(q['sold_price']))
#     q = q[(z < 3)]
#     # print(q.shape)

# house_data = q
# print('house_data.shape after removal: {}'.format(house_data.shape))

In [None]:
# q_scaler = MinMaxScaler()
# q = q_scaler.fit_transform(house_data)
# house_data = pd.DataFrame(q)
# house_data.describe()

# Training Function

In [None]:
def train_model(dataset, base_model, params=None, verbose=0):
    """Fit ``base_model`` on a train/test split of ``dataset`` and print metrics.

    Note: this redefinition shadows the ``train_model`` defined earlier in the
    notebook.

    Every column except the last is used as a feature; the last column is the
    regression target. 33% of the rows are held out (``random_state=42``).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns followed by the target in the final column.
    base_model : estimator
        Unfitted scikit-learn regressor; it becomes the ``'regressor'`` step
        of the pipeline, so grid keys must be prefixed with ``regressor__``.
    params : dict, optional
        Parameter grid for ``GridSearchCV``. When falsy, the pipeline is fitted
        once with the model's current settings.
    verbose : int, default 0
        Verbosity forwarded to ``GridSearchCV``.

    Returns
    -------
    The fitted ``GridSearchCV`` when ``params`` is given, otherwise the fitted
    ``Pipeline``.
    """
    # Scaling scaffold for the tabular columns. It is built but currently NOT
    # part of the pipeline (the 'preprocessor' step below is commented out).
    numeric_features = ['bedrooms', 'bathrooms', 'area',]
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler()),
        # ('scaler', RobustScaler()),
        # ('scaler', MinMaxScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
        ])

    # Prediction pipeline: (optional preprocessing) + regressor.
    pipe = Pipeline(steps=[
        # ('preprocessor', preprocessor),
        ('regressor', base_model)],
        verbose=False)

    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    print('X_train.shape:\t{}'.format(X_train.shape))
    print('X_test.shape:\t{}'.format(X_test.shape))

    if params:
        # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24; leaving
        # it out keeps the same (unweighted) fold averaging on 0.22+.
        model = GridSearchCV(pipe, params, cv=3, n_jobs=-1, verbose=verbose)
        model.fit(X_train, y_train)
        print("model score: {:.3}".format(model.score(X_test, y_test)))
        print('Best Params:{}'.format(model.best_params_))
        print('Best Score:{:.3}'.format(model.best_score_))
        # Cross-validate the tuned pipeline, not the untuned `pipe`, so the CV
        # figure describes the same model as the MSE/MAE printed below.
        cv_estimator = model.best_estimator_
        cv_label = 'cv_scores'
    else:
        model = pipe
        model.fit(X_train, y_train)
        cv_estimator = pipe
        cv_label = 'cv_score'

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print('MSE = \t{:.4}'.format(mse))
    mae = mean_absolute_error(y_test, y_pred)
    print('MAE = \t{:.4}'.format(mae))

    # 5-fold CV on the held-out split only. An explicit MSE scorer is used in
    # both branches: the default scorer is R^2, which would contradict the
    # "CV MSE" label printed below.
    cv_score = cross_val_score(cv_estimator, X_test, y_test,
                               scoring=make_scorer(mean_squared_error), cv=5)
    print('{} = \t{}'.format(cv_label, cv_score))
    print("CV MSE =\t{:.4} (+/- {:.3})".format(cv_score.mean(), cv_score.std() * 2))

    return model


# Regressors

## KNN


In [None]:
%%time
regressor = neighbors.KNeighborsRegressor()
# params = None
params = {'regressor__n_neighbors': list(range(15,25)), 'regressor__weights': ['uniform', 'distance']}
# Best Params:{'regressor__n_neighbors': 24, 'regressor__weights': 'distance'}
train_model(house_data, regressor, params)

## Gradient Boosting (scikit-learn `GradientBoostingRegressor`)

In [None]:
%%time
regressor = GradientBoostingRegressor()
# regressor = GradientBoostingRegressor(
#         learning_rate = 0.1,
#         max_depth = 6,
#         max_features = 0.3,
#         min_samples_leaf = 9,
#         n_estimators = 100
#     )
# params = None
# Best Params:{'regressor__learning_rate': 0.1, 'regressor__max_depth': 6, 'regressor__max_features': 0.3, 'regressor__min_samples_leaf': 3, 'regressor__n_estimators': 100}
params = {
            'regressor__n_estimators':[100],
            'regressor__learning_rate': [0.1, 0.05, 0.02, 0.01],
            'regressor__max_depth':[4,6],
            'regressor__min_samples_leaf':[3,5,9,17],
            'regressor__max_features':[1.0,0.3,0.1]
        }

train_model(house_data, regressor, params)

## SVR

In [None]:
%%time
# SVR
regressor = SVR(gamma='auto')
params = {'regressor__kernel': ['linear', 'poly', 'rbf'], 'regressor__C': [0.01, 0.1, 1, 5, 10]}
# params = None
train_model(house_data, regressor, params)

## MLP

In [None]:
%%time
from sklearn.neural_network import MLPRegressor

regressor = MLPRegressor()
params = {'regressor__activation':['identity','logistic', 'tanh', 'relu']}
train_model(house_data, regressor, params)

## DTR

In [None]:
%%time
regressor = DecisionTreeRegressor()
params = {'regressor__criterion': ('mse', 'friedman_mse', 'mae'), 'regressor__splitter': ('best', 'random')}
# params = None
train_model(house_data, regressor, params)

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost on `house_data` with a default GradientBoostingRegressor as the
# boosted base learner; the grid covers ensemble size, learning rate and loss.
#
# Base-learner configuration tried previously (kept for reference):
#   GradientBoostingRegressor(learning_rate=0.1, max_depth=6, max_features=0.3,
#                             min_samples_leaf=9, n_estimators=100)
regressor = AdaBoostRegressor(GradientBoostingRegressor())
params = {
    'regressor__n_estimators': [50, 100],
    'regressor__learning_rate': [1, 0.5, 0.02],
    'regressor__loss': ['linear', 'square', 'exponential'],
}
train_model(house_data, regressor, params)