In [1]:
#import the necessities
import numpy as np
import pandas as pd
import statsmodels.api as sm
import warnings
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
warnings.filterwarnings('ignore')

In [2]:
#in the previous section, I had already made a OLS model. no reason to reinvent the wheel
info_df = pd.read_csv('D:\DSF\houseprices_model.csv', sep='\t', header=0)

In [3]:
##functions were already made for tuning and testing regressions
#we'll define a function here that will allow us to test quickly
#below to tune to the best parameters and test after
def lrm_test():
    y = info_df['saleprice']
    X = info_df.drop(['saleprice'], axis=1)
    X = sm.add_constant(X)
    lr = LinearRegression()
    lr.fit(X, y)
    y_test_predictions = lr.predict(X)
    print('OLS Regression')
    print("R-squared: {:0.2f}".format(lr.score(X, y)))
    print("\nMean absolute error of the prediction is: {}".format(mean_absolute_error(y, y_test_predictions)))
    print("Mean squared error of the prediction is: {:3e}".format(mse(y, y_test_predictions)))
    print("Root mean squared error of the prediction is: {}".format(rmse(y, y_test_predictions)))
    print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y - y_test_predictions) / y)) * 100))
    
def knn_test(neighbors, weight):
    y = info_df['saleprice']
    X = info_df.drop(['saleprice'], axis=1)
    X = sm.add_constant(X)
    knn = KNeighborsRegressor(n_neighbors=neighbors, weights=weight)
    knn.fit(X, y)
    y_test_predictions = knn.predict(X)
    score = cross_val_score(knn, X, y, cv=10)
    print('K-Nearest Neighbors Regression')
    print('{} neighbors and {} weighted'.format(neighbors, weight))
    print("R-squared: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
    print("\nMean absolute error of the prediction is: {}".format(mean_absolute_error(y, y_test_predictions)))
    print("Mean squared error of the prediction is: {:3e}".format(mse(y, y_test_predictions)))
    print("Root mean squared error of the prediction is: {}".format(rmse(y, y_test_predictions)))
    print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y - y_test_predictions) / y)) * 100))

In [4]:
#calling the function for the OLS Regression
lrm_test()

OLS Regression
R-squared: 0.90

Mean absolute error of the prediction is: 15683.151557663708
Mean squared error of the prediction is: 6.294095e+08
Root mean squared error of the prediction is: 25088.035958488646
Mean absolute percentage error of the prediction is: 9.233630379305428


In [5]:
#running some knn tests with differing numbers of neighbors and weighted differently
knn_test(10, None)

K-Nearest Neighbors Regression
10 neighbors and None weighted
R-squared: 0.65 (+/- 0.10)

Mean absolute error of the prediction is: 26739.378972602743
Mean squared error of the prediction is: 1.742842e+09
Root mean squared error of the prediction is: 41747.3625447189
Mean absolute percentage error of the prediction is: 15.57862197347441


In [6]:
knn_test(5, None)

K-Nearest Neighbors Regression
5 neighbors and None weighted
R-squared: 0.66 (+/- 0.11)

Mean absolute error of the prediction is: 24217.634794520545
Mean squared error of the prediction is: 1.375583e+09
Root mean squared error of the prediction is: 37088.85362082171
Mean absolute percentage error of the prediction is: 14.096565530268576


In [7]:
knn_test(10, "distance")

K-Nearest Neighbors Regression
10 neighbors and distance weighted
R-squared: 0.66 (+/- 0.11)

Mean absolute error of the prediction is: 13.561643835616438
Mean squared error of the prediction is: 4.833562e+04
Root mean squared error of the prediction is: 219.8536250289182
Mean absolute percentage error of the prediction is: 0.009693321337547389


In [8]:
knn_test(5, 'distance')

K-Nearest Neighbors Regression
5 neighbors and distance weighted
R-squared: 0.66 (+/- 0.11)

Mean absolute error of the prediction is: 13.561643835616438
Mean squared error of the prediction is: 4.833562e+04
Root mean squared error of the prediction is: 219.8536250289182
Mean absolute percentage error of the prediction is: 0.009693321337547389


Based on the R-squared and secondary test values, I would use the 5 neighbor with no weights model over the rest of the KNNs but would use the OLS model over any of the tested KNNs. While the distance weighted models have significantly smaller values for all of the secondary tests, the fact that they are so low lead me to believe that the model is drastically overfitting. I tried running a split using test and train but it didn't work.

The data has more columns with dummy values than with distinct values. As an example, 'on paved road' has only two values (0 or 1) that relate to values ranging from 34,900 to 755,000 in 'saleprice', so it can be difficult to find proper neighbors, especialy when compared to OLS being able to apply each variable as a coefficient.