In [1]:
#import the necessities
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import bartlett, levene
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from statsmodels.tools.eval_measures import mse, rmse
%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
#create the authorization variables
# Each setting can be overridden through an environment variable so that
# credentials do not have to live in the notebook; the fallbacks reproduce
# the values that were previously hard-coded here.
postgres_user = os.environ.get('HOUSEPRICES_PG_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_PG_PASSWORD', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_PG_HOST', '142.93.121.174')
postgres_port = os.environ.get('HOUSEPRICES_PG_PORT', '5432')
houses_db = os.environ.get('HOUSEPRICES_PG_DB', 'houseprices')

In [3]:
#create the engine, load the houseprices table, and dispose of the engine
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, houses_db))

try:
    houseprices_df = pd.read_sql_query('SELECT * FROM houseprices', con=engine)
finally:
    # Dispose of the engine even when the query raises, so a failed cell
    # does not leave the engine undisposed in the kernel.
    engine.dispose()

In [4]:
# Helper functions used by the cells further down: an OLS summary for
# tuning the feature set, and one test routine per regression type.
def tuning(df=None):
    """Fit a statsmodels OLS of ``saleprice`` on all other columns and print its summary.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame holding a ``saleprice`` column plus the feature columns.
        When omitted, the notebook-level ``info_df`` is used; it is looked
        up at call time, so it only has to exist before this function is
        called, not before it is defined.

    Returns
    -------
    None
        The regression summary is printed rather than returned.
    """
    data = info_df if df is None else df
    y = data['saleprice']
    # statsmodels does not add an intercept on its own, hence the explicit constant.
    X = sm.add_constant(data.drop(['saleprice'], axis=1))
    results = sm.OLS(y, X).fit()
    print(results.summary())

def _evaluate_regression(estimator, title, df=None, test_size=0.2, random_state=357):
    """Fit ``estimator`` on a train split and print train/test fit statistics.

    Shared implementation behind ``lrm_test``, ``lasso_test``, ``ridge_test``
    and ``en_test`` so the four reports cannot drift apart.

    Parameters
    ----------
    estimator : scikit-learn regressor
        Unfitted estimator exposing ``fit``, ``predict`` and ``score``.
    title : str
        Heading printed above the metrics.
    df : pandas.DataFrame, optional
        Frame holding a ``saleprice`` column plus the feature columns.
        Defaults to the notebook-level ``info_df`` (looked up at call time).
    test_size : float, optional
        Fraction of rows held out for testing.
    random_state : int, optional
        Seed for the split, so every model is scored on the same rows.

    Returns
    -------
    None
        All statistics are printed.
    """
    data = info_df if df is None else df
    y = data['saleprice']
    X = data.drop(['saleprice'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # No sm.add_constant here: scikit-learn's linear estimators fit their own
    # intercept by default (fit_intercept=True), so an extra constant column is
    # redundant. add_constant's default has_constant='skip' could also leave
    # the train and test frames with different columns if one split happened
    # to contain an already-constant feature.
    # NOTE(review): features are not standardized before the penalized fits;
    # that may be hurting Lasso / Elastic Net -- TODO confirm.
    estimator.fit(X_train, y_train)
    y_test_predictions = estimator.predict(X_test)

    print(title)
    print("R-squared of the model in the training set is: {}".format(estimator.score(X_train, y_train)))
    print("R-squared of the model in the test set is: {}".format(estimator.score(X_test, y_test)))
    print("\nMean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_test_predictions)))
    # '{:3e}' is a minimum width of 3, not a precision; kept as-is so the
    # printed value keeps its existing format.
    print("Mean squared error of the prediction is: {:3e}".format(mse(y_test, y_test_predictions)))
    print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_test_predictions)))
    # MAPE divides by the observed values, so it assumes no zero targets.
    print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_test_predictions) / y_test)) * 100))


def lrm_test(df=None):
    """Print hold-out statistics for an ordinary least squares fit (``LinearRegression``).

    ``df`` optionally replaces the notebook-level ``info_df``.
    """
    _evaluate_regression(LinearRegression(), 'OLS Regression', df)


def lasso_test(df=None):
    """Print hold-out statistics for a Lasso fit tuned with 10-fold CV (``LassoCV``).

    ``df`` optionally replaces the notebook-level ``info_df``.
    """
    _evaluate_regression(LassoCV(cv=10), 'Lasso Regression', df)


def ridge_test(df=None):
    """Print hold-out statistics for a Ridge fit tuned with 10-fold CV (``RidgeCV``).

    ``df`` optionally replaces the notebook-level ``info_df``.
    """
    _evaluate_regression(RidgeCV(cv=10), 'Ridge Regression', df)


def en_test(df=None):
    """Print hold-out statistics for an Elastic Net fit tuned with 10-fold CV (``ElasticNetCV``).

    ``df`` optionally replaces the notebook-level ``info_df``.
    """
    _evaluate_regression(ElasticNetCV(cv=10), 'Elastic Net Regression', df)

The tuning can be found at <https://github.com/CasualChemist/dsf_data/blob/main/2021-01-17%20Aaron%20Reed%20DSF%20Challenge%20(Regressions)%20Tests.ipynb>. For the sake of space, and to avoid rewriting that code, I will load the final model's data directly.
Once it has been loaded, I'll run the tuning test and show the results. Afterwards, I will run the various types of regression, using 10-fold cross-validation where applicable, and print their statistics for comparison.

In [5]:
# Load the tab-separated model data. The path is a raw string because its
# backslashes (\D, \h) are not valid escape sequences and make newer Python
# versions emit a warning when written in a normal string literal.
info_df = pd.read_csv(r'D:\DSF\houseprices_model.csv', sep='\t', header=0)

In [6]:
# Print the statsmodels OLS summary of saleprice regressed on every other column of info_df
tuning()

                            OLS Regression Results                            
Dep. Variable:              saleprice   R-squared:                       0.900
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     214.0
Date:                Wed, 20 Jan 2021   Prob (F-statistic):               0.00
Time:                        17:51:05   Log-Likelihood:                -16862.
No. Observations:                1460   AIC:                         3.384e+04
Df Residuals:                    1400   BIC:                         3.416e+04
Df Model:                          59                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                  -1.24

In [7]:
# OLS baseline: fit LinearRegression on the 80/20 train/test split and print its statistics
lrm_test()

OLS Regression
R-squared of the model in the training set is: 0.9175940399135784
R-squared of the model in the test set is: 0.7845623690347873

Mean absolute error of the prediction is: 15827.69367927762
Mean squared error of the prediction is: 1.071772e+09
Root mean squared error of the prediction is: 32737.93509742331
Mean absolute percentage error of the prediction is: 9.687289969495124


In [8]:
# Ridge regression (RidgeCV, cv=10) on the same seeded 80/20 split; prints its statistics
ridge_test()

Ridge Regression
R-squared of the model in the training set is: 0.8857570918687354
R-squared of the model in the test set is: 0.8240016647831889

Mean absolute error of the prediction is: 15656.79613136504
Mean squared error of the prediction is: 8.755674e+08
Root mean squared error of the prediction is: 29589.987445924755
Mean absolute percentage error of the prediction is: 9.467783700230987


In [9]:
# Lasso regression (LassoCV, cv=10) on the same seeded 80/20 split; prints its statistics
lasso_test()

Lasso Regression
R-squared of the model in the training set is: 0.7176574297095706
R-squared of the model in the test set is: 0.6457417755828971

Mean absolute error of the prediction is: 24925.29258437263
Mean squared error of the prediction is: 1.762386e+09
Root mean squared error of the prediction is: 41980.77693584046
Mean absolute percentage error of the prediction is: 15.019875404098535


In [10]:
# Elastic Net regression (ElasticNetCV, cv=10) on the same seeded 80/20 split; prints its statistics
en_test()

Elastic Net Regression
R-squared of the model in the training set is: 0.6168978264738623
R-squared of the model in the test set is: 0.561674909142513

Mean absolute error of the prediction is: 30944.21519302323
Mean squared error of the prediction is: 2.180607e+09
Root mean squared error of the prediction is: 46696.96624660817
Mean absolute percentage error of the prediction is: 19.48570360643981


Based on the results of the tests, I would use Ridge Regression. While OLS has a higher R-squared value on the training set (0.918 vs. 0.886), Ridge has the higher R-squared on the test set (0.824 vs. 0.785), so the gap between its training and test performance is smaller. Ridge also has the lowest MAE, MSE, RMSE, and MAPE of the four models.

In the final part of the testing notebook, I added mortgage rates to see whether they would have an impact on the model; they did not. It's possible that they would have turned out to be significant if I had added them before doing the tuning. Another thing to note is that this data comes from a different source, so its validity could be called into question.