In [1]:
#import the necessities
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import warnings
from scipy.stats import bartlett, levene
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sqlalchemy import create_engine
from statsmodels.tools.eval_measures import mse, rmse
%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
#create the authorization variables
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
houses_db = 'houseprices'

In [3]:
#create and dispose of engine
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, houses_db))

houseprices_df = pd.read_sql_query('SELECT * FROM houseprices', con=engine)

engine.dispose()

In [4]:
#create the datasets from the previous checkpoint
y = houseprices_df['saleprice']
X = houseprices_df[['lotarea', 'overallqual', 'yearbuilt', 'poolarea']]

In [5]:
#create the training set and the testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=61)
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

In [6]:
#run a linear regression and fit it
lrm = LinearRegression()
lrm.fit(X_train, y_train)
#then make predictions
y_pred_train = lrm.predict(X_train)
y_pred_test = lrm.predict(X_test)

In [7]:
#get the values of interest
print("R-squared of the model in the training set is: {}".format(lrm.score(X_train, y_train)))
print("\nR-squared of the model in the test set is: {}".format(lrm.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_pred_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_pred_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_pred_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((
    y_test - y_pred_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.6680579386771139

R-squared of the model in the test set is: 0.6667808483018217
Mean absolute error of the prediction is: 32774.36571199548
Mean squared error of the prediction is: 2319691179.152048
Root mean squared error of the prediction is: 48163.17243654168
Mean absolute percentage error of the prediction is: 18.66368225311517


In [8]:
#run a lasso regression and cross validate
lassoreg = LassoCV(cv=10)
lassoreg.fit(X_train, y_train)
#make predictions
lasso_y_pred_train = lassoreg.predict(X_train)
lasso_y_pred_test = lassoreg.predict(X_test)

In [9]:
#print the stats
print("R-squared of the model in the training set is: {}".format(lassoreg.score(X_train, y_train)))
print("\nR-squared of the model in the test set is: {}".format(lassoreg.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, lasso_y_pred_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, lasso_y_pred_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, lasso_y_pred_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((
    y_test - lasso_y_pred_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.34057778952559703

R-squared of the model in the test set is: 0.2984880457605359
Mean absolute error of the prediction is: 46644.43493012114
Mean squared error of the prediction is: 4883546110.797858
Root mean squared error of the prediction is: 69882.373391277
Mean absolute percentage error of the prediction is: 26.456603146280916


In [10]:
#ridge regression time
ridgereg = RidgeCV(cv=10)
ridgereg.fit(X_train, y_train)
#make predictions
ridge_y_pred_train = ridgereg.predict(X_train)
ridge_y_pred_test = ridgereg.predict(X_test)

In [11]:
#look at the stats
print("R-squared of the model in the training set is: {}".format(ridgereg.score(X_train, y_train)))
print("\nR-squared of the model in the test set is: {}".format(ridgereg.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, ridge_y_pred_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, ridge_y_pred_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, ridge_y_pred_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((
    y_test - ridge_y_pred_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.668041430348904

R-squared of the model in the test set is: 0.6662555459531638
Mean absolute error of the prediction is: 32751.800154295575
Mean squared error of the prediction is: 2323348049.4680533
Root mean squared error of the prediction is: 48201.1208320725
Mean absolute percentage error of the prediction is: 18.61977152426953


In [12]:
#now for elastic net
enreg = ElasticNetCV(cv=10)
enreg.fit(X_train, y_train)
#make the needed predictions
en_y_pred_train = enreg.predict(X_train)
en_y_pred_test = enreg.predict(X_test)

In [13]:
#find the stats yet again
print("R-squared of the model in the training set is: {}".format(enreg.score(X_train, y_train)))
print("\nR-squared of the model in the test set is: {}".format(enreg.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, en_y_pred_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, en_y_pred_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, en_y_pred_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((
    y_test - en_y_pred_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.0720212891795542

R-squared of the model in the test set is: 0.06408553354015056
Mean absolute error of the prediction is: 56144.56438765459
Mean squared error of the prediction is: 6515329389.753012
Root mean squared error of the prediction is: 80717.59033663612
Mean absolute percentage error of the prediction is: 34.00436028678142
