In [102]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [82]:
# Read the already-parsed dataset from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer = 'ames_final.csv')

In [83]:
# Separate the target column from the predictors, then hold out 30% of the
# rows as a test set (seeded so the split is repeatable). Cross-validation is
# applied separately inside the individual model cells.
y = df['saleprice']
X = df.drop(columns = ['saleprice'])
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y,
    test_size = 0.3,
    random_state = 21,
)

In [84]:
# Standardise the features: the scaler learns its statistics from the
# training split only, and that same fitted transform is applied to both splits.
# Note: xtrain/xtest are rebound to the transformed output here (NumPy arrays
# by default), so the original DataFrame splits are no longer available.
ss = StandardScaler().fit(xtrain)
xtrain, xtest = ss.transform(xtrain), ss.transform(xtest)

In [99]:
# Simple LR: ordinary least-squares baseline fitted on the scaled training split
lr = LinearRegression(n_jobs = -1)
lr.fit(xtrain, ytrain)
print(lr.score(xtrain, ytrain))   # R^2 on the training split
print(lr.score(xtest, ytest))     # R^2 on the held-out split
# Mean 6-fold cross-validated MSE (the scorer returns negated MSE, hence the minus).
# NOTE(review): this runs on the full, unscaled X / y (test rows included), not on
# the scaled training split the model above was fitted on -- confirm that is intended.
print(-cross_val_score(estimator = lr, X = X, y = y, cv = 6, scoring = 'neg_mean_squared_error').mean())
# Test RMSE. The square root is taken explicitly because the `squared = False`
# flag of mean_squared_error is deprecated and removed in newer scikit-learn releases.
print(np.sqrt(mean_squared_error(ytest, lr.predict(xtest))))

0.9431963248510259
-2.0036693317325955e+19
0.017720348662745092
1740687029.433621


In [266]:
# Ridge regression: 6-fold grid search over 200 log-spaced alphas in [1, 1e5].
# n_jobs = -1 runs the candidate fits in parallel; the selected model is unchanged.
ridge = GridSearchCV(Ridge(), param_grid = {'alpha': np.logspace(0, 5, 200), 'random_state' : [21]}, cv = 6, n_jobs = -1)
ridge.fit(xtrain, ytrain)
be = ridge.best_estimator_
# Predict on the test split once and reuse the result for both test metrics
ridge_test_pred = be.predict(xtest)
print(r2_score(ytrain, be.predict(xtrain)))   # train R^2
print(r2_score(ytest, ridge_test_pred))       # test R^2
# Test RMSE via an explicit square root: mean_squared_error's `squared = False`
# flag is deprecated and removed in newer scikit-learn releases.
print(np.sqrt(mean_squared_error(ytest, ridge_test_pred)))

0.9305391774259058
0.8978160255120027
0.12430799542836869


In [265]:
# Lasso regression: 6-fold grid search over alphas from 0.001 up to
# (but excluding) 1.0 in steps of 0.005.
lasso_params = {'alpha' : np.arange(0.001, 1.0, 0.005), 'random_state': [21]}
# n_jobs = -1 runs the candidate fits in parallel; the selected model is unchanged.
lasso = GridSearchCV(Lasso(), param_grid = lasso_params, cv = 6, n_jobs = -1)
lasso.fit(xtrain, ytrain)
be = lasso.best_estimator_
# Predict on the test split once and reuse the result for both test metrics
lasso_test_pred = be.predict(xtest)
print(r2_score(ytrain, be.predict(xtrain)))   # train R^2
print(r2_score(ytest, lasso_test_pred))       # test R^2
# Test RMSE via an explicit square root: mean_squared_error's `squared = False`
# flag is deprecated and removed in newer scikit-learn releases.
print(np.sqrt(mean_squared_error(ytest, lasso_test_pred)))

0.9245079816017954
0.8973942133992465
0.12456430087032995


In [264]:
# Elastic net: 6-fold grid search over alpha (0.01 to <10, step 0.05) crossed
# with l1_ratio (0.01 to <1, step 0.05) -- about 4,000 parameter combinations.
eln_params = {'alpha': np.arange(0.01, 10.0, 0.05), 'l1_ratio': np.arange(0.01, 1.0, 0.05)}
# With this many candidates a serial search is very slow; n_jobs = -1 spreads
# the fits over all cores without changing which model is selected.
eln = GridSearchCV(ElasticNet(), param_grid = eln_params, cv = 6, n_jobs = -1)
eln.fit(xtrain, ytrain)
be = eln.best_estimator_
# Predict on the test split once and reuse the result for both test metrics
eln_test_pred = be.predict(xtest)
print(r2_score(ytrain, be.predict(xtrain)))   # train R^2
print(r2_score(ytest, eln_test_pred))         # test R^2
# Test RMSE via an explicit square root: mean_squared_error's `squared = False`
# flag is deprecated and removed in newer scikit-learn releases.
print(np.sqrt(mean_squared_error(ytest, eln_test_pred)))

0.9254875259545158
0.897997494527861
0.12419756679292387


In [263]:
# Random forest with 5000 trees, seeded for repeatability.
# n_jobs = -1 builds and queries the trees in parallel; with a fixed
# random_state the fitted forest is the same as in the serial case.
rfr = RandomForestRegressor(n_estimators = 5000, random_state = 21, n_jobs = -1)
rfr.fit(xtrain, ytrain)
# Predict on the test split once and reuse the result for both test metrics
rfr_test_pred = rfr.predict(xtest)
print(r2_score(ytrain, rfr.predict(xtrain)))   # train R^2
print(r2_score(ytest, rfr_test_pred))          # test R^2
# Test RMSE via an explicit square root: mean_squared_error's `squared = False`
# flag is deprecated and removed in newer scikit-learn releases.
print(np.sqrt(mean_squared_error(ytest, rfr_test_pred)))

0.9819349372413865
0.8636311093824526
0.14360359805926334


In [121]:
# Single decision tree with default settings, seeded for repeatability
dtr = DecisionTreeRegressor(random_state = 21)
dtr.fit(xtrain, ytrain)
# Predict on the test split once and reuse the result for both test metrics
dtr_test_pred = dtr.predict(xtest)
print(r2_score(ytrain, dtr.predict(xtrain)))   # train R^2
print(r2_score(ytest, dtr_test_pred))          # test R^2
# Test RMSE via an explicit square root: mean_squared_error's `squared = False`
# flag is deprecated and removed in newer scikit-learn releases.
print(np.sqrt(mean_squared_error(ytest, dtr_test_pred)))

0.9999999971466298
0.7495706858420178
0.19460332207342146
