In [1]:
import pandas as pd
import numpy as np

In [2]:
import sys
sys.path.append('../ames') # path the the directory
import config
from data_prep import clean, add_features, dummify

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [4]:
housing = clean(config.HOUSING_CSV)
housing = add_features(housing)

In [5]:
chosen_variables = ['LotFrontage', 'UnusedLotSize', 'HouseAge', 'HouseAgeSq', 
                    'OverallQual', 'OverallCond', 'GrLivArea', 'TotalLivingArea',
                   'Toilets','Showers','UpDownRatio',]
dummy_vars = {'Neighborhood':'Nbhd','LotConfig':'LC','SaleCondition':'SC'}

In [6]:
def lr(housing, regress_vars,dummy_dict):
    X = housing[regress_vars].copy()
    dummy_df = dummify(housing, dummy_dict)
    X = pd.concat([X,dummy_df], axis=1)
    y = np.log(housing['SalePrice'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
    reg = LinearRegression()
    folds = KFold(n_splits=10,shuffle=True, random_state=42)
    train_scores = cross_val_score(reg, X_train, y_train, scoring="neg_root_mean_squared_error", cv=folds)
    test_scores = cross_val_score(reg, X_test, y_test, scoring="neg_root_mean_squared_error", cv=folds)
    return -np.mean(train_scores), -np.mean(test_scores)

In [7]:
lr(housing, chosen_variables, dummy_vars)

(0.12654278117696172, 0.12224817497592233)

In [8]:
var2 = chosen_variables + ['GarageArea']
dummy_vars['BldgType'] = 'BT'
lr(housing,var2, dummy_vars)

(0.12067600848209145, 0.11947385826730055)

In [9]:
var3 = ['LotFrontage', 'UnusedLotSize', 'HouseAge',
                    'OverallQual', 'OverallCond', 'GrLivArea', 'TotalLivingArea',
                   'Toilets','Showers','UpDownRatio','GarageArea']
lr(housing, var3, dummy_vars)

(0.12116868461071617, 0.12138139827650041)

In [10]:
var4 = ['LotFrontage', 'UnusedLotSize', 'HouseAgeSq',
                    'OverallQual', 'OverallCond', 'GrLivArea', 'TotalLivingArea',
                   'Toilets','Showers','UpDownRatio','GarageArea']
lr(housing, var3, dummy_vars)

(0.12116868461071617, 0.12138139827650041)

In [11]:
dummy_vars2 = {'Neighborhood':'Nbhd','LotConfig':'LC','SaleCondition':'SC',
             'BldgType':'BT','Electrical':'Elec'}
lr(housing, var2, dummy_vars2)

(0.12083476869330505, 0.11943438236192369)

In [12]:
var5 = ['YearRemodAdd'] + var2
lr(housing, var5, dummy_vars)

(0.12079232664893247, 0.11951600651301435)

In [13]:
var6 = var2 + ['HasBsmt','HasPool']
lr(housing, var6, dummy_vars)

(0.11986933787062311, 0.11779731320674583)

# Best Results!

In [14]:
var7 = var2 + ['HasPool']
dummy_vars7 = {'Neighborhood':'Nbhd','LotConfig':'LC','SaleCondition':'SC',
               'BldgType':'BT','BsmtQual':'BQ'}
lr(housing, var7, dummy_vars7)

(0.119608937024965, 0.11683340919538028)

In [15]:
var8 = ['Fireplaces'] + var7
var8.remove('GrLivArea')
lr(housing, var8, dummy_vars7)

(0.1264639061937633, 0.12120562170211217)

## Additional Testing

In [16]:
var8.remove('LotFrontage')
var8.remove('Toilets')

In [17]:
dummy_vars8 = dummy_vars7.copy()
dummy_vars8['ExterQual'] = 'EQ'
lr(housing, var8, dummy_vars8)

(0.12656631468613694, 0.12115146656506329)

In [18]:
import statsmodels.api as sm
def ols(housing, regress_vars,dummy_dict):
    X = housing[regress_vars].copy()
    dummy_df = dummify(housing, dummy_dict)
    X = pd.concat([X,dummy_df], axis=1)
    X = sm.add_constant(X)
    y = np.log(housing['SalePrice'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
    #reg = sm.OLS(y_train, X_train)
    reg = sm.OLS(y_train, X_train.astype(float)).fit()
    #ans = reg.fit()
    print(reg.summary())

In [19]:
ols(housing,var8, dummy_vars8)

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.900
Model:                            OLS   Adj. R-squared:                  0.897
Method:                 Least Squares   F-statistic:                     309.9
Date:                Mon, 01 Mar 2021   Prob (F-statistic):               0.00
Time:                        17:55:46   Log-Likelihood:                 1406.9
No. Observations:                2063   AIC:                            -2696.
Df Residuals:                    2004   BIC:                            -2363.
Df Model:                          58                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               5.3596      0.022    2