## Set up data set

In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../ames') # add the project directory to the module search path (must run before the `config` / `data_prep` imports below)
import config
from data_prep import clean, add_features, dummify

# Build the working frame in one pass: clean() is handed config.HOUSING_CSV
# and its result is passed straight through add_features().
housing = add_features(clean(config.HOUSING_CSV))

## Define linear regression function

In [2]:
import statsmodels.api as sm
def ols(X, y, print_output=False, test_size=.2, random_state=42):
    """Fit an ordinary-least-squares model on a train split of the data.

    A constant (intercept) column is added to ``X``, the rows are split into
    train and test partitions, and the model is fitted on the training
    partition only.

    Parameters
    ----------
    X : array-like
        Feature matrix (the notebook passes a pandas DataFrame). Every column
        must be convertible to ``float``.
    y : array-like
        Target values, one per row of ``X``.
    print_output : bool, default False
        When True, print the statsmodels summary of the fit followed by the
        R^2 computed on the held-out test partition.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    The fitted statsmodels results object (fitted on the training rows only).
    """
    # Imported here so the function works as soon as its cell has run, rather
    # than relying on a later cell to bring train_test_split into scope.
    from sklearn.model_selection import train_test_split

    # Cast once, before splitting, so the matrix used for fitting and the one
    # used for prediction share the same float dtype.
    design = sm.add_constant(X).astype(float)
    X_train, X_test, y_train, y_test = train_test_split(
        design, y, test_size=test_size, random_state=random_state
    )
    reg = sm.OLS(y_train, X_train).fit()
    if print_output:
        print(reg.summary())
        # Out-of-sample R^2: 1 - RSS/TSS on the held-out rows.
        rss = np.sum((y_test - reg.predict(X_test)) ** 2)
        tss = np.sum((y_test - y_test.mean()) ** 2)
        print(f'Test R2: {1 - (rss / tss)}')
    return reg

## Create base data set

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
# Continuous predictors are taken first (as an independent copy), then the
# dummy-encoded columns are produced; the two frames are joined column-wise.
continuous_df = housing[config.CONTINUOUS_VARIABLES].copy()
dummy_df = dummify(housing, config.VARS_TO_DUMMIFY, drop_first=False)
X = pd.concat([continuous_df, dummy_df], axis='columns')

# The regression target is the natural log of the sale price.
y = np.log(housing['SalePrice'])

In [4]:
# Fit on the columns named in config.LASSO_VARS_FROM_ALL and print the report.
X_subset = X.loc[:, config.LASSO_VARS_FROM_ALL]
reg_all = ols(X=X_subset, y=y, print_output=True)

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.904
Model:                            OLS   Adj. R-squared:                  0.903
Method:                 Least Squares   F-statistic:                     797.4
Date:                Sun, 07 Mar 2021   Prob (F-statistic):               0.00
Time:                        19:58:10   Log-Likelihood:                 1449.4
No. Observations:                2063   AIC:                            -2849.
Df Residuals:                    2038   BIC:                            -2708.
Df Model:                          24                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               2.5675      0.366     

## Create data set from chosen variables

In [5]:
# Select the columns named in config.LASSO_VARS_FROM_CHOSEN, leaving out
# 'Showers' (dropped because of multi-collinearity). drop() returns a new
# frame, so the slice taken from X is never mutated in place and no
# SettingWithCopyWarning can be triggered.
X_subset = X.loc[:, config.LASSO_VARS_FROM_CHOSEN].drop(columns='Showers')
reg_chosen = ols(X_subset, y, True)

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.870
Model:                            OLS   Adj. R-squared:                  0.869
Method:                 Least Squares   F-statistic:                     686.0
Date:                Sun, 07 Mar 2021   Prob (F-statistic):               0.00
Time:                        19:58:10   Log-Likelihood:                 1142.8
No. Observations:                2063   AIC:                            -2244.
Df Residuals:                    2042   BIC:                            -2125.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              10.4162      0.024    4

In [6]:
from sklearn.metrics import mean_squared_error
# RMSE of the chosen-variable model on the log-price scale, evaluated over the
# whole data set (this includes the rows the model was fitted on).
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error is deprecated and removed in newer scikit-learn releases.
# The design matrix is cast to float to match what ols() fits on.
design_chosen = sm.add_constant(X_subset).astype(float)
rms = np.sqrt(mean_squared_error(y, reg_chosen.predict(design_chosen)))
print(rms)

0.1380362008350062
