
##  Iowa House Price (*Base models*) 


###  Part 1.1: Load libraries and data info 

In [28]:
''' Import libraries '''

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import scatter_matrix
import seaborn as sns   

from sklearn.pipeline import Pipeline   
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer  
from sklearn.model_selection import train_test_split  
from sklearn.model_selection import KFold  
from sklearn.model_selection import cross_val_score  
from sklearn.model_selection import GridSearchCV    
from sklearn.model_selection import RandomizedSearchCV  

from sklearn.linear_model import LinearRegression  
from sklearn.linear_model import Lasso 
from sklearn.linear_model import Ridge   
from sklearn.linear_model import ElasticNet 

from sklearn.tree import DecisionTreeRegressor    
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.svm import SVR   
from sklearn.ensemble import RandomForestRegressor  
from sklearn.ensemble import GradientBoostingRegressor   
from sklearn.ensemble import ExtraTreesRegressor 
from sklearn.ensemble import AdaBoostRegressor    
from xgboost import XGBRegressor  
from sklearn import metrics   
from sklearn.metrics import mean_squared_error  

pd.set_option('display.max_columns', 900)   
pd.set_option('display.max_rows', 900)

%matplotlib inline
#%config InlineBackend.figure_format = 'svg'   

#### Import df_train and formatted kaggle test dataset

In [3]:
from read_path_module import read_data_relative_path

# Manipulated dataset where similar columns are combined
df_train = read_data_relative_path(relative_dataset_path = './data/kaggle/created/homes_grouped.csv',
                        data_type='csv')

from Kaggle_Test_Preprocessing import pre_processing

X_test_kaggle = pre_processing(path = './data/kaggle/created/homes_grouping_testdata.csv',
                                                  include_variables = 'num_and_chosen_upgradable')


In [4]:
from Linear_Model_Creation_Function import OLS_Model_Creation

lin_reg, X_train, y_train  = OLS_Model_Creation(path = './data/kaggle/created/homes_grouped.csv',
                                                include_variables='num_and_chosen_upgradable') 

#### +++ R Squared score for multiple linear regression model +++


In [12]:
SCORE_ols = lin_reg.score(X_train, y_train)
print('OLS R^2 =',SCORE_ols)

OLS R^2 = 0.8131984225773061


####  Standardize the features 

In [6]:
''' Standardarize the dependent variables X '''

''' Scaling the data set '''
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train_standardized = scaler.fit_transform(X_train)     # Note this is .fit_transform

#### +++ Ridge and lasso scores +++

In [37]:

lasso = Lasso()
ridge = Ridge()  

elasticNet = ElasticNet()   

params_eNet = [ {'alpha':[0.001, 0.01, 0.1, 0.5, 1, 10, 100]}, 
               {'l1_ratio':[0, 0.25, 0.5, 0.75, 1]}]

params_lasso = [ {'alpha':[1e-4, 1e-2, 0.1, 1, 2, 20, 30]} ]
params_ridge = [ {'alpha':[1e-4, 1e-2, 0.1, 1, 2, 10, 20]} ] 

kfold = KFold(n_splits = 5, shuffle = True)


grid_lasso = GridSearchCV(estimator = lasso, param_grid = params_lasso, cv = kfold)
grid_ridge = GridSearchCV(estimator = ridge, param_grid = params_ridge, cv = kfold)
grid_elasticNet = GridSearchCV(estimator = elasticNet, param_grid = params_eNet, cv = kfold)

grid_lasso.fit(X_train, y_train)   
grid_ridge.fit(X_train, y_train) 
grid_elasticNet.fit(X_train, y_train)  

SCORE_lasso = grid_lasso.best_score_

SCORE_ridge = grid_ridge.best_score_
SCORE_elastic = grid_elasticNet.best_score_

print('Lasso R^2 =',SCORE_lasso)
print('Ridge R^2 =',SCORE_ridge)
print('Elastic R^2 =',SCORE_elastic)


Lasso R^2 = 0.8053982356818056
Ridge R^2 = 0.7940707952911531
Elastic R^2 = 0.7997932488122184


  positive)
  positive)
  positive)
  positive)
  positive)


###  +++ Random forest regressor score +++      

In [25]:

# define random forest regressor  
randomForest = RandomForestRegressor()   

# set kfold and hyper-parameter range    
kfold = KFold(n_splits = 5, shuffle = True)
params_randomForest = [{'n_estimators': [100, 300, 500]},  
                       {'max_depth': [10, 20, 30, 40]}, 
                       {'min_samples_split': [2, 5, 10]}, 
                       {'min_samples_leaf': [1, 2, 5]}]


# grid search 
grid_randomForest = GridSearchCV(estimator = randomForest, param_grid = params_randomForest, cv = kfold);

# fit model 
grid_randomForest.fit(X_train, y_train);    

SCORE_random_forest = grid_randomForest.score(X_train, y_train)

print('Random Forest R^2 =',SCORE_random_forest)


Random Forest R^2 = 0.9762500517541796


### +++ Support vector regressor score +++  

In [26]:

# define random forest regressor  
svr = SVR()   

# set kfold and hyper-parameter range    
kfold = KFold(n_splits = 5, shuffle = True)
params_svr = [{'kernel': ['rbf', 'linear']}]
              

# grid search 
grid_svr = GridSearchCV(estimator = svr, param_grid = params_svr, cv = kfold)       

# fit model 
grid_svr.fit(X_train, y_train)  

SCORE_svr = grid_svr.score(X_train, y_train)

print('SVR R^2 =',SCORE_svr)


SVR R^2 = 0.6106066551441983


###  +++ XG Boost regressor score +++

In [29]:

# define XGB regressor  
xgb_reg = XGBRegressor(objective = 'reg:squarederror')    

# set kfold and hyper-parameter range    
kfold = KFold(n_splits = 5, shuffle = True)
params_xgb_reg = [{'objective': ['reg:squarederror']}, 
                  {'colsample_bytree': [0.30]}, 
                  {'learning_rate': [0.1, 0.01]}, 
                  {'max_depth': [5, 10, 15]},
                  {'alpha': [10]}, 
                  {'n_estimators': [10]}]    
              

# grid search 
grid_xgb_reg = GridSearchCV(estimator = xgb_reg, param_grid = params_xgb_reg, cv = kfold)       

# fit model 
grid_xgb_reg.fit(X_train, y_train)  

SCORE_xg_boost = grid_xgb_reg.score(X_train, y_train)

print('XG Boost R^2 =',SCORE_xg_boost)


XG Boost R^2 = 0.9119072686947445


In [31]:
from Kaggle_Test_Preprocessing import pre_processing

X_test_kaggle = pre_processing(path = './data/kaggle/created/homes_grouping_testdata.csv',
                                                  include_variables = 'num_and_chosen_upgradable')

In [34]:
y_predict_kaggle = grid_xgb_reg.predict(X_test_kaggle)

Sample_sbumission = read_data_relative_path(relative_dataset_path = './data/kaggle/sample_submission.csv',
                        data_type='csv'
                       )

ID_kaggle = Sample_sbumission['Id']
ID_kaggle

Listing_Agents_Submission = pd.DataFrame(ID_kaggle)
Listing_Agents_Submission['SalePrice'] = y_predict_kaggle
Listing_Agents_Submission.to_csv('XG_Boost_Submission.csv', index=False)

In [35]:
y_predict_kaggle = grid_randomForest.predict(X_test_kaggle)

Sample_sbumission = read_data_relative_path(relative_dataset_path = './data/kaggle/sample_submission.csv',
                        data_type='csv'
                       )

ID_kaggle = Sample_sbumission['Id']
ID_kaggle

Listing_Agents_Submission = pd.DataFrame(ID_kaggle)
Listing_Agents_Submission['SalePrice'] = y_predict_kaggle
Listing_Agents_Submission.to_csv('randomForest_Submission.csv', index=False)

#### Summary of all the models in this notebook:
- Multiple linear regression [ lin_reg ]
- Ridge [ grid_ridge ]
- Lasso [ grid_lasso ]
- Elastic net [ grid_elasticNet ]
- Random forest [ grid_randomForest ]
- Support vector regressor [ grid_svr ]
- XG Boost Regressor [ grid_xgb_reg ]

#### Each of the above models were trained on X_train & y_train

#### We want to assemble a table of R^2 per model and to extract p-value significance for each variable in each model
- To do so we will create list of model names,
- Evaluate each score on X_train and y_train
- Separately come up with function to perform stats module actions
- Create for loop to horizontally merge p value columns from each model

#### Nested for loop to fit models based on y_train or log of y_train and to save the scores

In [250]:

model_names = ['lin_reg', 'grid_ridge', 'grid_lasso', 'grid_elasticNet', 'grid_randomForest', 'grid_svr'] 
#model_names = ['lin_reg', 'grid_ridge', 'grid_svr'] 
model_trans = ['y_train']

for y_vals in model_trans:
    
    model_scores = []

    for model in model_names:
        if model == 'lin_reg':
            eval(model+'.fit(X_train,'+y_vals+')')
            score = eval(model+'.score(X_train, y_train)')
            model_scores.append(round(score,4))
            print('Completed fitting and scoring the ', model, ' model with ', y_vals, ' data.')
        else:
            eval(model+'.fit(X_train,'+y_vals+')')
            score = eval(model+'.best_score_')
            model_scores.append(round(score,4))
            print('Completed fitting and scoring the ', model, ' model with ', y_vals, ' data.')
        
    if y_vals == 'y_train':
        No_Transformation_DF = pd.DataFrame({'Model': model_names, 'R^2': model_scores})
        No_Transformation_DF = No_Transformation_DF.set_index('Model')
        No_Transformation_DF = No_Transformation_DF.rename(columns={"R^2": "R^2 y_train"})
    else:
        Log_Transformation_DF = pd.DataFrame({'Model': model_names, 'R^2': model_scores})
        Log_Transformation_DF = Log_Transformation_DF.set_index('Model')
        Log_Transformation_DF = Log_Transformation_DF.rename(columns={"R^2": "R^2 log(y_train)"})

        
Score_df = pd.concat([No_Transformation_DF, Log_Transformation_DF], axis=1, sort=False)
Score_df


Completed fitting and scoring the  lin_reg  model with  y_train  data.
Completed fitting and scoring the  grid_ridge  model with  y_train  data.


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Completed fitting and scoring the  grid_lasso  model with  y_train  data.


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Completed fitting and scoring the  grid_elasticNet  model with  y_train  data.
Completed fitting and scoring the  grid_randomForest  model with  y_train  data.
Completed fitting and scoring the  grid_svr  model with  y_train  data.
Completed fitting and scoring the  lin_reg  model with  np.log(y_train)  data.
Completed fitting and scoring the  grid_ridge  model with  np.log(y_train)  data.


  positive)
  positive)
  positive)
  positive)


Completed fitting and scoring the  grid_lasso  model with  np.log(y_train)  data.


  positive)
  positive)
  positive)
  positive)
  positive)


Completed fitting and scoring the  grid_elasticNet  model with  np.log(y_train)  data.
Completed fitting and scoring the  grid_randomForest  model with  np.log(y_train)  data.
Completed fitting and scoring the  grid_svr  model with  np.log(y_train)  data.


Unnamed: 0_level_0,R^2 y_train,R^2 log(y_train)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
lin_reg,0.9123,-6.0512
grid_ridge,0.8069,0.8659
grid_lasso,0.8239,0.8593
grid_elasticNet,0.8503,0.859
grid_randomForest,0.8527,0.8688
grid_svr,0.175,0.8462


In [106]:
# Function to get printout of OLS statistical significance

def R_stats_printout(X_data, y_data, transformation_method = 'linear'):
    '''
    Transformation method can be linear or log transformation
    '''
    if transformation_method == 'linear':
        import statsmodels.api as sm
        x = sm.add_constant(X_data)
        model = sm.OLS(y_data, x)
        results = model.fit()
        print(results.summary())
    else:
        import statsmodels.api as sm
        x = sm.add_constant(X_data)
        model = sm.OLS(np.log(y_data), x)
        results = model.fit()
        print(results.summary())
    

R_stats_printout(X_train, y_train)

def R_stats_P_values(X_data, y_data, transformation_method = 'linear'):
    '''
    Transformation method can be linear or log transformation
    '''
    if transformation_method == 'linear':
        import statsmodels.api as sm
        x = sm.add_constant(X_data)
        model = sm.OLS(y_data, x)
        results = model.fit()
        pValues = results.pvalues
        return pValues
    else:
        import statsmodels.api as sm
        x = sm.add_constant(X_data)
        model = sm.OLS(np.log(y_data), x)
        results = model.fit()
        pValues = results.pvalues
        return pValues

    
P_values = R_stats_P_values(X_data, y_data)
P_values



                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.934
Model:                            OLS   Adj. R-squared:                  0.917
Method:                 Least Squares   F-statistic:                     53.41
Date:                Wed, 27 May 2020   Prob (F-statistic):               0.00
Time:                        01:15:59   Log-Likelihood:                -13246.
No. Observations:                1168   AIC:                         2.698e+04
Df Residuals:                     922   BIC:                         2.823e+04
Df Model:                         245                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.821e+05    670.968    271.339      0.0

In [103]:
# Attempt at ridge and lasso regression... alas statsmodels is not a good package for this:
    # https://groups.google.com/forum/#!topic/pystatsmodels/nC_boeczVWo
    # https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.OLS.fit_regularized.html

x = sm.add_constant(X_train)
model = sm.OLS(y_train, x) #.fit_regularized(method='elastic_net', alpha=0.0, L1_wt=1.0)
results = model.fit_regularized()

print(results.summary())

NotImplementedError: 