### Lasso Variable Importance Investigation

In [1]:
''' Import libraries '''

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import scatter_matrix
import seaborn as sns   

from sklearn.pipeline import Pipeline   
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer  
from sklearn.model_selection import train_test_split  
from sklearn.model_selection import KFold  
from sklearn.model_selection import cross_val_score  
from sklearn.model_selection import GridSearchCV    
from sklearn.model_selection import RandomizedSearchCV  

from sklearn.linear_model import LinearRegression  
from sklearn.linear_model import Lasso 
from sklearn.linear_model import Ridge   
from sklearn.linear_model import ElasticNet 

pd.set_option('display.max_columns', 900)   
pd.set_option('display.max_rows', 900)

%matplotlib inline
#%config InlineBackend.figure_format = 'svg'   

#### Run train test split and get back formatted X_train and y_train

In [2]:
from Linear_Model_Creation_Function import OLS_Model_Creation
lin_reg, X_train, X_test, y_train, y_test = OLS_Model_Creation(path = './data/kaggle/created/homes_grouped.csv', include_variables = 'num_and_upgradable') 

#### Function that outputs ('Value of Alpha', ['List of most important variables'], 'R^2 value')

In [10]:
def variable_importance(X_train, y_train, alphas, num_variables = 5, values = False, ignore_list = False, ignore_list_names = None):
    
    '''
    This function takes X_train and y_train data and runs lasso regression for a variety of variables.
    The function outputs a list of nested lists. 
    The nested list has ('Value of Alpha', ['List of most important variables'], 'R^2 value')
    '''
    
    from sklearn.linear_model import Lasso 
    
    lasso  = Lasso()
    lasso.set_params(normalize=True)
    coefs_lasso = []
    score_lasso = []
    
    # fit a bunch of lasso models based on differing alphas. Store coefficients and R^2 scores along the way
    for alpha in alphas:
            lasso.set_params(alpha=alpha)
            lasso.fit(X_train, y_train)  
            coefs_lasso.append(lasso.coef_)
            score_lasso.append(lasso.score(X_train, y_train))

    coefs_lasso = pd.DataFrame(coefs_lasso, index = alphas, columns = X_train.columns) 
    
    # if we have decided to keep some variables and want to look at the relative importance of the rest...
    if ignore_list == True:
        # get list of variables we want to look at
        Full_list = coefs_lasso.columns.to_list()
        To_ignore_list = ignore_list_names
        Non_ignored_list = [x for x in Full_list if not x in To_ignore_list or To_ignore_list.remove(x)]
        coefs_lasso = coefs_lasso.transpose().abs()
        # filter list on these variables
        coefs_lasso = coefs_lasso.loc[Non_ignored_list]
    else:
        coefs_lasso = coefs_lasso.transpose().abs()
          
    variable_importance_list = []

    for colname in coefs_lasso.columns:
        
        intermediate = coefs_lasso[[colname]].sort_values(by = colname, ascending = False).head(num_variables)
        
        # store alpha as string
        string_alpha = 'Alpha = ' + str(colname)
        
        # store R^2 value as string for this alpha
        idx = coefs_lasso.columns.get_loc(colname)
        string_R_Squared = 'R^2 = ' + str(round(score_lasso[idx], 3))
        
        # store list of top # of variable names for this iteration of alpha
        var_names = []
        for row in intermediate.index: 
            var_names.append(row)
        
        # create list of all important information for each iteration of alpha
        if values == True:
            variable_importance_list.append([string_alpha, var_names, string_R_Squared, intermediate[colname].values.tolist()]) 
        else:
            variable_importance_list.append([string_alpha, var_names, string_R_Squared])
    
#     print(coefs_lasso)
    return variable_importance_list

#### Get list of upgradable variables and find the variable names we wish to ignore

#### Checking to see type of upgradable variables

In [4]:
from read_path_module import read_data_relative_path

cost_of_improvements = read_data_relative_path(relative_dataset_path = './data/kaggle/created/cost_of_improvements_V2.csv',
                        data_type='csv'
                       )

upgradable_variables = cost_of_improvements[cost_of_improvements.Upgradable == 'y']['Variable'].to_list()

numerical_list = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']
categorical_list = ['Alley', 'BldgType_group', 'BsmtCond_group', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual_group', 'CentralAir', 'Condition1_group', 'Electrical_group', 'ExterCond_group', 'ExterQual', 'Exterior1st_group', 'Exterior2nd_group', 'Fence', 'FireplaceQu', 'Foundation_group', 'GarageCond_group', 'GarageFinish', 'GarageQual', 'GarageType', 'HeatingQC_group', 'HouseStyle_group', 'KitchenQual', 'LandContour_group', 'LandSlope', 'LotConfig_group', 'LotShape_group', 'MS_Zoning_group', 'MasVnrType_group', 'Neighborhood', 'PavedDrive', 'RoofStyle_group', 'SaleCondition_group', 'SaleType_group']

upgradable_categorical = list(set(upgradable_variables) & set(categorical_list))
upgradable_numerical = list(set(upgradable_variables) & set(numerical_list))

print(len(upgradable_categorical))
print(len(upgradable_numerical))
print(len(upgradable_variables))


14
15
29


In [5]:
from read_path_module import read_data_relative_path

cost_of_improvements = read_data_relative_path(relative_dataset_path = './data/kaggle/created/cost_of_improvements.csv',
                        data_type='csv'
                       )

upgradable_variables = cost_of_improvements[cost_of_improvements.Upgradable == 'y']['Variable'].to_list()

def Diff(li1, li2): 
    return (list(set(li1) - set(li2))) 

ignore_list = Diff( X_train.columns.to_list(), upgradable_variables);
ignore_list;

#### Define alphas and variables we wish to ignore. Then run function.

In [6]:
def Diff(li1, li2): 
    return (list(set(li1) - set(li2))) 

# ignore_list = Diff( X_train.columns.to_list(), upgradable_variables);
# ignore_list = Diff( X_train.columns.to_list(), upgradable_numerical)
ignore_list = Diff( X_train.columns.to_list(), upgradable_variables)
# ignore_list = Diff( upgradable_variables, upgradable_numerical)
ignore_list.sort()
ignore_list
print(len(ignore_list))
print(len(upgradable_variables))
print(len(upgradable_numerical))
print(len(upgradable_categorical))


60
34
15
14


In [11]:
ignore_list = Diff( X_train.columns.to_list(), upgradable_variables);

# Defining inputs
alphas = np.linspace(0.00001,1000,200)
numerical_list = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']
categorical_list = ['Alley', 'BldgType_group', 'BsmtCond_group', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual_group', 'CentralAir', 'Condition1_group', 'Electrical_group', 'ExterCond_group', 'ExterQual', 'Exterior1st_group', 'Exterior2nd_group', 'Fence', 'FireplaceQu', 'Foundation_group', 'GarageCond_group', 'GarageFinish', 'GarageQual', 'GarageType', 'HeatingQC_group', 'HouseStyle_group', 'KitchenQual', 'LandContour_group', 'LandSlope', 'LotConfig_group', 'LotShape_group', 'MS_Zoning_group', 'MasVnrType_group', 'Neighborhood', 'PavedDrive', 'RoofStyle_group', 'SaleCondition_group', 'SaleType_group']

# Running function
variable_importance(X_train, y_train, alphas=alphas, num_variables=30, values=True, ignore_list=False, ignore_list_names=ignore_list)

  positive)


[['Alpha = 1e-05',
  ['GarageQual_Po',
   'GarageQual_Fa',
   'FireplaceQu_Fa',
   'GarageQual_TA',
   'FireplaceQu_Po',
   'FireplaceQu_No_fireplace',
   'GarageFinish_No_garage',
   'FireplaceQu_TA',
   'GarageType_BuiltIn',
   'FireplaceQu_Gd',
   'GarageQual_Gd',
   'KitchenQual_TA',
   'KitchenQual_Fa',
   'GarageType_Basment',
   'GarageType_Attchd',
   'KitchenQual_Gd',
   'GarageType_No_garage',
   'GarageType_Detchd',
   'GarageType_CarPort',
   'OverallQual',
   'BsmtFinType2_BLQ',
   'FullBath',
   'BsmtFinType2_LwQ',
   'BsmtFinType2_Rec',
   'HalfBath',
   'BsmtFinType1_GLQ',
   'BsmtFullBath',
   'BsmtCond_group_No_basement',
   'BsmtFinType2_Unf',
   'Fence_GdWo'],
  'R^2 = 0.826',
  [56923.240505330636,
   51114.91728116786,
   49944.57065933938,
   48280.51304774749,
   47646.72315458001,
   46477.8763336383,
   44032.39362639694,
   40682.79501744183,
   36629.873528339245,
   36513.25018572405,
   36510.051072037095,
   32541.429400964524,
   26867.99721359458,
   26

In [8]:
upgradable_variables

['RoofStyle',
 'RoofMatl',
 'BsmtCond',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'LowQualFinSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'KitchenQual',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [9]:
upgradable_categorical

['BsmtFinType2',
 'BsmtCond_group',
 'GarageQual',
 'PavedDrive',
 'RoofStyle_group',
 'BsmtFinType1',
 'HeatingQC_group',
 'KitchenQual',
 'GarageType',
 'FireplaceQu',
 'GarageFinish',
 'CentralAir',
 'Fence',
 'GarageCond_group']