In [26]:
import sys
sys.path.append('../ames') # path the the directory

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.api import OLS
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import re
import itertools
import statistics
%matplotlib inline 

In [2]:
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression

In [4]:
import config
from data_prep import clean, add_features

In [5]:
# Load the cleaned data once; hand add_features a copy so that
# housing_initial is not the object passed into it.
housing_initial = clean(config.HOUSING_CSV)
housing = add_features(housing_initial.copy())

In [6]:
# Columns that are never auto-added to the dummy list below.
lin_reg_cols = [
    'Toilets', 'WoodDeckSF', 'TotRmsAbvGrd', 'BedroomAbvGr', 'Showers',
    'FirePlaces', 'YearBuilt', 'YearRemodAdd',
    'TotalBsmtSF', 'DecadeBuilt',
]

# Columns that are dummy-encoded before fitting (seed list; extended below).
cols_to_dummy = [
    'Foundation', 'RoofStyle', 'SaleCondition',
    'Neighborhood', 'Electrical', 'HouseStyle', 'SaleType', 'MSSubClass',
    'LandContour', 'PavedDrive', 'GarageFinish', 'MSZoning',
]

# Any remaining column whose name ends in one of these suffixes is also
# added to cols_to_dummy.
dummy_suffix = ['Qual', 'Cond', 'Type']
for col in housing.columns:
    # Skip columns that already have an explicit role.
    if col in cols_to_dummy or col in lin_reg_cols:
        continue
    if any(re.search(r'%s$'%suff, col) is not None for suff in dummy_suffix):
        cols_to_dummy.append(col)

In [18]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression

def perform_lasso_ridge_test(set_df, test_type, alphas=(.0005, .0001, .01)):
    """Fit a Ridge or Lasso model for several alphas and average the R² scores.

    The data is split 80/20 into train and test sets (``random_state=42``).
    One fit is performed per alpha on the same split, and the train and test
    R² values are averaged across all alphas.

    Parameters
    ----------
    set_df : pandas.DataFrame
        Predictor columns plus a ``'SalePrice'`` target column. The frame is
        left unmodified.
    test_type : str
        ``'ridge'`` or ``'lasso'``.
    alphas : iterable of float, optional
        Regularisation strengths to fit. Defaults to the three values that
        were previously hard-coded.

    Returns
    -------
    tuple of float
        ``(mean train R², mean test R²)``.

    Raises
    ------
    ValueError
        If ``test_type`` is neither ``'ridge'`` nor ``'lasso'``.
    statistics.StatisticsError
        If ``alphas`` is empty (there is nothing to average).
    """
    if test_type == 'ridge':
        test = Ridge()
    elif test_type == 'lasso':
        test = Lasso(max_iter=10000)
    else:
        # Fail loudly instead of hitting an UnboundLocalError further down.
        raise ValueError(
            "test_type must be 'ridge' or 'lasso', got %r" % (test_type,)
        )

    # Separate target and predictors without mutating the caller's frame.
    prices = set_df['SalePrice']
    features = set_df.drop(columns='SalePrice')

    X_train, X_test, y_train, y_test = train_test_split(
        features, prices, test_size=.2, random_state=42
    )

    R2_train = []
    R2_test = []
    for alpha in alphas:
        # The same estimator object is re-fitted with each alpha.
        test.set_params(alpha=alpha)
        test.fit(X_train, y_train)
        R2_train.append(test.score(X_train, y_train))
        R2_test.append(test.score(X_test, y_test))
    return (statistics.mean(R2_train), statistics.mean(R2_test))

In [19]:
def get_mll_vals(set_df):
    """Fit a multiple linear regression and return its in-sample R².

    The model is fitted and scored on the same rows, so the returned value
    is a training R², not a hold-out score.

    Parameters
    ----------
    set_df : pandas.DataFrame
        Predictor columns plus a ``'SalePrice'`` target column. The frame is
        left unmodified.

    Returns
    -------
    float
        R² of the fitted ``LinearRegression`` on ``set_df``.
    """
    # Separate target and predictors without mutating the caller's frame.
    prices = set_df['SalePrice']
    features = set_df.drop(columns='SalePrice')

    lm = LinearRegression()
    lm.fit(features, prices)

    return lm.score(features, prices)

In [20]:
def get_split_reg_split(df_dict_func, test_type='lasso'):
    """Run the chosen regression on every frame in a dict of data splits.

    Parameters
    ----------
    df_dict_func : dict
        Maps a split key to a frame containing the predictors and a
        ``'SalePrice'`` column. Each frame is copied before being passed on,
        so the inputs are not modified.
    test_type : str, optional
        ``'lasso'`` or ``'ridge'`` to use ``perform_lasso_ridge_test``; one of
        ``'mll'``, ``'multiple_lin'``, ``'multiple_linear'``, ``'multiple'``
        to use ``get_mll_vals``.

    Returns
    -------
    dict
        Maps each split key to the value returned by the selected routine.

    Raises
    ------
    ValueError
        If ``test_type`` is not one of the recognised names.
    """
    regularized_types = ('lasso', 'ridge')
    linear_types = ('mll', 'multiple_lin', 'multiple_linear', 'multiple')
    if test_type not in regularized_types + linear_types:
        # Previously an unknown type left the result variable unbound.
        raise ValueError(
            "unknown test_type %r; expected one of %s"
            % (test_type, ', '.join(regularized_types + linear_types))
        )

    output = {}
    # Iterate the argument, not a notebook-level global of a similar name.
    for key, data_start in df_dict_func.items():
        data = data_start.copy()
        if test_type in regularized_types:
            output[key] = perform_lasso_ridge_test(data, test_type)
        else:
            output[key] = get_mll_vals(data)
    return output

Split Based On Being Above/Below A Specific Value

In [27]:
# split_variable = 'SalePrice'
# reg_variables = ['Neighborhood','OverallQual','BedroomAbvGr','TotalBsmtSF','DecadeBuilt','TotRmsAbvGrd','KitchenQual']
# test_var = 'SalePrice'
# split_type = 'mean'

In [28]:
# if split_type == 'mean':
#     test_val = housing[split_variable].mean()
# elif split_type == 'median':
#     test_val = housing[split_variable].median()
# else:
#     test_val = split_type

# sub_df = housing.copy()
# sub_df = sub_df.fillna(0.0)
# sub_df = sub_df[reg_variables+['SalePrice']]

# dummy_cols = []
# lin_cols = []
# for col in sub_df.columns:
#     if col in cols_to_dummy:
#         dummy_cols.append(col)
#     else:
#         lin_cols.append(col)

# if len(dummy_cols)>0:
#     sub_df = pd.get_dummies(sub_df,columns=dummy_cols,drop_first=True)

# high_df = sub_df[sub_df[split_variable]>test_val]
# low_df = sub_df[sub_df[split_variable]<=test_val]

# df_dict = {'high':high_df,'low':low_df}

Split Breaking Out Each Value Of One Dimension Separately

In [38]:
# Predictors to keep in every split, and the column to split on
# ('All' means a single split containing every row).
reg_variables = ['Neighborhood','OverallQual','BedroomAbvGr','TotalBsmtSF','DecadeBuilt','TotRmsAbvGrd','KitchenQual']
dimension = 'All'

# One split per distinct value of the dimension, or a single 'All' split.
all_vals = ['All'] if dimension == 'All' else housing[dimension].unique()

df_dict = {}
for val in all_vals:
    temp = housing.copy()
    if dimension != 'All':
        temp = temp[temp[dimension] == val]

    temp = temp[reg_variables + ['SalePrice']]

    # Partition the kept columns into those to dummy-encode and the rest.
    dummy_cols = [col for col in temp.columns if col in cols_to_dummy]
    lin_cols = [col for col in temp.columns if col not in cols_to_dummy]

    if dummy_cols:
        temp = pd.get_dummies(temp, columns=dummy_cols, drop_first=True)

    # Only keep splits with more than 30 rows.
    length = len(temp)
    if length > 30:
        df_dict[val] = temp

In [39]:
# List the split keys stored in df_dict (splits with 30 or fewer rows were skipped).
df_dict.keys()

dict_keys(['All'])

In [40]:
# Choose the model type and run it on the prepared splits.
test_type = 'lasso'
reg_output = get_split_reg_split(df_dict,test_type=test_type)

In [41]:
# For 'lasso'/'ridge', each value is (mean train R², mean test R²) averaged over the alphas tried.
reg_output

{'All': (0.8342869740633587, 0.8388934576848128)}

In [16]:
# reg_output = get_split_reg_split(housing,test_type='multiple')

In [17]:
# Show the parameters of a default-constructed Lasso for reference.
Lasso().get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'normalize': False,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}