In [1]:
import sys
sys.path.append('../ames') # path to the directory

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.api import OLS
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import re
import itertools
import statistics
%matplotlib inline 

In [2]:
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression

In [3]:
import config
from data_prep import clean, add_features

In [4]:
# Keep the output of clean() untouched in `housing_initial`; `housing` is the
# working frame produced by running add_features() on a copy of it.
housing_initial = clean(config.HOUSING_CSV)
housing = add_features(housing_initial.copy())

In [5]:
# Columns that are never added to the dummy list by the suffix rule below,
# even when their name ends in one of the suffixes.
# NOTE(review): 'FirePlaces' may not match the dataset's actual column
# spelling — confirm against housing.columns.
lin_reg_cols = [
    'Toilets', 'WoodDeckSF', 'TotRmsAbvGrd', 'BedroomAbvGr', 'Showers',
    'FirePlaces', 'YearBuilt', 'YearRemodAdd',
    'TotalBsmtSF', 'DecadeBuilt',
]

# Columns that are one-hot encoded whenever they appear in a model frame.
cols_to_dummy = [
    'Foundation', 'RoofStyle', 'SaleCondition',
    'Neighborhood', 'Electrical', 'HouseStyle', 'SaleType', 'MSSubClass',
    'LandContour', 'PavedDrive', 'GarageFinish', 'MSZoning',
]

# Every remaining column whose name ends in one of these suffixes is also
# appended to cols_to_dummy.
dummy_suffix = ['Qual', 'Cond', 'Type']
for col in housing.columns:
    if col in cols_to_dummy or col in lin_reg_cols:
        continue
    if any(re.search(r'%s$'%suff, col) is not None for suff in dummy_suffix):
        cols_to_dummy.append(col)

In [6]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression

def perform_lasso_ridge_test(set_df, test_type, alphas=(.0005, .0001, .01)):
    """Fit a Ridge or Lasso model at several alphas and return the mean R².

    The model is fitted and scored on the same rows, so the returned value
    is an in-sample R² averaged over ``alphas``.

    Parameters
    ----------
    set_df : pandas.DataFrame
        Frame containing a 'SalePrice' column (the target); every other
        column is used as a predictor. The frame is not modified.
    test_type : str
        'ridge' or 'lasso'.
    alphas : iterable of float, optional
        Regularization strengths to try. Defaults to the grid the notebook
        originally hard-coded.

    Returns
    -------
    float
        ``statistics.mean`` of the per-alpha scores.

    Raises
    ------
    ValueError
        If ``test_type`` is neither 'ridge' nor 'lasso'.
    """
    if test_type == 'ridge':
        test = Ridge()
    elif test_type == 'lasso':
        test = Lasso()
    else:
        # Previously this fell through and failed later with an unbound local.
        raise ValueError(
            "test_type must be 'ridge' or 'lasso', got %r" % (test_type,)
        )

    prices = set_df['SalePrice']
    # drop() returns a new frame, so the caller's DataFrame keeps its target.
    features = set_df.drop(columns='SalePrice')

    scores = []
    for alpha in alphas:
        test.set_params(alpha=alpha)
        test.fit(features, prices)
        scores.append(test.score(features, prices))
    return statistics.mean(scores)

In [7]:
def get_mll_vals(set_df):
    """Fit an ordinary linear regression on ``set_df`` and return its R².

    The model is fitted and scored on the same rows, so the result is an
    in-sample R².

    Parameters
    ----------
    set_df : pandas.DataFrame
        Frame containing a 'SalePrice' column (the target); every other
        column is used as a predictor. The frame is not modified.

    Returns
    -------
    float
        The value of ``LinearRegression.score`` on the fitting data.
    """
    prices = set_df['SalePrice']
    # drop() returns a new frame, so the caller's DataFrame keeps its target.
    features = set_df.drop(columns='SalePrice')

    lm = LinearRegression()
    lm.fit(features, prices)
    return lm.score(features, prices)

In [8]:
def get_split_reg_split(df_dict_func, test_type='lasso'):
    """Score one regression per frame in ``df_dict_func``.

    Parameters
    ----------
    df_dict_func : dict
        Mapping of label -> DataFrame. Each frame is copied before being
        handed to the scoring helper, so the originals are not altered.
    test_type : str, optional
        'lasso' or 'ridge' to use ``perform_lasso_ridge_test``; 'mll',
        'multiple_lin', 'multiple_linear' or 'multiple' to use
        ``get_mll_vals``.

    Returns
    -------
    dict
        Mapping of label -> score returned by the selected helper.

    Raises
    ------
    ValueError
        If ``test_type`` is not one of the supported names.
    """
    regularized = ('lasso', 'ridge')
    multiple_linear = ('mll', 'multiple_lin', 'multiple_linear', 'multiple')
    if test_type not in regularized and test_type not in multiple_linear:
        raise ValueError(
            "unsupported test_type %r; expected one of %r"
            % (test_type, regularized + multiple_linear)
        )

    output = {}
    # Iterate over the argument; the previous version ignored it and read
    # the notebook-global `df_dict` instead.
    for key, data_start in df_dict_func.items():
        data = data_start.copy()
        if test_type in regularized:
            output[key] = perform_lasso_ridge_test(data, test_type)
        else:
            output[key] = get_mll_vals(data)
    return output

In [9]:
# Column whose value decides which half a row lands in.
split_variable = 'SalePrice'
# 'mean' or 'median' of split_variable; any other value is used directly
# as the threshold.
split_type = 'mean'
# Not referenced by the cells visible here.
test_var = 'SalePrice'
# Predictors kept in the model frame (SalePrice is appended separately).
reg_variables = [
    'Neighborhood',
    'OverallQual',
    'BedroomAbvGr',
    'TotalBsmtSF',
    'DecadeBuilt',
    'TotRmsAbvGrd',
    'KitchenQual',
]

In [10]:
# Threshold for the split: mean/median of the split column, or the literal
# value of split_type when it is neither keyword.
if split_type == 'mean':
    test_val = housing[split_variable].mean()
elif split_type == 'median':
    test_val = housing[split_variable].median()
else:
    test_val = split_type

# fillna() already returns a new frame, so `housing` itself is untouched.
housing_filled = housing.fillna(0.0)

# Build the row masks from the filled frame BEFORE selecting columns and
# dummy-encoding. Reading split_variable from sub_df afterwards raised
# KeyError whenever the split column was not in reg_variables or had been
# replaced by its dummy columns.
is_high = housing_filled[split_variable] > test_val
is_low = housing_filled[split_variable] <= test_val

sub_df = housing_filled[reg_variables+['SalePrice']]

# Both lists are kept as notebook-level names in case other cells read them.
dummy_cols = [col for col in sub_df.columns if col in cols_to_dummy]
lin_cols = [col for col in sub_df.columns if col not in cols_to_dummy]

if dummy_cols:
    sub_df = pd.get_dummies(sub_df,columns=dummy_cols,drop_first=True)

high_df = sub_df[is_high]
low_df = sub_df[is_low]

df_dict = {'high':high_df,'low':low_df}

In [14]:
# NOTE: this cell rebinds `reg_variables` and `df_dict`.
reg_variables = ['TotalLivingArea','OverallCond']
dimension = 'Neighborhood'

# One model frame per distinct value of `dimension`, keyed by that value.
df_dict = {}
for val in housing[dimension].unique():
    # Rows for this value, restricted to the predictors plus the target.
    subset = housing.loc[housing[dimension]==val, reg_variables+['SalePrice']]

    dummy_cols = [col for col in subset.columns if col in cols_to_dummy]
    lin_cols = [col for col in subset.columns if col not in cols_to_dummy]

    if dummy_cols:
        subset = pd.get_dummies(subset,columns=dummy_cols,drop_first=True)

    # Groups with 30 rows or fewer are left out.
    if len(subset) > 30:
        df_dict[val] = subset

In [15]:
# Show which keys are currently held in df_dict.
df_dict.keys()

dict_keys(['SWISU', 'Edwards', 'IDOTRR', 'OldTown', 'NWAmes', 'Mitchel', 'NridgHt', 'NAmes', 'CollgCr', 'SawyerW', 'MeadowV', 'Gilbert', 'Timber', 'Somerst', 'BrkSide', 'NoRidge', 'Sawyer', 'Crawfor', 'ClearCr', 'StoneBr'])

In [16]:
# Score every frame in df_dict with the chosen model type; the result is a
# dict keyed like df_dict.
# NOTE(review): the truncated lines printed under this cell look like
# scikit-learn coordinate-descent warnings — confirm and address if so.
test_type = 'lasso'
reg_output = get_split_reg_split(df_dict,test_type=test_type)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [18]:
# Display the scores stored in reg_output.
reg_output

{'SWISU': 0.6671377410604785,
 'Edwards': 0.46552941098232054,
 'IDOTRR': 0.6887827551515066,
 'OldTown': 0.6444518829700889,
 'NWAmes': 0.6768281834492278,
 'Mitchel': 0.5288598372980658,
 'NridgHt': 0.7053054567165835,
 'NAmes': 0.6172538204577194,
 'CollgCr': 0.6298925045937475,
 'SawyerW': 0.7684615550883821,
 'MeadowV': 0.7746127562453501,
 'Gilbert': 0.4207358298223686,
 'Timber': 0.47265198836610706,
 'Somerst': 0.5684699580477014,
 'BrkSide': 0.6603960476823437,
 'NoRidge': 0.7110002987362727,
 'Sawyer': 0.48333621022914774,
 'Crawfor': 0.7222331466041012,
 'ClearCr': 0.5081698531354618,
 'StoneBr': 0.6196613143460676}

In [17]:
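# Disabled experiment. get_split_reg_split expects a {label: DataFrame} mapping,
# so pass df_dict rather than the housing DataFrame itself.
# reg_output = get_split_reg_split(df_dict,test_type='multiple')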