In [18]:
import sys
sys.path.append('../ames') # path to the directory

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.api import OLS
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import re
import itertools
import statistics
%matplotlib inline 

In [19]:
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression

In [20]:
import config
from data_prep import clean, add_features

In [21]:
# Run the project's `clean` step on the configured CSV, then build the
# feature-augmented frame from a copy so `housing_initial` is kept as-is.
housing_initial = clean(config.HOUSING_CSV)
housing = add_features(housing_initial.copy())

In [22]:
# Columns that the suffix scan below must never add to `cols_to_dummy`.
lin_reg_cols = [
    'Toilets', 'WoodDeckSF', 'TotRmsAbvGrd', 'BedroomAbvGr', 'Showers',
    'FirePlaces', 'YearBuilt', 'YearRemodAdd',
    'TotalBsmtSF', 'DecadeBuilt',
]

# Columns that are one-hot encoded before regression (explicit seed list).
cols_to_dummy = [
    'Foundation', 'RoofStyle', 'SaleCondition',
    'Neighborhood', 'Electrical', 'HouseStyle', 'SaleType', 'MSSubClass',
    'LandContour', 'PavedDrive', 'GarageFinish', 'MSZoning',
]

# Every remaining column whose name ends with one of these suffixes is
# appended to `cols_to_dummy` as well.
dummy_suffix = ['Qual', 'Cond', 'Type']
_suffix_pattern = re.compile('(?:' + '|'.join(dummy_suffix) + ')$')

for col in housing.columns:
    if col in cols_to_dummy or col in lin_reg_cols:
        continue
    if _suffix_pattern.search(col) is not None:
        cols_to_dummy.append(col)

In [28]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression

def perform_lasso_ridge_test(set_df, test_type, alphas=None):
    """Fit a Ridge or Lasso model over several alphas and return the mean R^2.

    The model is fitted on every column of ``set_df`` except ``'SalePrice'``,
    which is used as the target. Each score is computed on the same rows the
    model was fitted on, so the result is an in-sample R^2.

    Parameters
    ----------
    set_df : pandas.DataFrame
        Predictor columns plus a ``'SalePrice'`` column. The frame is left
        unmodified.
    test_type : str
        ``'ridge'`` or ``'lasso'``.
    alphas : iterable of float, optional
        Regularisation strengths to evaluate. Defaults to
        ``[.0005, .0001, .01, 1, 5]``.

    Returns
    -------
    float
        Arithmetic mean of the R^2 scores, one score per alpha.

    Raises
    ------
    ValueError
        If ``test_type`` is not ``'ridge'``/``'lasso'`` or ``alphas`` is empty.
    KeyError
        If ``set_df`` has no ``'SalePrice'`` column.
    """
    # Validate before touching the data so a typo fails with a clear message
    # instead of an unbound-variable error further down.
    if test_type == 'ridge':
        model = Ridge()
    elif test_type == 'lasso':
        model = Lasso()
    else:
        raise ValueError(
            "test_type must be 'ridge' or 'lasso', got %r" % (test_type,)
        )

    alpha_range = [.0005, .0001, .01, 1, 5] if alphas is None else list(alphas)
    if not alpha_range:
        raise ValueError("alphas must contain at least one value")

    prices = set_df['SalePrice']
    # drop() returns a new frame, so the caller's DataFrame keeps its target
    # column (the previous `del` removed it in place).
    features = set_df.drop(columns='SalePrice')

    scores = []
    for alpha in alpha_range:
        model.set_params(alpha=alpha)
        model.fit(features, prices)
        scores.append(model.score(features, prices))
    return statistics.mean(scores)

In [29]:
def get_mll_vals(set_df):
    """Fit an ordinary least-squares model and return its in-sample R^2.

    Parameters
    ----------
    set_df : pandas.DataFrame
        Predictor columns plus a ``'SalePrice'`` column used as the target.
        The frame is left unmodified.

    Returns
    -------
    float
        R^2 of the fitted ``LinearRegression``, scored on the same rows it was
        fitted on.

    Raises
    ------
    KeyError
        If ``set_df`` has no ``'SalePrice'`` column.
    """
    prices = set_df['SalePrice']
    # drop() returns a new frame, so the caller's DataFrame keeps its target
    # column (the previous `del` removed it in place).
    features = set_df.drop(columns='SalePrice')

    lm = LinearRegression()
    lm.fit(features, prices)
    return lm.score(features, prices)

In [93]:
# def get_split_reg_split(data_frame,split_variable,reg_variables,split_type='mean',test_type='lasso'):
#     if split_type == 'mean':
#         test_val = data_frame[split_variable].mean()
#     elif split_type == 'median':
#         test_val = data_frame[split_variable].median()
#     else:
#         test_val = split_type
    
#     data_frame = data_frame[reg_variables+['SalePrice']]
    
#     dummy_cols = []
#     lin_cols = []
#     for col in data_frame.columns:
#         if col in cols_to_dummy:
#             dummy_cols.append(col)
#         else:
#             lin_cols.append(col)

#     data_frame = data_frame.fillna(0.0)
    
#     if len(dummy_cols)>0:
#         data_frame = pd.get_dummies(data_frame,columns=dummy_cols,drop_first=True)
    
#     high_df = data_frame[data_frame[split_variable]>test_val]
#     low_df = data_frame[data_frame[split_variable]<=test_val]
    
#     df_dict = {'high':high_df,'low':low_df}
#     #df_dict = {'low':low_df}
    

#     output = {}
#     for key,data in df_dict.items():
#         if test_type == 'lasso' or test_type == 'ridge':
#             val = perform_lasso_ridge_test(data,test_type)
#         elif test_type in ['mll','multiple_lin','multiple_linear','multiple']:
#             val = get_mll_vals(data)
#         output[key] = val
#     return output

In [117]:
def get_split_reg_split(df_dict_func, test_type='lasso'):
    """Score one regression per group and return the scores keyed by group.

    Parameters
    ----------
    df_dict_func : mapping of str -> pandas.DataFrame
        Group name mapped to that group's frame. Each frame must contain a
        ``'SalePrice'`` column. Every frame is copied before being handed to
        the scoring helper.
    test_type : str, default ``'lasso'``
        ``'lasso'`` or ``'ridge'`` to use ``perform_lasso_ridge_test``;
        ``'mll'``, ``'multiple_lin'``, ``'multiple_linear'`` or ``'multiple'``
        to use ``get_mll_vals``.

    Returns
    -------
    dict
        Group name -> score returned by the selected helper.

    Raises
    ------
    ValueError
        If ``test_type`` is not one of the recognised names.
    """
    from collections.abc import Mapping
    import warnings

    regularized = ('lasso', 'ridge')
    multiple_linear = ('mll', 'multiple_lin', 'multiple_linear', 'multiple')
    # Reject unknown test types up front; previously `val` was left unbound
    # (or stale from the previous group) when no branch matched.
    if test_type not in regularized + multiple_linear:
        raise ValueError(
            "test_type must be one of %r, got %r"
            % (regularized + multiple_linear, test_type)
        )

    if isinstance(df_dict_func, Mapping):
        # Use the argument that was passed in; the earlier version ignored it
        # and always read the module-level `df_dict`.
        frames = df_dict_func
    else:
        # Legacy fallback: earlier calls passed a non-mapping (e.g. the raw
        # DataFrame) and relied on the module-level `df_dict` being used.
        warnings.warn(
            "get_split_reg_split expects a mapping of group name -> DataFrame; "
            "falling back to the module-level `df_dict`.",
            stacklevel=2,
        )
        frames = df_dict

    output = {}
    for key, frame in frames.items():
        data = frame.copy()
        if test_type in regularized:
            output[key] = perform_lasso_ridge_test(data, test_type)
        else:
            output[key] = get_mll_vals(data)
    return output

In [118]:
# Settings for the high/low split regression.
split_variable = 'SalePrice'  # column compared against the threshold
split_type = 'mean'           # 'mean', 'median', or a literal threshold value
test_var = 'SalePrice'        # not referenced by the other cells shown here

# Columns kept (together with 'SalePrice') for the regression frame.
reg_variables = [
    'Neighborhood',
    'OverallQual',
    'BedroomAbvGr',
    'TotalBsmtSF',
    'DecadeBuilt',
    'TotRmsAbvGrd',
    'KitchenQual',
]

In [124]:
# Threshold: a named statistic of the split column, or the literal value
# supplied in `split_type`.
if split_type in ('mean', 'median'):
    test_val = getattr(housing[split_variable], split_type)()
else:
    test_val = split_type

# Keep only the regression inputs plus the target, on an independent copy.
sub_df = housing[reg_variables + ['SalePrice']].copy()

# Partition the selected columns by whether they are flagged for dummy encoding.
dummy_cols = [name for name in sub_df.columns if name in cols_to_dummy]
lin_cols = [name for name in sub_df.columns if name not in cols_to_dummy]

# Missing values are replaced with 0.0 before encoding.
sub_df = sub_df.fillna(0.0)

if dummy_cols:
    sub_df = pd.get_dummies(sub_df, columns=dummy_cols, drop_first=True)

# Rows strictly above the threshold form 'high'; everything else is 'low'.
high_df = sub_df[sub_df[split_variable] > test_val]
low_df = sub_df[sub_df[split_variable] <= test_val]

df_dict = {'high': high_df, 'low': low_df}

In [125]:
# Score each group in `df_dict` with the ridge variant.
test_type = 'ridge'
reg_output = get_split_reg_split(df_dict, test_type)

In [126]:
# Show the per-group scores from the ridge run.
reg_output

{'high': 0.7285826708363523, 'low': 0.6562133786218576}

In [122]:
# Pass the high/low frame dictionary explicitly: `get_split_reg_split` works on
# a mapping of group name -> DataFrame, not on the raw `housing` frame.
reg_output = get_split_reg_split(df_dict, test_type='multiple')

In [123]:
# Show the per-group scores from the multiple-linear-regression run.
reg_output

{'high': 0.7302952956044477, 'low': 0.6584285597421635}