In [531]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.linear_model import LinearRegression
%matplotlib inline 

housing_initial = pd.read_csv('./data/Ames_Housing_Price_Data.csv', index_col=0)

In [532]:
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression

In [533]:
def adjust_data_frame(data_frame):
    rename = {'heatingqc':'heatingqual'}
    for col in data_frame:
        new_col = col.lower()
        data_frame = data_frame.rename(columns={col:new_col})
        
    data_frame['has_pool'] = data_frame['poolarea'].apply(lambda x: 1 if x > 0 else 0)
    data_frame['centralair'] = data_frame['centralair'].apply(lambda x: 1 if x == 'Y' else 0)
    data_frame['toilets'] = data_frame['halfbath'] + data_frame['fullbath']
    data_frame['showers'] = data_frame['fullbath']
    
    data_frame = data_frame.rename(columns=rename)
    return data_frame

In [549]:
housing = housing_initial.copy()
housing = adjust_data_frame(housing)

In [550]:
def get_mll_vals(temp_df):
    lm = LinearRegression()
    lm.fit(temp_df, sale_prices)

    r2 = lm.score(temp_df, sale_prices)
    residuals = sale_prices - lm.predict(temp_df)
    rss = np.sum((residuals)**2)

    coef = lm.coef_
    intercept = lm.intercept_
    prfx_vals = {'r^2':r2,
                 #'coef':coef,
                 #'intercept':intercept,
                 'rss':rss
                }
    return prfx_vals

In [551]:
# def do_multiple_lin(data_frame,col_type):
#     prfx_options = ['overall','exter','bsmt','kitchen','garage','heating','bldgtype','centralair','roofmatl','roofstyle','has_pool','foundation']
    
#     col_df = data_frame.filter(regex='%s$|saleprice'%col_type)
#     #quality_df.isnull().sum(axis=0)

#     dummy_cols = list(col_df.columns).remove('saleprice')
#     #dummy_cols.remove('saleprice')
#     col_df = pd.get_dummies(col_df,columns=dummy_cols,drop_first=True)
    
#     sale_prices = quality_df['saleprice']
#     values = {}
    
#     for prfx in prfx_options:
#         temp_df = col_df.copy()
#         del temp_df['saleprice']
#         temp_df = temp_df.filter(regex=prfx)
#         if temp_df.shape[1] > 0:
#             values[prfx] = get_mll_vals(temp_df)
#     return values

In [552]:
def do_multiple_lin(data_frame,col_types):
    prfx_options = ['overall','exter','bsmt','heating','masvnr','kitchen','lot','neighborhood','garage']
    
    values = {}
    
    for col_type in col_types:
        col_df = data_frame.filter(regex='%s$|saleprice'%col_type)
        if col_type in col_df:
            prfx_options_use = [col_type]
        else:
            prfx_options_use = prfx_options

        dummy_cols = list(col_df.columns).remove('saleprice')
        col_df = pd.get_dummies(col_df,columns=dummy_cols,drop_first=True)

        sale_prices = quality_df['saleprice']
        for prfx in prfx_options_use:
            temp_df = col_df.copy()
            del temp_df['saleprice']
            temp_df = temp_df.filter(regex=prfx)
            if temp_df.shape[1] > 0:
                #print(prfx)
                if col_type == prfx:
                    name = name = col_type
                else:
                    name = '%s%s' % (prfx,col_type)
                #print(name)
                output = get_mll_vals(temp_df)
                values[name] = output['r^2']
    return values

In [553]:
mll_cols = ['qual','cond','heating','foundation','roofstyle','roofmatl','bldgtype',
            'neighborhood','type','electrical','housestyle','functional','saletype','mssubclass',
            'landcontour','yrsold','mosold','paveddrive','housestyle','garagefinish','mszoning','fence'
           ]

#mll_cols = ['qual']

categorical_vals = do_multiple_lin(housing,mll_cols)
categorical_vals = dict(sorted(categorical_vals.items(), key=lambda item: item[1],reverse=True))

In [554]:
categorical_vals

{'overallqual': 0.6248925373371652,
 'neighborhood': 0.5598077663859118,
 'exterqual': 0.4853009289061254,
 'kitchenqual': 0.45785794401358726,
 'bsmtqual': 0.3162074986324529,
 'foundation': 0.2671350890643003,
 'garagetype': 0.23154901739229283,
 'heatingqual': 0.20235536653584862,
 'garagefinish': 0.17544927153842826,
 'masvnrtype': 0.17171662226724904,
 'mszoning': 0.11600712743510999,
 'garagequal': 0.0832951603369062,
 'saletype': 0.08130860205161039,
 'garagecond': 0.07815590410722295,
 'paveddrive': 0.07247451559741591,
 'housestyle': 0.07103279739739266,
 'roofstyle': 0.05870948148127286,
 'electrical': 0.0564879727663965,
 'bsmtcond': 0.04840595199785991,
 'fence': 0.035707927276794726,
 'bldgtype': 0.03303709816401801,
 'landcontour': 0.03234993583246448,
 'extercond': 0.021968757422164087,
 'roofmatl': 0.020763632343995697,
 'functional': 0.01366006518798657,
 'overallcond': 0.010219487984851061,
 'heating': 0.007934972875623836,
 'mssubclass': 0.007155008656742345,
 'mosol

In [555]:
lin_reg_cols = ['toilets','wooddecksf','totrmsabvgrd','bedroomabvgr','showers','kitchenabvgr',
                'fireplaces','yearbuilt','yearremodadd','enclosedporch','screenporch',
                'totalbsmtsf','3ssnporch'
               ]
for col in housing.columns:
    if 'area' in col:
        lin_reg_cols.append(col)

In [556]:
features = sm.add_constant(housing[['saleprice']])
lin_reg_vals = {}

for col in lin_reg_cols:
    housing[col] = housing[col].fillna(0.0)
    ols_sm  = OLS(housing[col], features)
    model = ols_sm.fit()
    lin_reg_vals[col] = model.rsquared
    
lin_reg_vals = dict(sorted(lin_reg_vals.items(), key=lambda item: item[1],reverse=True))

In [557]:
lin_reg_vals

{'grlivarea': 0.5183732056699005,
 'totalbsmtsf': 0.42574893740677966,
 'garagearea': 0.4028044155186665,
 'yearbuilt': 0.2963386692622054,
 'toilets': 0.29467390233984414,
 'showers': 0.28620549197746104,
 'yearremodadd': 0.2645135075446047,
 'masvnrarea': 0.2484343306454504,
 'totrmsabvgrd': 0.24031508853002126,
 'fireplaces': 0.23832326199307108,
 'wooddecksf': 0.11104333043218417,
 'lotarea': 0.07531921127368835,
 'bedroomabvgr': 0.02304512827536065,
 'enclosedporch': 0.015186832015373963,
 'screenporch': 0.01475335746883455,
 'kitchenabvgr': 0.01299070277321912,
 '3ssnporch': 0.0009771955140202015,
 'poolarea': 0.0009513780180609999}

In [561]:
lin_reg_df = pd.DataFrame({'Attribute':lin_reg_vals.keys(),'R-Squared':lin_reg_vals.values()})
lin_reg_df['Reg Type'] = 'Categorical'
categorical_df = pd.DataFrame({'Attribute':categorical_vals.keys(),'R-Squared':categorical_vals.values()})
categorical_df['Reg Type'] = 'Multiple'

combined = lin_reg_df.append(categorical_df)
combined = combined.sort_values('R-Squared',ascending=False)
combined.to_csv('rsquared.csv')

In [562]:
for i in housing.columns:
    if i not in combined['Attribute'].unique():
        print(i)

pid
saleprice
lotfrontage
street
alley
lotshape
utilities
lotconfig
landslope
condition1
condition2
exterior1st
exterior2nd
bsmtexposure
bsmtfintype1
bsmtfinsf1
bsmtfintype2
bsmtfinsf2
bsmtunfsf
centralair
1stflrsf
2ndflrsf
lowqualfinsf
bsmtfullbath
bsmthalfbath
fullbath
halfbath
fireplacequ
garageyrblt
garagecars
openporchsf
poolqc
miscfeature
miscval
salecondition
has_pool
