In [1]:
import sys
sys.path.append('../ames') # path to the directory

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.api import OLS
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import re
import itertools
import statistics
%matplotlib inline 

In [2]:
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression

In [3]:
import config
from data_prep import clean, add_features

In [4]:
# Keep clean()'s output in housing_initial and hand add_features() a copy of it.
housing_initial = clean(config.HOUSING_CSV)
housing = add_features(housing_initial.copy())

In [5]:
# Columns fed to the regressions as-is (no one-hot encoding).
# NOTE(review): 'FirePlaces' does not appear in the recorded `lin_reg_df` output
# further down, so it may not match an actual column name -- confirm the spelling.
lin_reg_cols = [
    'Toilets', 'WoodDeckSF', 'TotRmsAbvGrd', 'BedroomAbvGr', 'Showers',
    'FirePlaces', 'YearBuilt', 'YearRemodAdd',
    'TotalBsmtSF', 'DecadeBuilt',
]

# Columns that are one-hot encoded before fitting.
cols_to_dummy = [
    'Foundation', 'RoofStyle', 'SaleCondition',
    'Neighborhood', 'Electrical', 'HouseStyle', 'SaleType', 'MSSubClass',
    'LandContour', 'PavedDrive', 'GarageFinish', 'MSZoning',
]

# Any other column whose name ends with one of these suffixes is dummied too.
dummy_suffix = ['Qual', 'Cond', 'Type']
suffixes = tuple(dummy_suffix)  # str.endswith accepts a tuple of candidates
for col in housing.columns:
    if col in cols_to_dummy or col in lin_reg_cols:
        continue
    if col.endswith(suffixes):
        cols_to_dummy.append(col)

In [6]:
def get_mll_vals(temp_df, sale_prices):
    """Fit a ``LinearRegression`` and report its fit on the same data.

    Parameters
    ----------
    temp_df
        Feature matrix passed to ``LinearRegression.fit``.
    sale_prices
        Target values, aligned row-for-row with ``temp_df``.

    Returns
    -------
    dict
        ``{'r^2': ..., 'rss': ...}`` where ``'r^2'`` is ``lm.score`` evaluated
        on the training data and ``'rss'`` is the sum of squared residuals on
        that same data.
    """
    lm = LinearRegression()
    lm.fit(temp_df, sale_prices)

    residuals = sale_prices - lm.predict(temp_df)
    return {
        'r^2': lm.score(temp_df, sale_prices),
        'rss': np.sum(residuals ** 2),
    }

In [19]:
def do_multiple_lin(data_frame, all_cols, dummy_candidates=None):
    """Regress ``SalePrice`` on the given columns and return the fit's R².

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Source data; must contain a ``'SalePrice'`` column. It is not modified.
    all_cols : sequence of str
        Predictor column names. Those found in ``dummy_candidates`` are
        one-hot encoded (first level dropped); the rest are used as-is.
    dummy_candidates : collection of str, optional
        Names to treat as categorical. Defaults to the module-level
        ``cols_to_dummy`` list.

    Returns
    -------
    tuple
        ``(name, r2)`` where ``name`` is ``'_'.join(all_cols)`` and ``r2`` is
        the ``'r^2'`` entry returned by ``get_mll_vals``.
    """
    import warnings

    if dummy_candidates is None:
        dummy_candidates = cols_to_dummy

    dummy_cols = [col for col in all_cols if col in dummy_candidates]
    lin_cols = [col for col in all_cols if col not in dummy_candidates]

    # DataFrame.filter(items=...) quietly ignores labels that are not present,
    # which would leave the result labelled with a feature that was never used.
    missing = [col for col in all_cols if col not in data_frame.columns]
    if missing:
        warnings.warn(
            'do_multiple_lin: columns not found in data_frame: %s' % missing,
            stacklevel=2,
        )

    # Select first, then fill NaNs, so only the needed columns are copied.
    model_frame = data_frame.filter(items=lin_cols + dummy_cols + ['SalePrice'])
    model_frame = model_frame.fillna(0.0)
    if dummy_cols:
        model_frame = pd.get_dummies(model_frame, columns=dummy_cols, drop_first=True)

    # pop() removes the target from the local feature frame and returns it.
    sale_prices = model_frame.pop('SalePrice')
    name = '_'.join(all_cols)

    output = get_mll_vals(model_frame, sale_prices)
    return (name, output['r^2'])

In [23]:
# Full search (currently disabled): every num_variables-sized subset of predictors.
# num_variables = 3
# combos = list(itertools.combinations(lin_reg_cols+cols_to_dummy, num_variables))

# Hand-picked combinations evaluated in place of the full search.
combos = [
    ('Toilets', 'WoodDeckSF', 'TotRmsAbvGrd'),
    ('WoodDeckSF', 'FirePlaces', 'MSZoning'),
]

In [24]:
# Maps each combination's label ('_'-joined column names) to the R² of its fit.
values_dict = {}

for count, combo in enumerate(combos, start=1):
    combo_name, r_squared = do_multiple_lin(housing, combo)
    values_dict[combo_name] = r_squared
    # Progress marker for long runs: prints every 1000th combination.
    if count % 1000 == 0:
        print(count)

In [25]:
# One row per fitted combination, highest R² first, written to test.csv.
reg_vals_df = (
    pd.DataFrame(list(values_dict.items()), columns=['Attribute', 'R-Squared'])
    .sort_values('R-Squared', ascending=False)
)
reg_vals_df.to_csv('test.csv', index=False)

In [27]:
# NOTE(review): on a fresh kernel this raises NameError (see the recorded output
# below) because lin_reg_vals is first assigned in a later cell. Move this
# display after that cell, or drop it.
lin_reg_vals

NameError: name 'lin_reg_vals' is not defined

In [30]:
# Fit one OLS per candidate column against SalePrice plus an intercept.
# With a single predictor and an intercept, R² equals the squared correlation,
# so it is the same whichever of the two variables is treated as the response.
features = sm.add_constant(housing[['SalePrice']])
lin_reg_vals = {}

for col in lin_reg_cols:
    try:
        # Note: this writes the NaN-filled column back into `housing`.
        housing[col] = housing[col].fillna(0.0)
        model = OLS(housing[col], features).fit()
        lin_reg_vals[col] = model.rsquared
    except Exception as exc:
        # Still best-effort, but report which column was skipped and why
        # instead of dropping it silently (e.g. a misspelled column name).
        print(f'Skipping {col!r}: {type(exc).__name__}: {exc}')

# Re-order the dict by descending R².
lin_reg_vals = dict(sorted(lin_reg_vals.items(), key=lambda item: item[1], reverse=True))

In [32]:
# Tabulate the single-feature R² values (in lin_reg_vals' order) and save them.
lin_reg_df = pd.DataFrame(
    list(lin_reg_vals.items()),
    columns=['Attribute', 'R-Squared'],
)
lin_reg_df.to_csv('rsquared_lin.csv', index=False)

In [33]:
# Show the single-feature R² table; rows follow lin_reg_vals' descending order.
# NOTE(review): the recorded output has 9 rows for the 10 names in lin_reg_cols
# ('FirePlaces' is absent) -- presumably its fit raised and was skipped; confirm.
lin_reg_df

Unnamed: 0,Attribute,R-Squared
0,TotalBsmtSF,0.425808
1,Toilets,0.386412
2,Showers,0.354436
3,YearBuilt,0.296555
4,DecadeBuilt,0.29145
5,YearRemodAdd,0.264937
6,TotRmsAbvGrd,0.240302
7,WoodDeckSF,0.11111
8,BedroomAbvGr,0.023081
