In [1]:
import sys
sys.path.append('data_preprocessing')
import summary
import outliers
import normalize
import learning_helpers

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pandas as pd

# Get Data

In [2]:
# Read the four Ames CSVs in a fixed order: the plain train/test split first,
# then the geodemographic ("geodemo") train/test split.
train_data, test_data, geo_train_data, geo_test_data = map(
    pd.read_csv,
    [
        '../data/input/ames_train.csv',
        '../data/input/ames_test.csv',
        '../data/input/ames_geodemo_train.csv',
        '../data/input/ames_geodemo_test.csv',
    ],
)

# Normalize and remove outliers

In [3]:
def _fill_trim_normalize(frame):
    """Fill in missing values, remove outliers, then normalize ``frame``, in that order.

    Returns whatever ``normalize.normalize`` returns for the processed frame.
    """
    frame = normalize.fill_in_missing_values(frame)
    frame = outliers.remove_outliers(frame)
    return normalize.normalize(frame)


# NOTE(review): each name is rebound to its processed result, so re-running this
# cell feeds already-processed frames back through the pipeline. Re-run the
# data-loading cell first when re-executing.
train_data = _fill_trim_normalize(train_data)
test_data = _fill_trim_normalize(test_data)
geo_train_data = _fill_trim_normalize(geo_train_data)
geo_test_data = _fill_trim_normalize(geo_test_data)

# Split into x and y

In [4]:
# Separate the regression target column ('log_SalePrice') from the feature
# columns for each of the four frames.
#
# `drop(columns=...)` is used rather than passing the axis positionally
# (`drop(label, 1)`): the positional form was deprecated in pandas 1.3 and
# raises a TypeError from pandas 2.0 onwards.
y_train = train_data['log_SalePrice']
X_train = train_data.drop(columns='log_SalePrice')
y_test = test_data['log_SalePrice']
X_test = test_data.drop(columns='log_SalePrice')

y_geo_train = geo_train_data['log_SalePrice']
X_geo_train = geo_train_data.drop(columns='log_SalePrice')
y_geo_test = geo_test_data['log_SalePrice']
X_geo_test = geo_test_data.drop(columns='log_SalePrice')

# After one-hot encoding, the train and test sets may not share the same feature columns,
# so drop every column that appears in only one of the two sets

In [5]:
# Keep only the feature columns that exist in BOTH the train and the test frame.
# Each frame keeps its own column order; only the unshared columns are removed.
#
# The unshared columns are dropped in a single `drop(columns=[...])` call per
# frame. This avoids the positional `axis` argument (`drop(col, 1)`), which
# raises a TypeError on pandas >= 2.0, and avoids re-copying the frame once per
# dropped column.
train_cols = list(X_train)
test_cols = list(X_test)
X_train = X_train.drop(columns=[col for col in train_cols if col not in test_cols])
X_test = X_test.drop(columns=[col for col in test_cols if col not in train_cols])

# Same alignment for the geodemographic feature set.
geo_train_cols = list(X_geo_train)
geo_test_cols = list(X_geo_test)
X_geo_train = X_geo_train.drop(
    columns=[col for col in geo_train_cols if col not in geo_test_cols]
)
X_geo_test = X_geo_test.drop(
    columns=[col for col in geo_test_cols if col not in geo_train_cols]
)


# Actually do learning

## Linear Regression

In [14]:
# Ordinary least squares on the base feature set.
# greedy_feature_selection hands back a (model, selected features, r2) triple;
# the last positional argument is presumably the minimum score gain needed to
# keep adding features -- TODO confirm against learning_helpers.
lr = linear_model.LinearRegression()
lr, features_lr, r2 = learning_helpers.greedy_feature_selection(
    lr,
    X_train, y_train,
    X_test, y_test,
    0.00001,
)
print('r2: {0}'.format(r2))

r2: 0.937996750965283


In [15]:
# Ordinary least squares on the geodemographic feature set.
# The estimator created here (`lr_geo`) is the one handed to the selection
# routine. Passing the `lr` object from the non-geo cell instead would leave
# this estimator unused and would couple the two experiments through a shared
# object (presumably refit in place by the helper -- confirm in learning_helpers).
lr_geo = linear_model.LinearRegression()
lr_geo, features_lr_geo, r2 = learning_helpers.greedy_feature_selection(
    lr_geo,
    X_geo_train, y_geo_train,
    X_geo_test, y_geo_test,
    0.00001,
)
print('r2: {0}'.format(r2))

r2: 0.9584085509120646


## Ridge

In [16]:
# Ridge regression (alpha=1.0) on the base feature set.
# NOTE(review): the returned model is bound to `ridg_clf` (sic), not
# `ridge_clf`; the spelling is kept because later cells may rely on it.
ridge_clf = linear_model.Ridge(alpha=1.0)
ridg_clf, features_ridge, r2 = learning_helpers.greedy_feature_selection(
    ridge_clf,
    X_train, y_train,
    X_test, y_test,
    0.00001,
)
print('r2: {0}'.format(r2))

r2: 0.9437696588481975


In [17]:
# Ridge regression (alpha=1.0) on the geodemographic feature set.
# The estimator created here (`ridge_clf_geo`) is the one passed to the
# selection routine, so this experiment does not reuse the `ridge_clf` object
# that belongs to the non-geo cell.
# The result name `ridg_clf_geo` (sic) is kept unchanged in case later cells
# refer to it.
ridge_clf_geo = linear_model.Ridge(alpha=1.0)
ridg_clf_geo, features_ridge_geo, r2 = learning_helpers.greedy_feature_selection(
    ridge_clf_geo,
    X_geo_train, y_geo_train,
    X_geo_test, y_geo_test,
    0.00001,
)
print('r2: {0}'.format(r2))

r2: 0.9566529135164344


## Lasso

In [8]:
# Lasso with cross-validated alpha (chosen from the candidates listed below)
# on the base feature set.
# NOTE(review): this cell passes 0.0001 as the last argument, whereas the
# linear and ridge cells pass 0.00001 -- confirm the difference is intended.
lasso = linear_model.LassoCV(
    alphas=[1, 0.1, 0.001, 0.0005],
    max_iter=100000,
)
lasso, features_lasso, r2 = learning_helpers.greedy_feature_selection(
    lasso,
    X_train, y_train,
    X_test, y_test,
    0.0001,
)
print('r2: {0}'.format(r2))

r2: 0.9293560800408697


In [12]:
# Lasso with cross-validated alpha on the geodemographic feature set.
# The estimator created here (`lasso_geo`) is the one passed to the selection
# routine. Passing `lasso` instead would reuse the object returned by the
# non-geo selection and leave this estimator unused.
# The trailing `True` presumably turns on per-feature progress printing (the
# captured output lists a feature name and a score per step) -- confirm in
# learning_helpers.
lasso_geo = linear_model.LassoCV(
    alphas=[1, 0.1, 0.001, 0.0005],
    max_iter=100000,
)
lasso_geo, features_lasso_geo, r2 = learning_helpers.greedy_feature_selection(
    lasso_geo,
    X_geo_train, y_geo_train,
    X_geo_test, y_geo_test,
    0.00001,
    True,
)
print('r2: {0}'.format(r2))

OverallQual
0.714470996797
log_GrLivArea
0.782531694672
BsmtFinSF1
0.828523092375
YearBuilt
0.854525910095
OverallCond
0.877249863608
LotArea
0.892557496367
TotalBsmtSF
0.899893310722
BsmtQual_Ex
0.905583611531
INCPHMBYX
0.909658026092
BldgType_Duplex
0.913261013623
SaleCondition_Abnorml
0.915962470266
Neighborhood_Crawfor
0.917911717
Neighborhood_MeadowV
0.919767665834
Condition1_Norm
0.921603036192
Functional_Typ
0.923569600292
BsmtExposure_No
0.925501537515
county
0.927210079417
GarageCond_TA
0.928697906167
KitchenQual_Ex
0.930185223428
HeatingQC_TA
0.931474429454
Fireplaces
0.932515163162
SaleType_New
0.933606263338
ExterCond_Fa
0.934413955803
YearRemodAdd
0.935293970518
ScreenPorch
0.936042816805
blkgrp
0.936866995967
POPDENBY
0.937388557384
Neighborhood_NWAmes
0.937915903349
log_1stFlrSF
0.938415363767




2ndFlrSF
0.93930728915




PavedDrive_Y
0.939795218682




EnclosedPorch
0.940310335417




HeatingQC_Fa
0.940676867667




Condition1_Artery
0.94101876675




BsmtFinSF2
0.941287341044




MSZoning_C (all)
0.941508063254




HIMEDBYX
0.941758633011




Neighborhood_Gilbert
0.941994779979




BedroomAbvGr
0.942204589506




MasVnrType_Stone
0.9423526357




LowQualFinSF
0.942492774749




HIAVGBYX
0.942601995636




POPBY
0.942751281533




Neighborhood_StoneBr
0.942884952867




ExterQual_Ex
0.943008512662




YrSold
0.943099322736




BsmtFullBath
0.943168194769




ExterCond_TA
0.943264171079




GarageFinish_Fin
0.943327621233




MasVnrType_None
0.943390748727




MasVnrArea
0.943523306718




GarageFinish_RFn
0.943582435897




Neighborhood_CollgCr
0.943614033265




3SsnPorch
0.943643559522




Neighborhood_NridgHt
0.943669668732




Neighborhood_OldTown
0.943691625037




Neighborhood_NAmes
0.943719600037




MSZoning_RM
0.943800795346




Neighborhood_BrkSide
0.943868027102




BldgType_2fmCon
0.943882091966




Exterior1st_Plywood
0.943894044003




LotConfig_FR2
0.943904501668




Neighborhood_Edwards
0.943916473593




HouseStyle_SLvl
0.943923476493




SaleCondition_Family
0.943927579429




tract
0.94393045307




BsmtExposure_Gd
0.943947738255




BsmtExposure_Av
0.94397879704




ExterQual_TA
0.944001975866




OpenPorchSF
0.944034718873




VACANTHUBY
0.944046051912




Exterior2nd_Plywood
0.944058085771




TotRmsAbvGrd
0.944065222882




KitchenQual_Gd
0.944070349742




KeyboardInterrupt: 