In [1]:
from featureSelection import featureSelector
import pandas as pd
import numpy as np
import matplotlib
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [2]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Computes ``sqrt(mean((log(1 + y_pred) - log(1 + y_true)) ** 2))``.

    Both inputs are flattened to 1-D before they are compared. Without this,
    a column-vector target of shape ``(n, 1)`` (e.g. ``DataFrame[['col']].values``)
    combined with flat predictions of shape ``(n,)`` would broadcast to an
    ``(n, n)`` difference matrix and silently yield a wrong score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. Values must be greater than -1 for the
        logarithm to be defined.
    y_pred : array-like
        Predicted values, containing the same number of elements as ``y_true``.

    Returns
    -------
    numpy.float64
        The RMSLE over all elements (lower is better).

    Raises
    ------
    AssertionError
        If ``y_true`` and ``y_pred`` do not hold the same number of values.
    """
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()

    # Raised explicitly (rather than via `assert`) so the check still runs
    # under `python -O`; the exception type matches the original assert.
    if predicted.size != actual.size:
        raise AssertionError(
            'y_true and y_pred must contain the same number of values '
            '(got %d and %d)' % (actual.size, predicted.size)
        )

    # log1p(x) == log(1 + x), but stays accurate for x close to zero.
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(log_diff ** 2))

In [3]:
# Load the raw train/test tables; the first CSV column is used as the index.
train = pd.read_csv('data/train.csv', index_col=0)
test = pd.read_csv('data/test.csv', index_col=0)

# Separate the target (kept as a one-column DataFrame) from the features.
y_train = train[['SalePrice']]
train = train.drop('SalePrice', axis=1)

In [4]:
# Keep only float64/int64 feature columns, then discard every row that has a
# missing value in any of them.
train = (
    train
    .loc[:, lambda frame: (frame.dtypes == np.float64) | (frame.dtypes == np.int64)]
    .dropna(axis=0, how='any')
)

# Restrict the test set to the same columns and drop its incomplete rows too;
# the index is renumbered from zero.
test = test[train.columns].dropna(axis=0, how='any').reset_index(drop=True)

# Align the target with the surviving training rows (by their original index
# labels) before both are renumbered from zero.
y_train = y_train.loc[train.index].reset_index(drop=True)
train = train.reset_index(drop=True)

In [5]:
# Preview the first five rows of the cleaned, numeric-only training features.
train.head(n=5)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,548,0,61,0,0,0,0,0,2,2008
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,460,298,0,0,0,0,0,0,5,2007
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,608,0,42,0,0,0,0,0,9,2008
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,642,0,35,272,0,0,0,0,2,2006
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,836,192,84,0,0,0,0,0,12,2008


In [6]:
# Standardise the features: the scaler is fitted on the training set only and
# the same fitted transformation is then applied to both train and test.
columns = train.columns
scaler = StandardScaler()
scaler.fit(train)
train = pd.DataFrame(scaler.transform(train), columns=columns)
test = pd.DataFrame(scaler.transform(test), columns=columns)

# Run the project-local feature selector with a Ridge estimator, scored by
# `rmsle` over 3 unshuffled K-fold splits.
# NOTE(review): `normalize=True` was deprecated and later removed from
# scikit-learn's Ridge; left unchanged here to keep the recorded results
# reproducible on the version this notebook was run with -- confirm the
# pinned scikit-learn version.
selector = featureSelector(
    Ridge(alpha=1, fit_intercept=True, normalize=True),
    rmsle,
    cv=KFold(n_splits=3),
    prcnt=0.5,
    to_keep=1,
    tol=0.01,
    mode='reg',
    verbose=True,
)
selector.fit(train, y_train.values)

36  :  0.12623052374
18  :  0.132895443497
9  :  0.146743574297
4  :  0.155693384822
2  :  0.180793528687


In [7]:
# Print the key of the best-scoring feature set, then display that set's
# contents as the cell output.
best_key = selector.best
print(best_key)
selector.feature_sets[best_key]

1


[('OverallQual', 0.094097388324786674),
 ('GrLivArea', 0.076910746275902261),
 ('GarageCars', 0.068448774697295281),
 ('TotalBsmtSF', 0.065957350033691589),
 ('1stFlrSF', 0.064238473200760537),
 ('GarageArea', 0.06353750952663989),
 ('TotRmsAbvGrd', 0.057567718953557445),
 ('YearRemodAdd', 0.05742931973418907),
 ('FullBath', 0.056809924304683485),
 ('YearBuilt', 0.054813848854967623),
 ('MasVnrArea', 0.053413343765391454),
 ('Fireplaces', 0.051416496597026212),
 ('GarageYrBlt', 0.049968721462559443),
 ('BsmtFinSF1', 0.044639844817460445),
 ('WoodDeckSF', 0.035946445284416297),
 ('2ndFlrSF', 0.035307181500069278),
 ('LotArea', 0.035084944257308709),
 ('OpenPorchSF', 0.034411968409294266)]