In [1]:
%run config.ipynb

In [2]:
# Create the Cortex client. `Cortex` is not imported anywhere in the visible
# cells, so it is presumably brought into scope by `%run config.ipynb` — confirm.
cortex = Cortex.client()
# Builder handle obtained from the client; it is not referenced again in the
# visible cells.
builder = cortex.builder()

In [3]:
# Look up the Ames housing training dataset by its Cortex dataset name.
train_ds = cortex.dataset('kaggle/ames-housing-train')

In [4]:
# Fetch the dataset's 'features' pipeline and execute it. Per the logged output,
# running it first executes the [clean] pipeline and then the [features] step
# `scale_target`.
pipeline = train_ds.pipeline('features')
# NOTE(review): the result is presumably a pandas DataFrame (it is indexed by
# column name in the next cell) — confirm against the Cortex pipeline API.
train_df = pipeline.run()

running pipeline [clean] for dataset [kaggle/ames-housing-train]:
> drop_unused 
> drop_outliers 
> fill_zero_cols 
> fill_median_cols 
> fill_na_none 
running pipeline [features] for dataset [kaggle/ames-housing-train]:
> scale_target 


In [5]:
# Set the target column aside before the 'engineer' pipeline drops it from the
# feature frame. The [features] log shows a `scale_target` step, so these values
# are presumably already transformed rather than raw sale prices — confirm.
y = train_df['SalePrice']

In [6]:
def drop_target(pipeline, df):
    """Pipeline step: remove the 'SalePrice' target column from ``df`` in place.

    Args:
        pipeline: The pipeline invoking this step; not used here.
        df: DataFrame to modify. Must contain a 'SalePrice' column.

    Returns:
        None. ``df`` itself is mutated, exactly as the original step did.

    Raises:
        KeyError: If ``df`` has no 'SalePrice' column.
    """
    # Name the axis via ``columns=``: the positional form ``drop(label, 1)`` was
    # deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
    df.drop(columns='SalePrice', inplace=True)
    
def get_dummies(pipeline, df):
    """Pipeline step: one-hot encode ``df`` with ``pd.get_dummies``.

    ``pipeline`` is accepted but not used. The encoded frame produced by pandas
    is returned as the step's result.
    """
    encoded = pd.get_dummies(df)
    return encoded

# Define an 'engineer' pipeline that depends on 'features'. `clear_cache=True`
# and `reset()` are presumably there so that re-running this cell starts from a
# clean pipeline with no previously added steps — confirm against the Cortex API.
pipeline = train_ds.pipeline('engineer', depends=['features'], clear_cache=True)
pipeline.reset()
# Per the logged output the steps run in the order added: drop_target, then get_dummies.
pipeline.add_step(drop_target).add_step(get_dummies)

# NOTE: this rebinds `pipeline` and `train_df` from the earlier 'features' cell,
# so `y` must already have been captured from the previous `train_df`.
train_df = pipeline.run()
print('\nTrain shape: (%d, %d)' % train_df.shape)

running pipeline [clean] for dataset [kaggle/ames-housing-train]:
> drop_unused 
> drop_outliers 
> fill_zero_cols 
> fill_median_cols 
> fill_na_none 
running pipeline [features] for dataset [kaggle/ames-housing-train]:
> scale_target 
running pipeline [engineer] for dataset [kaggle/ames-housing-train]:
> drop_target 
> get_dummies 

Train shape: (1458, 303)


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [8]:
def train(x, y, **kwargs):
    """Fit a linear-family regressor on ``x``/``y`` and return the fitted model.

    Args:
        x: Training features, passed straight to the estimator's ``fit``.
        y: Training targets, passed straight to the estimator's ``fit``.

    Keyword Args:
        model_type: ``'Lasso'`` selects ``LassoCV`` and ``'Ridge'`` selects
            ``RidgeCV``. Any other value (including ``'Linear'`` or omitting the
            argument) falls back to plain ``LinearRegression``. Matching is
            exact and case-sensitive.
        alphas: Optional candidate regularisation strengths for the two
            cross-validated models. When omitted or ``None`` the grid that used
            to be hard-coded here is used, so existing callers are unaffected.

    Returns:
        The estimator after ``fit(x, y)`` has been called on it.
    """
    alphas = kwargs.get('alphas')
    if alphas is None:
        # NOTE(review): the default grid jumps from 0.1 to 0.001 (no 0.01) —
        # confirm that the gap is intentional.
        alphas = [1, 0.1, 0.001, 0.0001]

    mtype = kwargs.get('model_type')
    if mtype == 'Lasso':
        model = LassoCV(alphas=alphas)
    elif mtype == 'Ridge':
        model = RidgeCV(alphas=alphas)
    else:
        # Unrecognised or missing model_type: unregularised least squares.
        model = LinearRegression()

    model.fit(x, y)
    return model

In [9]:
def predict(model, x, y):
    """Predict on ``x`` with ``model`` and score the predictions against ``y``.

    Args:
        model: Fitted estimator exposing ``predict``.
        x: Feature rows to predict on.
        y: Ground-truth targets for the rows of ``x``.

    Returns:
        list: ``[predictions, rmse]`` where ``rmse`` is the square root of the
        mean squared error between ``y`` and the predictions.
    """
    predictions = model.predict(x)
    # scikit-learn documents the order as mean_squared_error(y_true, y_pred).
    # MSE is symmetric, so the value is unchanged; the call now simply reads
    # the way the API is documented.
    rmse = np.sqrt(mean_squared_error(y, predictions))
    return [predictions, rmse]

In [10]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): this assumes the rows of `train_df` (from the 'engineer' run)
# still line up one-to-one with `y` (taken from the earlier 'features' run) — confirm.
X_train, X_test, y_train, y_test = train_test_split(train_df, y.values, test_size=0.20, random_state=10)

In [11]:
# Track the model with the lowest hold-out RMSE. The running best starts at
# +inf so the first trained model is always accepted; a finite sentinel such as
# 100.0 would leave `best_model` as None whenever every RMSE exceeded it
# (for example with an unscaled target).
best_model = None
best_model_type = None
best_score = float('inf')

# NOTE(review): the winner is chosen on the same test split that is later used
# to report its score, so the reported figures are likely optimistic — consider
# a separate validation split.
for model_type in ['Linear','Lasso','Ridge']:
    print('---'*30)
    print('Training model using {} regression algorithm'.format(model_type))
    model = train(X_train, y_train, model_type=model_type)
    # Error on the rows the model was fitted on (reported only).
    [predictions, rmse] = predict(model, X_train, y_train)
    print('Training error:', rmse)
    # Error on the held-out rows; this is the value used for model selection.
    [predictions, rmse] = predict(model, X_test, y_test)
    print('Testing error:', rmse)
    if rmse < best_score:
        best_score = rmse
        best_model = model
        best_model_type = model_type

------------------------------------------------------------------------------------------
Training model using Linear regression algorithm
Training error: 0.08792096455489082
Testing error: 0.11715496123176918
------------------------------------------------------------------------------------------
Training model using Lasso regression algorithm
Training error: 0.10474725124109076
Testing error: 0.11210731416333446
------------------------------------------------------------------------------------------
Training model using Ridge regression algorithm
Training error: 0.08952814678982611
Testing error: 0.1108089661962949


In [12]:
# Score the selected model on the hold-out split; for scikit-learn regressors
# `score` is the coefficient of determination (R^2).
r2 = best_model.score(X_test, y_test)

print('Best model: %s' % best_model_type)
# `best_score` is the hold-out (testing) RMSE picked in the model-selection
# loop, not a training error, so label it accordingly.
print('Best testing error: %.6f' % best_score)
print('Best R2 score: %.6f' % r2)

Best model: Ridge
Best training error: 0.110809
Best R2 score: 0.917111
