In [14]:
# Execute the shared config notebook in this kernel. Names used below without a
# local import (Cortex, pd, np) presumably come from it -- TODO confirm.
%run config.ipynb

In [15]:
# Obtain the Cortex client; the dataset and experiment used below are both
# requested through it.
cortex = Cortex.client()
# NOTE(review): `builder` is not referenced again in the visible cells.
builder = cortex.builder()

In [16]:
# Reference the dataset named 'kaggle/ames-housing-train' through the client.
train_ds = cortex.dataset('kaggle/ames-housing-train')

In [17]:
# Fetch the dataset's 'features' pipeline and execute it. The run log printed
# below shows the 'clean' pipeline running first, followed by 'features'.
pipeline = train_ds.pipeline('features')
train_df = pipeline.run()

running pipeline [clean] for dataset [kaggle/ames-housing-train]:
> drop_unused 
> drop_outliers 
> fill_zero_cols 
> fill_median_cols 
> fill_na_none 
running pipeline [features] for dataset [kaggle/ames-housing-train]:
> scale_target 


In [18]:
# Set the target column aside now: the 'engineer' pipeline built in the next
# cell drops 'SalePrice' from the feature frame.
y = train_df['SalePrice']

In [19]:
def drop_target(pipeline, df):
    """Pipeline step: remove the 'SalePrice' target column from ``df`` in place.

    Args:
        pipeline: The pipeline invoking this step (unused here).
        df: pandas DataFrame containing a 'SalePrice' column.

    Returns:
        None. ``df`` itself is mutated, exactly as the original step did.

    Raises:
        KeyError: If ``df`` has no 'SalePrice' column.
    """
    # Select the axis with ``columns=``: passing ``axis`` positionally to
    # DataFrame.drop was deprecated in pandas 1.3 and is a TypeError in 2.0+.
    df.drop(columns='SalePrice', inplace=True)
    
def get_dummies(pipeline, df):
    """Pipeline step: return the result of ``pd.get_dummies`` applied to ``df``.

    Args:
        pipeline: The pipeline invoking this step (unused here).
        df: DataFrame to encode.

    Returns:
        The new DataFrame produced by ``pd.get_dummies``.
    """
    encoded_df = pd.get_dummies(df)
    return encoded_df

# Build an 'engineer' pipeline that depends on the 'features' pipeline. This
# rebinds `pipeline` (and, below, `train_df`) from the earlier 'features' cell.
# NOTE(review): clear_cache=True and reset() presumably discard cached results
# and previously registered steps so re-running this cell starts clean -- confirm.
pipeline = train_ds.pipeline('engineer', depends=['features'], clear_cache=True)
pipeline.reset()
# The run log below lists the steps in the order they are added here.
pipeline.add_step(drop_target)
pipeline.add_step(get_dummies)

# Run the feature engineering pipeline to prepare for model training
train_df = pipeline.run()
# train_df.shape is a 2-tuple, which fills both %d placeholders.
print('\nTrain shape: (%d, %d)' % train_df.shape)

running pipeline [clean] for dataset [kaggle/ames-housing-train]:
> drop_unused 
> drop_outliers 
> fill_zero_cols 
> fill_median_cols 
> fill_na_none 
running pipeline [features] for dataset [kaggle/ames-housing-train]:
> scale_target 
running pipeline [engineer] for dataset [kaggle/ames-housing-train]:
> drop_target 
> get_dummies 

Train shape: (1458, 303)


In [20]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [21]:
def train(x, y, **kwargs):
    """Fit and return a linear regression model of the requested flavour.

    Keyword Args:
        model_type: 'Lasso', 'Ridge' or 'ElasticNet' selects the matching
            cross-validated estimator. Any other value (including a missing
            one) selects plain ``LinearRegression``.
        alphas: Candidate regularisation strengths handed to the CV
            estimators. Defaults to ``[1, 0.1, 0.001, 0.0001]``. Not used by
            ``LinearRegression``.

    Args:
        x: Training features.
        y: Training targets.

    Returns:
        The estimator after ``fit(x, y)`` has been called on it.
    """
    alpha_grid = kwargs.get('alphas', [1, 0.1, 0.001, 0.0001])
    requested = kwargs.get('model_type')

    # Select algorithm: walk the regularised options and fall back to ordinary
    # least squares when none of the names matches.
    regularised = (
        ('Lasso', LassoCV),
        ('Ridge', RidgeCV),
        ('ElasticNet', ElasticNetCV),
    )
    for name, estimator_cls in regularised:
        if requested == name:
            model = estimator_cls(alphas=alpha_grid)
            break
    else:
        model = LinearRegression()

    # Train model
    model.fit(x, y)

    return model

In [22]:
def predict_and_score(model, x, y):
    """Predict with a fitted model and compute the RMSE against known targets.

    Args:
        model: Fitted estimator exposing ``predict``.
        x: Features passed to ``model.predict``.
        y: True target values to compare the predictions against.

    Returns:
        A two-item list ``[predictions, rmse]``.
    """
    predictions = model.predict(x)
    # scikit-learn documents the argument order as (y_true, y_pred). MSE is
    # symmetric, so the value is unchanged; this only aligns the call with the API.
    rmse = np.sqrt(mean_squared_error(y, predictions))
    return [predictions, rmse]

In [23]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is repeatable. `train_df` here is the output of the 'engineer' pipeline.
X_train, X_test, y_train, y_test = train_test_split(train_df, y.values, test_size=0.20, random_state=10)

In [27]:
%%time

# Track the best candidate seen so far across the model types tried below.
best_model = None
best_model_type = None
# Start from +inf so the first candidate always becomes the incumbent. With a
# fixed 1.0 starting value, a run in which every test RMSE was >= 1.0 would
# leave best_model as None and fail on best_model.score() further down.
best_rmse = float('inf')

exp = cortex.experiment('kaggle/ames-housing-regression')
# NOTE(review): reset is left disabled; the experiment table displayed later
# lists runs from earlier dates as well.
# exp.reset()
exp.set_pipeline(pipeline)
exp.set_meta('style', 'supervised')
exp.set_meta('function', 'regression')

with exp.start_run() as run:
    alphas = [1, 0.1, 0.001, 0.0005]
    # 'Linear' is not a name train() checks for; it falls through to train()'s
    # LinearRegression default.
    for model_type in ['Linear', 'Lasso', 'Ridge', 'ElasticNet']:
        print('---'*30)
        print('Training model using {} regression algorithm'.format(model_type))
        model = train(X_train, y_train, model_type=model_type, alphas=alphas)
        [predictions, rmse] = predict_and_score(model, X_train, y_train)
        print('Training error:', rmse)
        [predictions, rmse] = predict_and_score(model, X_test, y_test)
        print('Testing error:', rmse)

        # Keep the candidate with the lowest RMSE on the held-out split.
        # NOTE(review): the same split is used both to pick the winner and to
        # report r2/rmse, so the logged metrics are likely optimistic.
        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model
            best_model_type = model_type

    # Only the winning model's metrics, parameters and artifact are logged.
    r2 = best_model.score(X_test, y_test)
    run.log_metric('r2', r2)
    run.log_metric('rmse', best_rmse)
    run.log_param('model_type', best_model_type)
    run.log_param('alphas', alphas)
    run.log_artifact('model', best_model)

print('---'*30)

------------------------------------------------------------------------------------------
Training model using Linear regression algorithm
Training error: 0.08792096455489082
Testing error: 0.11715496123176918
------------------------------------------------------------------------------------------
Training model using Lasso regression algorithm
Training error: 0.09951434597883627
Testing error: 0.10838639184071473
------------------------------------------------------------------------------------------
Training model using Ridge regression algorithm
Training error: 0.08952814678982611
Testing error: 0.1108089661962949
------------------------------------------------------------------------------------------
Training model using ElasticNet regression algorithm
Training error: 0.09986249373851433
Testing error: 0.10851964458526744
------------------------------------------------------------------------------------------
CPU times: user 1.51 s, sys: 19.9 ms, total: 1.53 s
Wall time: 2

In [28]:
# Summarise the winner chosen in the experiment cell; relies on the
# best_model_type, best_rmse and r2 variables that cell leaves in the kernel.
print('Best model: ' + best_model_type)
print('Best testing error: %.6f' % best_rmse)
print('R2 score: %.6f' % r2)

Best model: Lasso
Best testing error: 0.108386
R2 score: 0.920696


In [29]:
# Show the experiment's runs; the table below has one row per run with its
# logged params (alphas, model_type) and metrics (r2, rmse).
exp.display()

ID,Date,Took,Params,Params,Metrics,Metrics
ID,Date,Took,alphas,model_type,r2,rmse
yr74ahb,"Sun, 19 Aug 2018 18:08:34 GMT",1.87 s,"[1, 0.1, 0.001, 0.0001]",ElasticNet,0.920501,0.10852
pd94aen,"Sun, 19 Aug 2018 18:08:58 GMT",1.40 s,"[1, 0.1, 0.001, 0.0005]",Lasso,0.920696,0.108386
r80eaae,"Sun, 19 Aug 2018 18:11:42 GMT",1.41 s,"[1, 0.1, 0.001, 0.0005]",Lasso,0.920696,0.108386
1m4eafp,"Mon, 20 Aug 2018 22:48:56 GMT",2.01 s,"[1, 0.1, 0.001, 0.0001]",ElasticNet,0.920501,0.10852
h79eaf2,"Mon, 20 Aug 2018 22:49:39 GMT",1.47 s,"[1, 0.1, 0.001, 0.0005]",Lasso,0.920696,0.108386
