In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tpot import TPOTRegressor

In [64]:
def rmserror_log(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [2]:
train = pd.read_csv('houses/clean_train.csv')
test = pd.read_csv('houses/clean_test.csv')
test_ID = test['Id']

del train['Id']
del test['Id']
train["SalePrice"] = np.log1p(train["SalePrice"])

In [None]:
tpot = TPOTRegressor(generations=100, population_size=100, verbosity=2, 
                     n_jobs=1, scoring=rmserror_log)
tpot.fit(train.drop('SalePrice', axis=1).values, train['SalePrice'])

Optimization Progress:  34%|███▎      | 101/300 [10:55<10:39,  3.21s/pipeline] 

Generation 1 - Current best internal CV score: 0.38439377171626965


Optimization Progress:  51%|█████     | 153/300 [24:25<02:07,  1.16pipeline/s]  

Generation 2 - Current best internal CV score: 0.38439377171626965


Optimization Progress:  68%|██████▊   | 203/300 [27:37<04:17,  2.66s/pipeline]

Generation 3 - Current best internal CV score: 0.38439377171626965


Optimization Progress:  71%|███████   | 213/300 [28:10<05:42,  3.94s/pipeline]

In [7]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Normalizer
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator

exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=True, max_features=1.0, min_samples_leaf=6, min_samples_split=12, n_estimators=100)),
    Normalizer(norm="l1"),
    StackingEstimator(estimator=GradientBoostingRegressor(alpha=0.8, learning_rate=0.01, loss="quantile", max_depth=10, max_features=0.3, min_samples_leaf=5, min_samples_split=18, n_estimators=100, subsample=0.05)),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=True, max_features=0.2, min_samples_leaf=6, min_samples_split=8, n_estimators=100)),
    SelectFromModel(estimator=ExtraTreesRegressor(max_features=0.55, n_estimators=100), threshold=0.4),
    StackingEstimator(estimator=LinearSVR(C=25.0, dual=True, epsilon=0.0001, loss="epsilon_insensitive", tol=0.1)),
    Normalizer(norm="max"),
    ExtraTreesRegressor(bootstrap=False, max_features=0.15000000000000002, min_samples_leaf=10, min_samples_split=18, n_estimators=100)
)
exported_pipeline.fit(train.drop('SalePrice', axis=1).values, train['SalePrice'])

Pipeline(memory=None,
     steps=[('stackingestimator-1', StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=None,
          max_features=1.0, max_leaf_nodes=None, min_impurity_decrease=0.0,
          min_impurity_split=None, min_samples_leaf=6,
          min_samples_split=12, min_weigh...timators=100, n_jobs=1,
          oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [9]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from tpot.builtins import StackingEstimator


# Score on the training set was:-0.3618396735340909
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=GradientBoostingRegressor(alpha=0.75, learning_rate=0.01, loss="ls", max_depth=3, max_features=0.6500000000000001, min_samples_leaf=3, min_samples_split=7, n_estimators=100, subsample=0.8)),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=True, max_features=0.7500000000000001, min_samples_leaf=14, min_samples_split=3, n_estimators=100)),
    StandardScaler(),
    SelectFromModel(estimator=ExtraTreesRegressor(max_features=0.9000000000000001, n_estimators=100), threshold=0.1),
    PCA(iterated_power=4, svd_solver="randomized"),
    ExtraTreesRegressor(bootstrap=False, max_features=0.15000000000000002, min_samples_leaf=10, min_samples_split=6, n_estimators=100)
)
exported_pipeline.fit(train.drop('SalePrice', axis=1).values, train['SalePrice'])

Pipeline(memory=None,
     steps=[('stackingestimator-1', StackingEstimator(estimator=GradientBoostingRegressor(alpha=0.75, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=3,
             max_features=0.6500000000000001, max_leaf_nodes=None,
             min_impurity_decrease=0.0, mi...timators=100, n_jobs=1,
          oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [10]:
predictions = np.expm1(exported_pipeline.predict(test))
result_df = pd.DataFrame({'Id': test_ID, 'SalePrice': predictions})
result_df.to_csv('houses/predictions.csv', index=False)

In [32]:
predictions = np.expm1(tpot.predict(test))
result_df = pd.DataFrame({'Id': test_ID, 'SalePrice': predictions})
result_df.to_csv('houses/predictions.csv', index=False)