In [1]:
from config import *
from utils import *
from model import HousePriceModel


"""House Price Predicting

This Code Implement Ensemble Learning Capability, which means it can train on several (different) models, 
and integrate all the outputs as final outpus.

Arguments are names of models, like xgb, nn. Configurations for models are set Default.

"""

if __name__ == '__main__':
    train_data = pd.read_csv(PREPROCESSED_TRAINING_DATA_PATH)
    train_data = train_data.drop(['Unnamed: 0'], axis=1)
    train_data["SalePrice"] = np.log1p(train_data["SalePrice"])
    test_data = pd.read_csv(PREPROCESSED_TEST_DATA_PATH)
    test_data = test_data.drop(['Unnamed: 0'], axis=1)

    args = sys.argv[1:]

    CLFs = HousePriceModel()

    ensemble_models = []
    """
    ensemble_models contains a set of models, to perform ensemble learning on several models.

    It contains a List of models, for each #element:
        [0. representation name,
        1. model name,
        2. [model config(dict), training config (dict)]

        Sample cofig: (No specification is allowed, which means using default values)
        
            !!! Check those PARAMETERS on Sklearn Package !!!
        
            I. model config
                a) xgb
                    {
                    'learning_rate': learning_rate,
                    'n_estimators': n_estimators,
                    'min_child_weight': min_child_weight,
                    'booster': booster
                    }
                b) nn
                    {
                    'hidden_layer_sizes': hidden_layer_sizes,
                    'activation': activation,
                    'alpha': alpha,
                    'learning_rate': learning_rate
                    }

            II. training config (dict)
                a) xgb
                    {
                    'num_features': num_features,
                    'split': split,
                    'eval_set': eval_set,
                    'eval_metric': eval_metric,
                    'verbose': verbose,
                    'xgb_model': xgb_model
                    }
                b) nn
                    {
                    'num_features': num_features,
                    'split': split,
                    }
    """
#     if len(args) > 0:
#         for arg in args:
#             ensemble_models.append([arg, {}, {}])
#     else:
    grid_params = {
            'xgb':{'learning_rate':[0.2, 0.1, 0.05, 0.02],
                   'n_estimators':[100, 200, 300, 400, 500],
                   'min_child_weight':[3, 4, 5, 6],
                   'booster':['gbtree', 'gblinear', 'dart']},
            'nn':{'hidden_layer_sizes':[(16,16)],
                  'activation':['identity', 'relu','logistic'],
                  'alpha':[0.0001, 0.0005, 0.001],
                  'learning_rate':['adaptive', 'invscaling'],
                  'max_iter':[200, 500, 1000]},
            'svr':{'degree':[2, 3, 4],
                   'kernel': ['rbf', 'poly', 'linear'],
                   'gamma': [1e-3, 1e-4, 1e-5],
                   'C': [1.0, 0.5, 2],
                   'coef0':[0.0, 0.1, 0.2],
                   'tol':[1e-3, 1e-4, 5e-4, 2e-3]},
            'randF':{'n_estimators':[5, 10, 15, 20],
                     'criterion':['mse'],
                     'max_depth':[None, 10, 15, 20],
                     'min_samples_split':[2, 5, 8, 10],
                     'min_samples_leaf':[1, 3, 5]},
            'bagging':{'base_estimator':[None],
                       'n_estimators':[3, 7, 10, 13],
                       'max_samples':[0.5, 0.8, 1.0],
                       'max_features':[0.5, 0.8, 1.0],
                       'bootstrap':[True, False]},
            'logistic':{'penalty':['l2'],
                        'dual':[False],
                        'tol':[1e-4, 1e-5, 5e-5, 5e-4, 1e-3],
                        'C':[0.2, 0.5, 1.0, 1.5]},
            'dt':{'criterion':['mse'],
                  'splitter':['best', 'random'],
                  'max_depth':[None, 5, 8, 10, 12, 15],
                  'min_samples_split':[2, 5, 10, 15],
                  'min_samples_leaf':[1, 2, 4, 8],
                  'min_weight_fraction_leaf':[0., 0.01, 0.02, 0.05],
                 'min_impurity_decrease':[1e-7, 1e-6, 1e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2]}
        }

    m = train_data.shape[0]
    y_train = np.reshape(train_data['SalePrice'], newshape=(m,))
    train_output_list = []
    test_output_list = []

    reps_list = ['dt']
    for t in reps_list:
        print("== Grid {} Parameters ...".format(t))
        # Create Dummy estimator
        dummy_model = CLFs.add_model(t, name='dummy', config=[{}, {'split': 0}])
        X_, y_ = CLFs.get_Xy(train_data, representation_name=t, name=dummy_model)
        # GridSearch
        best_model, best_params = grid(CLFs.models[t][dummy_model]['model'], grid_params[t], X_, y_)
        print(best_params)
        np.save('{}_cv_results.npy'.format(t), best_model.cv_results_)
        # Add Best Model
        best_xgb = CLFs.add_model(t,name='best',config=[best_params, {}])
        # Fit Best Model
        CLFs.fit(t, dataFrame=train_data, name=best_xgb)
        # Evaluate on test set
        X_test = CLFs.get_Xy(test_data, representation_name=t, name='best', bool_train=False)
        test_output = CLFs.predict(representation_name=t, X=X_test, name='best')
        test_output_list.append(test_output)
        
    final_output = np.exp(CLFs.ensemble_outputs(test_output_list))

    print('== Fill in submission ...')
    # Fill submission.csv
    submission = pd.read_csv(SUBMISSION_PATH)
    CLFs.fill_submission(final_output, submission)
    # Delete dummy columns
    sub_cols_all = submission.columns
    sub_cols = ['Id', 'SalePrice']
    for c in sub_cols_all:
        if c not in sub_cols:
            submission = submission.drop(columns=c)
    # Save submission file
    submission.to_csv(SUBMISSION_PATH, index=False)

    print("== Process Successed!")

  return f(*args, **kwds)


== Grid dt Parameters ...
Fitting 5 folds for each of 6144 candidates, totalling 30720 fits


[Parallel(n_jobs=-1)]: Done 2196 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 13896 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 30720 out of 30720 | elapsed:   27.8s finished


	-- Test MSE:0.017473682750651687
{'criterion': 'mse', 'max_depth': 15, 'min_impurity_decrease': 1e-07, 'min_samples_leaf': 4, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'splitter': 'random'}
	-- Validation MSE of Model--best: 0.03270600649950598
== Fill in submission ...
== Process Successed!


In [2]:
np.save('dt_output.npy', final_output)

In [5]:
best_model_xgb.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_criterion', 'param_max_depth', 'param_min_samples_leaf', 'param_min_samples_split', 'param_min_weight_fraction_leaf', 'param_splitter', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [10]:
for k in  grid_params['dt'].keys():
    try:
        print(best_model_xgb.cv_results_['param_{}'.format(k)])
    except:
        print('No such param {}'.format(k))

['mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse'
 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse'
 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse'
 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse'
 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse'
 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse'
 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse'
 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse'
 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse'
 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse'
 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse'
 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse'
 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse'
 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'mse' 'm