In [1]:
from mlens.ensemble import SuperLearner
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.stats import uniform, randint
from mlens.preprocessing import EnsembleTransformer
from mlens.metrics import make_scorer
from mlens.model_selection import Evaluator
from sklearn.base import BaseEstimator
from model_super import *
from models import *
from ensembler import *
from parameters import *
from data_processing import *
from gridSearch import *
import os.path as path
import sys
import matplotlib.pyplot as plt

#results of iteration 1 (all models):  example.csv for params, 0.052445
#results of iteration 2 (three models):  iteration2.csv for params, 0.052109
#iteration 5 params gives 40th percentile on Kaggle -- best ensembling score yet

# TODO:  Analyze different base models with gridsearch (RandomForest, Adaboost, Neural Networks, DecisionTree, Lasso)
# TODO:  remember to scale neural network training data before feeding it

############################################## Custom Catboost Class ###################################################

class MultiCatBoost(BaseEstimator):

    def __init__(self, parameters, cat_feature_inds):
        self.cat_feature_inds = cat_feature_inds
        self.models = []
        self.parameters = parameters
        for i in range(5):
            self.models.append(CatBoostRegressor(**self.parameters, random_seed=i))

    def fit(self, x_train, y_train):
        for i in range(5):
            self.models[i].fit(x_train, y_train, cat_features=self.cat_feature_inds)
        return self

    def predict(self, x_test):
        result = 0.0
        for model in self.models:
            print("predicting on catboost")
            result += model.predict(x_test, verbose=True)
        result /= 5
        return result


class MultiXGBoost(BaseEstimator):

    def __init__(self, parameters):
        self.models = []
        self.parameters = parameters
        for i in range(5):
            self.models.append(XGBRegressor(**self.parameters, random_state=i))

    def fit(self, x_train, y_train):
        for i in range(5):
            self.models[i].fit(x_train, y_train)
        return self

    def predict(self, x_test):
        result = 0.0
        for model in self.models:
            print("predicting on xgboost")
            result += model.predict(x_test)
        result /= 5
        return result


class MultiLightGBM(BaseEstimator):

    def __init__(self, parameters):
        self.models = []
        self.parameters = parameters
        for i in range(5):
            self.models.append(LGBMRegressor(**self.parameters, random_state=i))

    def fit(self, x_train, y_train):
        for i in range(5):
            self.models[i].fit(x_train, y_train)
        return self

    def predict(self, x_test):
        result = 0.0
        for model in self.models:
            print("predicting on lightgbm")
            result += model.predict(x_test)
        result /= 5
        return result

############################################## Helper methods ##########################################################

#Performs gridsearch on the "meta-learners" which predict on the first layer predictions
def evaluateSecondLayer(base_learners, x_train, y_train, meta_learners, param_dicts):
    in_layer = EnsembleTransformer()
    print("adding base learners to transformer")
    in_layer.add('stack', base_learners)

    preprocess = [in_layer]
    print("creating scorer")
    scorer = make_scorer(mean_absolute_error, greater_is_better=False)
    evl = Evaluator(scorer, cv=4, verbose=1)
    print("fitting evaluator")
    evl.fit(x_train.values,
        y_train.values,
        meta_learners,
        param_dicts,
        preprocessing={'meta': preprocess},
        n_iter=30                            # bump this up to do a larger grid search
       )

    table = pd.DataFrame(evl.summary)
    table.to_html('HyperCatboost.html')
    table.to_csv('HypterCatboost.csv', index=False, header=False, sep='\t')


#Adds features to the dataset
def add_date_features(df):
    df["transaction_year"] = df["transactiondate"].dt.year
    df["transaction_month"] = df["transactiondate"].dt.month
    df["transaction_day"] = df["transactiondate"].dt.day
    df["transaction_quarter"] = df["transactiondate"].dt.quarter
    df.drop(["transactiondate"], inplace=True, axis=1)
    return df

def plot_learning_curve(estimator, x_train, y_train):
    title = "XGBoost as Second Layer Predictor"
    plt.figure()
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    print("calculating learning curve values")
    train_sizes, train_scores, test_scores = learning_curve(estimator, x_train, y_train, n_jobs=-1, 
                scoring = 'neg_mean_absolute_error', cv=4)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    
    plt.legend(loc="best")
    print('showing plot')
    plt.show()
    print(test_scores_mean)

def plot_validation_curve(estimator, x_train, y_train, param_name, param_range):
    print('computing validation curve values')
    train_scores, test_scores = validation_curve(estimator, x_train, y_train, param_name=param_name, 
                param_range=param_range, scoring='neg_mean_absolute_error', n_jobs=-1, cv=4)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.title("XGBoost colsample analysis")
    plt.xlabel("colsample")
    plt.ylabel("Score")
    lw = 2
    plt.plot(param_range, train_scores_mean, 'r+', label='Train')
    plt.plot(param_range, test_scores_mean, 'g+', label='Test')
    plt.legend(loc="best")
    plt.show()
    
    print(test_scores_mean)

########################################### LOADING DATA ##############################################################

dir_path = path.abspath(path.join('__file__',"../../.."))
train_path = dir_path + '/train_2016_v2.csv'
test_path = dir_path + '/submission.csv'
properties_path = dir_path + '/properties_2016.csv'
#first_layer_predictions_file = dir_path + '/predictions_first_layer.csv'

print(train_path)

train_df = pd.read_csv(train_path, parse_dates=['transactiondate'], low_memory=False)
test_df = pd.read_csv(test_path, low_memory=False)
properties = pd.read_csv(properties_path, low_memory=False)
# field is named differently in submission
test_df['parcelid'] = test_df['ParcelId']

########################################## PROCESSING DATA ############################################################

train_df = add_date_features(train_df)
train_df = train_df.merge(properties, how='left', on='parcelid')
test_df = test_df.merge(properties, how='left', on='parcelid')

#Identify columns with many missing values and store them into a variable
exclude_missing = missingValueColumns(train_df)

# Identify columns with only one unique value and store them into a variable
exclude_unique = nonUniqueColumns(train_df)

#Identify columns that we will use for training and store them into a variable
train_features = trainingColumns(train_df, exclude_missing, exclude_unique)

#Identify categorical columns
cat_feature_inds = categoricalColumns(train_df, train_features)

# Handle NA values
train_df.fillna(-1, inplace=True)
test_df.fillna(-1, inplace=True)

#Disregard outliers
train_df = train_df[train_df.logerror > -0.4]
train_df = train_df[train_df.logerror < 0.4]

#Initialize training datasets
x_train = train_df[train_features]
y_train = train_df.logerror

#Handle types so training does not throw errors
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

#Set up test dataset
test_df['transactiondate'] = pd.Timestamp('2016-12-01')  # Dummy
test_df = add_date_features(test_df)
X_test = test_df[train_features]

#Handle types so testing does not throw errors
for c in X_test.dtypes[X_test.dtypes == object].index.values:
    X_test[c] = (X_test[c] == True)

first_layer_results = pd.read_csv('predictions_first_layer.csv')
for c in first_layer_results.dtypes[first_layer_results.dtypes == object].index.values:
    first_layer_results[c] = (first_layer_results[c] == True)    

#x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=27000)

#run grid search on base models to hypertune parameters
#base_model_gridSearch(x_train, y_train)

############################################# Graph Learning Curve ####################################################

#estimator = Ridge(**ridge_params)
#estimator = XGBRegressor(**xgb_params_2)
#plot_learning_curve(estimator, first_layer_results, y_train)

############################################ Graph Validation Curve ###################################################

#value_range = [2, 3, 4, 5]
#plot_validation_curve(estimator, first_layer_results, y_train,'max_depth', value_range)

#######################################################################################################################

#GridSearch


#xgb_mod = MultiXGBoost(getXGBParams(y_train))
#lgbm_mod = MultiLightGBM(lightGBM_params)
#cat_mod = MultiCatBoost(catboost_params, cat_feature_inds)
#models = [xgb_mod, lgbm_mod, cat_mod]


def generate_first_layer_predictions():
    x_train = np.array(x_train)
    y_train = np.array(y_train)

    kf = KFold(n_splits=4)
    folds = list(kf.split(x_train, y_train))

    first_layer_train_predictions = np.zeros((x_train.shape[0], len(models)))

    #train first layer
    for i in range(len(models)):
        print("training baseline model")
        for j, (train_idx, test_idx) in enumerate(folds):
            x_train_fold = x_train[train_idx]
            y_train_fold = y_train[train_idx]
            x_holdout_fold = x_train[test_idx]
            y_holdout_fold = y_train[test_idx]
            models[i].fit(x_train_fold, y_train_fold)
            first_layer_train_predictions[test_idx, i] = models[i].predict(x_holdout_fold)

    print("first layer train predictions: ")
    print(first_layer_train_predictions)
    print("shape: ")
    print(first_layer_train_predictions.shape)
    print("building csv")
    np.savetxt("predictions_first_layer.csv", first_layer_train_predictions, delimiter=",")


"""
xgb_mod = XGBRegressor(**xgb_params_2)
ridge = Ridge()
lasso = Lasso()

base_learners = [('xgb_2', xgb_mod), ('Ridge', ridge), ('Lasso', lasso)]

param_dicts = {'xgb_2': 
               {'learning_rate': uniform(0.01, 0.12),
                    'subsample': uniform(0.8, 0.2),
                    'reg_lambda': uniform(0.4, 5),
                    'max_depth': randint(2, 5),
                    'reg_alpha': uniform(0.1, 4),
                    'n_estimators': randint(50, 400),
                    'colsample_bytree': uniform(0.8, 0.2)},
               'Ridge':
                  {'alpha': uniform(0.5, 15),
                   'max_iter': randint(1000, 12000),
                   'tol': uniform(0.00003, 0.0003),
                   },
               'Lasso':
                   {'alpha': uniform(0.5, 15),
                    'max_iter': randint(1000, 12000),
                    'tol': uniform(0.0003, 0.002),
                  }
              }

scorer = make_scorer(mean_absolute_error, greater_is_better=False)

evl = Evaluator(scorer,cv=4,verbose=True)

first_layer_results = pd.read_csv('predictions_first_layer.csv')
for c in first_layer_results.dtypes[first_layer_results.dtypes == object].index.values:
    first_layer_results[c] = (first_layer_results[c] == True)

print("shape: ")
print(first_layer_results.shape)


print('fitting')
evl.fit(first_layer_results.values,  
            y_train.values,
            estimators=base_learners,
            param_dicts=param_dicts,
            n_iter=100)  # bump this up to do a larger grid search

table = pd.DataFrame(evl.summary)
table.to_html('XGBRegressor_2.html')
table.to_csv('XGBRegressor_2.csv', index=False, header=False, sep='\t') """
                
"""
param_dicts = {'Ridge':
                  {'alpha': uniform(0.5, 15),
                   'max_iter': randint(1000, 12000),
                   'tol': uniform(0.00003, 0.0003),
                   },
               'Lasso':
                   {'alpha': uniform(0.5, 15),
                    'max_iter': randint(1000, 12000),
                    'tol': uniform(0.0003, 0.002),
                  }
              } """

#evaluateSecondLayer(base_learners, x_train, y_train, meta_learners, param_dicts)


########################################## Create and Train Ensembler ##################################################



ensemble = SuperLearner(folds=4)

print("adding baseline models to ensembler")

ensemble.add([MultiXGBoost(getXGBParams(y_train)), MultiLightGBM(lightGBM_params),
              MultiCatBoost(catboost_params, cat_feature_inds)])

#ensemble.add_meta(XGBRegressor(**xgb_params_2))
ensemble.add_meta(Ridge(**ridge_params))

print("training ensembler")
ensemble.fit(x_train, y_train)

######################################### PREDICTING ON ENSEMBLE #######################################################

print("predicting on ensembler")
preds = ensemble.predict(X_test)


""""#Validation prediction:

preds = ensemble.predict(x_val)
accuracy = mean_absolute_error(y_val, preds)
print('validation accuracy: ')
print(accuracy) """

######################################### BUILDING KAGGLE SUBMISSION ###################################################


print("building prediction submission: ")
sub = pd.read_csv(test_path)
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = preds

print('Writing csv ...')
sub.to_csv('kaggle_submission.csv', index=False, float_format='%.4f')

[MLENS] backend: threading


/home/joseph/kaggle/zillow/train_2016_v2.csv
We exclude: ['architecturalstyletypeid', 'basementsqft', 'buildingclasstypeid', 'decktypeid', 'finishedsquarefeet13', 'finishedsquarefeet6', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'storytypeid', 'typeconstructiontypeid', 'yardbuildingsqft26', 'fireplaceflag', 'taxdelinquencyflag', 'taxdelinquencyyear']
15
We exclude: ['transaction_year', 'buildingclasstypeid', 'decktypeid', 'hashottuborspa', 'poolcnt', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'fireplaceflag', 'assessmentyear', 'taxdelinquencyflag']
12
We use these for training: ['transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet15', 'finishedsquarefeet50', 'fips', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'heatingorsystemtypeid',

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


adding baseline models to ensembler
training ensembler
predicting on lightgbm
predicting on lightgbm
predicting on lightgbm
predicting on lightgbm
predicting on lightgbm
predicting on xgboost
predicting on xgboost
predicting on xgboost
predicting on xgboost
predicting on xgboost
predicting on lightgbm
predicting on lightgbm
predicting on lightgbm
predicting on lightgbm
predicting on lightgbm
predicting on catboost
predicting on catboost
predicting on catboost
predicting on catboost
predicting on catboost
predicting on lightgbm
predicting on lightgbm
predicting on lightgbm
predicting on lightgbm
predicting on lightgbm
predicting on xgboost
predicting on xgboost
predicting on xgboost
predicting on xgboost
predicting on xgboost
predicting on catboost
predicting on catboost
predicting on catboost
predicting on catboost
predicting on catboost
predicting on lightgbm
predicting on lightgbm
predicting on lightgbm
predicting on lightgbm
predicting on lightgbm
predicting on xgboost
predicting on