In [3]:
from mlens.ensemble import SuperLearner
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.stats import uniform, randint
from mlens.preprocessing import EnsembleTransformer
from mlens.metrics import make_scorer
from mlens.model_selection import Evaluator
from sklearn.base import BaseEstimator
from model_super import *
from models import *
from ensembler import *
from parameters import *
from data_processing import *
from gridSearch import *
import os.path as path
import sys
import matplotlib.pyplot as plt

# TODO:  Analyze different base models with gridsearch (RandomForest, Adaboost, Neural Networks, DecisionTree, Lasso)
# TODO:  remember to scale neural network training data before feeding it

############################################## Custom Classes #########################################################

class MultiCatBoost(BaseEstimator):
    """Seed-averaged bundle of five ``CatBoostRegressor`` models.

    Every member is built from the same ``parameters`` dict and differs only
    in ``random_seed`` (0 through 4). ``predict`` returns the arithmetic mean
    of the five members' predictions.
    """

    def __init__(self, parameters, cat_feature_inds):
        """Store the configuration and build the five member regressors.

        parameters: keyword arguments forwarded to every CatBoostRegressor.
        cat_feature_inds: value passed as ``cat_features`` when fitting.
        """
        self.parameters = parameters
        self.cat_feature_inds = cat_feature_inds
        self.models = [
            CatBoostRegressor(**self.parameters, random_seed=seed)
            for seed in range(5)
        ]

    def fit(self, x_train, y_train):
        """Fit every member on the same training data and return ``self``."""
        for member in self.models:
            member.fit(x_train, y_train, cat_features=self.cat_feature_inds)
        return self

    def predict(self, x_test):
        """Return the mean of the five members' predictions for ``x_test``."""
        total = 0.0
        for member in self.models:
            print("predicting on catboost")
            total = total + member.predict(x_test, verbose=True)
        # Five members are created in __init__, hence the fixed divisor.
        return total / 5


class MultiXGBoost(BaseEstimator):
    """Seed-averaged bundle of five ``XGBRegressor`` models.

    All members share the same ``parameters``; only ``random_state`` (0-4)
    varies. Predictions are the plain average over the members.
    """

    def __init__(self, parameters):
        """Keep ``parameters`` and instantiate one regressor per seed."""
        self.parameters = parameters
        self.models = [
            XGBRegressor(**self.parameters, random_state=seed)
            for seed in range(5)
        ]

    def fit(self, x_train, y_train):
        """Train each member on ``(x_train, y_train)``; returns ``self``."""
        for member in self.models:
            member.fit(x_train, y_train)
        return self

    def predict(self, x_test):
        """Average the members' predictions for ``x_test``."""
        total = 0.0
        for member in self.models:
            print("predicting on xgboost")
            total = total + member.predict(x_test)
        # Divisor matches the five members built in __init__.
        return total / 5


class MultiLightGBM(BaseEstimator):
    """Seed-averaged bundle of five ``LGBMRegressor`` models.

    The members are configured identically from ``parameters`` apart from
    ``random_state``, which runs from 0 to 4. ``predict`` averages them.
    """

    def __init__(self, parameters):
        """Remember ``parameters`` and create the five seeded regressors."""
        self.parameters = parameters
        self.models = [
            LGBMRegressor(**self.parameters, random_state=seed)
            for seed in range(5)
        ]

    def fit(self, x_train, y_train):
        """Fit all members on the same data and return ``self``."""
        for member in self.models:
            member.fit(x_train, y_train)
        return self

    def predict(self, x_test):
        """Return the mean prediction of the five members for ``x_test``."""
        total = 0.0
        for member in self.models:
            print("predicting on lightgbm")
            total = total + member.predict(x_test)
        # Five members exist by construction, so divide by five.
        return total / 5

############################################## Helper methods ##########################################################

#Performs gridsearch on the "meta-learners" which predict on the first layer predictions
def evaluateSecondLayer(base_learners, x_train, y_train, meta_learners, param_dicts):
    """Search meta-learner hyper-parameters on top of stacked base learners.

    The base learners are wrapped in an ``EnsembleTransformer`` 'stack' layer
    that is used as the 'meta' preprocessing step of an mlens ``Evaluator``.
    The evaluator is scored with mean absolute error (registered with
    ``greater_is_better=False``), uses ``cv=4`` and ``n_iter=30``.

    x_train, y_train: objects exposing ``.values`` (e.g. pandas objects).
    meta_learners, param_dicts: forwarded unchanged to ``Evaluator.fit``.

    Returns nothing; the evaluator summary is written to
    'HyperCatboost.html' and 'HypterCatboost.csv' in the working directory.
    """
    stack_transformer = EnsembleTransformer()
    print("adding base learners to transformer")
    stack_transformer.add('stack', base_learners)

    print("creating scorer")
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
    evaluator = Evaluator(mae_scorer, cv=4, verbose=1)

    print("fitting evaluator")
    evaluator.fit(
        x_train.values,
        y_train.values,
        meta_learners,
        param_dicts,
        preprocessing={'meta': [stack_transformer]},
        n_iter=30,  # bump this up to do a larger grid search
    )

    # NOTE(review): the two file names differ in spelling ("Hyper" vs
    # "Hypter"); kept as-is in case something downstream reads them.
    summary = pd.DataFrame(evaluator.summary)
    summary.to_html('HyperCatboost.html')
    summary.to_csv('HypterCatboost.csv', index=False, header=False, sep='\t')


#Adds features to the dataset
def add_date_features(df):
    """Derive calendar features from ``transactiondate`` and drop that column.

    Adds, in this order: ``transaction_year``, ``transaction_month``,
    ``transaction_quarter``, ``sin_month``, ``cos_month``, ``sin_quarter``
    and ``cos_quarter``. The frame is modified in place and also returned.

    df: DataFrame with a datetime-typed ``transactiondate`` column (the
        ``.dt`` accessor is used on it).

    Returns the same DataFrame object, without ``transactiondate``.
    """
    dates = df["transactiondate"].dt
    df["transaction_year"] = dates.year
    df["transaction_month"] = dates.month
    df["transaction_quarter"] = dates.quarter

    # Same angular scale as before (pi/12 per month, pi/4 per quarter).
    # NOTE(review): this spans half a turn per year, so December and January
    # are not adjacent; use 2*pi if a full cyclical encoding is wanted.
    month_angle = df["transaction_month"] * np.pi / 12
    quarter_angle = df["transaction_quarter"] * np.pi / 4

    df["sin_month"] = np.sin(month_angle)
    # The cosine columns previously used np.sin and duplicated the sine ones.
    df["cos_month"] = np.cos(month_angle)
    df["sin_quarter"] = np.sin(quarter_angle)
    df["cos_quarter"] = np.cos(quarter_angle)

    df.drop(["transactiondate"], inplace=True, axis=1)
    return df

def add_geographic_features(property2016, property2017):
    """Add Cartesian and PCA-rotated location features to both property sets.

    The two frames are stacked so the PCA is fitted on both years together,
    then split back apart by a temporary ``year`` tag.

    property2016, property2017: DataFrames with ``latitude`` and
        ``longitude`` columns.
        NOTE(review): the conversion treats them as degrees - confirm
        against the data source.

    Returns a ``(features_2016, features_2017)`` tuple of new DataFrames with
    the extra columns ``x``, ``y``, ``z``, ``latlong_pca0`` and
    ``latlong_pca1``. Missing coordinates yield -1 in the Cartesian columns
    and are replaced by -1 before the PCA.
    """
    # pd.concat replaces DataFrame.append (removed in pandas 2.0). assign()
    # tags copies, so the caller's frames no longer gain a 'year' column.
    complete = pd.concat([
        property2016.assign(year=2016),
        property2017.assign(year=2017),
    ])

    radian_lat = complete['latitude'] * np.pi / 180
    radian_long = complete['longitude'] * np.pi / 180
    earth_radius = 3959  # presumably miles - TODO confirm

    complete['x'] = (-earth_radius * np.cos(radian_lat) * np.sin(radian_long)).fillna(-1)
    complete['y'] = (earth_radius * np.sin(radian_lat)).fillna(-1)
    # z previously reused sin(longitude), making it exactly -x; the third
    # axis needs cos(longitude) so that x, y, z lie on the sphere.
    complete['z'] = (earth_radius * np.cos(radian_lat) * np.cos(radian_long)).fillna(-1)

    pca = PCA(n_components=2)
    rotated_latlong = pca.fit_transform(complete[['latitude', 'longitude']].fillna(-1))
    complete['latlong_pca0'] = rotated_latlong[:, 0]
    complete['latlong_pca1'] = rotated_latlong[:, 1]

    # The year tag only exists to undo the concatenation.
    is_2016 = complete['year'] == 2016
    is_2017 = complete['year'] == 2017
    return (
        complete[is_2016].drop(['year'], axis=1),
        complete[is_2017].drop(['year'], axis=1),
    )

def generate_first_layer_predictions():
    """Write out-of-fold predictions of every first-layer model to CSV.

    Reads the module-level ``x_train`` and ``y_train`` and a module-level
    ``models`` sequence (not defined in the visible part of this file -
    presumably supplied by one of the star imports; verify). Each model is
    fitted on three of four ``KFold`` folds and predicts the held-out fold,
    filling one column of the result per model.

    Returns nothing; the prediction matrix is printed and saved to
    'predictions_first_layer.csv'.
    """
    # Local import: KFold is not brought into scope by any explicit import
    # at the top of the file.
    from sklearn.model_selection import KFold

    # Use distinct local names. Assigning to x_train / y_train here made them
    # function locals, so reading the globals raised UnboundLocalError.
    features = np.array(x_train)
    targets = np.array(y_train)

    folds = list(KFold(n_splits=4).split(features, targets))

    first_layer_train_predictions = np.zeros((features.shape[0], len(models)))

    # Train the first layer: one column of out-of-fold predictions per model.
    for column, model in enumerate(models):
        print("training baseline model")
        for train_idx, test_idx in folds:
            model.fit(features[train_idx], targets[train_idx])
            first_layer_train_predictions[test_idx, column] = model.predict(features[test_idx])

    print("first layer train predictions: ")
    print(first_layer_train_predictions)
    print("shape: ")
    print(first_layer_train_predictions.shape)
    print("building csv")
    np.savetxt("predictions_first_layer.csv", first_layer_train_predictions, delimiter=",")


def plot_learning_curve(estimator, x_train, y_train):
    """Plot training and cross-validation score against training-set size.

    Scores come from ``learning_curve`` with 4-fold CV and
    'neg_mean_absolute_error' scoring. Each curve is drawn as its per-size
    mean with a shaded band of one standard deviation either side. The
    figure is shown with ``plt.show()`` and the mean CV scores are printed.
    """
    plt.figure()
    plt.title("XGBoost as Second Layer Predictor")
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    print("calculating learning curve values")
    sizes, fit_scores, cv_scores = learning_curve(
        estimator, x_train, y_train, n_jobs=-1,
        scoring='neg_mean_absolute_error', cv=4)

    # Aggregate over the CV folds (axis 1) for each training size.
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    cv_mean, cv_std = np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1)
    plt.grid()

    curves = (
        (fit_mean, fit_std, "r", "Training score"),
        (cv_mean, cv_std, "g", "Cross-validation score"),
    )
    # Bands first, then the lines, in the same draw order as before.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    print('showing plot')
    plt.show()
    print(cv_mean)

def plot_validation_curve(estimator, x_train, y_train, param_name, param_range,
                          title="XGBoost colsample analysis", xlabel="colsample"):
    """Plot mean train/test score for each value of one hyper-parameter.

    Scores come from ``validation_curve`` with 4-fold CV and
    'neg_mean_absolute_error' scoring, averaged over the folds.

    estimator, x_train, y_train: forwarded to ``validation_curve``.
    param_name: name of the estimator parameter being varied.
    param_range: values tried for ``param_name``; also used as x coordinates.
    title, xlabel: plot labels. The defaults reproduce the previously
        hard-coded text; pass e.g. ``xlabel=param_name`` when analysing a
        different parameter.

    Returns nothing; the plot is shown and the mean test scores are printed.
    """
    print('computing validation curve values')
    train_scores, test_scores = validation_curve(
        estimator, x_train, y_train, param_name=param_name,
        param_range=param_range, scoring='neg_mean_absolute_error',
        n_jobs=-1, cv=4)

    # Only the fold means are plotted; the unused std arrays and line-width
    # local from the earlier version were dropped.
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Score")
    plt.plot(param_range, train_scores_mean, 'r+', label='Train')
    plt.plot(param_range, test_scores_mean, 'g+', label='Test')
    plt.legend(loc="best")
    plt.show()

    print(test_scores_mean)

########################################### LOADING DATA ##############################################################

# NOTE(review): '__file__' is a string literal here, not the module's
# __file__ attribute, so after normalisation dir_path is the directory two
# levels above the current working directory. Kept as-is (presumably a
# notebook workaround) - confirm this is intended.
dir_path = path.abspath(path.join('__file__', "../../.."))

train_path = path.join(dir_path, 'training_data_custom.csv')
test_2016_path = path.join(dir_path, 'test_2016_data_custom.csv')
test_2017_path = path.join(dir_path, 'test_2017_data_custom.csv')

# Load train data and split the target column off the features.
train_data = pd.read_csv(train_path)
# columns= replaces the positional axis argument, which pandas 2.0 removed.
x_train = train_data.drop(columns='logerror')
y_train = train_data.logerror

# Identify categorical columns for catboost
train_features = list(x_train)
cat_feature_inds = categoricalColumns(x_train, train_features)

# Load test data
X_test_2016 = pd.read_csv(test_2016_path)
X_test_2017 = pd.read_csv(test_2017_path)
  

#x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=27000)

############################################# Graph Learning Curve ####################################################

#estimator = Ridge(**ridge_params)
#estimator = XGBRegressor(**xgb_params_2)
#plot_learning_curve(estimator, first_layer_results, y_train)

############################################ Graph Validation Curve ###################################################

#value_range = [2, 3, 4, 5]
#plot_validation_curve(estimator, first_layer_results, y_train,'max_depth', value_range)

########################################## Create and Train Ensembler ##################################################


# Stacked ensemble built with folds=4.
ensemble = SuperLearner(folds=4)

print("adding baseline models to ensembler")

# Base layer: the three seed-averaged boosters defined above. The parameter
# objects (getXGBParams, lightGBM_params, catboost_params) are not defined in
# the visible code - presumably they come from the star imports.
ensemble.add([MultiXGBoost(getXGBParams(y_train)), MultiLightGBM(lightGBM_params),
              MultiCatBoost(catboost_params, cat_feature_inds)])

# Meta learner: Ridge is active; the XGBRegressor alternative is kept
# commented out.
#ensemble.add_meta(XGBRegressor(**xgb_params_2))
ensemble.add_meta(Ridge(**ridge_params))

print("training ensembler")
# Fit on the full training frame loaded above.
ensemble.fit(x_train, y_train)

######################################### PREDICTING ON ENSEMBLE #######################################################

print("predicting on ensembler")
# NOTE(review): X_test is not defined in the visible code (only X_test_2016
# and X_test_2017 are loaded above) - confirm it is provided by one of the
# star imports, otherwise this line raises NameError.
preds = ensemble.predict(X_test)


# The block below is a bare string literal (it opens with four quote
# characters), used to disable the validation-set scoring code.
""""#Validation prediction:

preds = ensemble.predict(x_val)
accuracy = mean_absolute_error(y_val, preds)
print('validation accuracy: ')
print(accuracy) """

######################################### BUILDING KAGGLE SUBMISSION ###################################################


print("building prediction submission: ")
# NOTE(review): test_path is not defined in the visible code either (only
# train_path, test_2016_path and test_2017_path are) - verify its origin.
sub = pd.read_csv(test_path)
# Every column except 'ParcelId' receives the same prediction vector.
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = preds

print('Writing csv ...')
# Written to the working directory with values formatted to 4 decimals.
sub.to_csv('kaggle_submission.csv', index=False, float_format='%.4f')

X train Shape: 
(164299, 51)
Y train Shape: 
(164299,)
['transaction_year', 'transaction_month', 'transaction_quarter', 'sin_month', 'cos_month', 'sin_quarter', 'cos_quarter', 'airconditioningtypeid', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet15', 'finishedsquarefeet50', 'fips', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet', 'propertycountylandusecode', 'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt', 'threequarterbathnbr', 'unitcnt', 'yardbuildingsqft17', 'yearbuilt', 'numberofstories', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount', 'taxdelinquencyyear', 'censustractandblock', 'x', 'y', 'z', 'latlong_pca0', 'latlong_pca

KeyboardInterrupt: 