In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder

# read datasets
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

# process columns, apply LabelEncoder to categorical features
for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder() 
        lbl.fit(list(train[c].values) + list(test[c].values)) 
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))

# shape        
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))

Shape train: (4209, 378)
Shape test: (4209, 377)


In [4]:
# add pca
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection

n_comp = 10

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]
    
    train['ica_' + str(i)] = ica2_results_train[:,i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]
    
    train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
    
    train['grp_' + str(i)] = grp_results_train[:, i - 1]
    test['grp_' + str(i)] = grp_results_test[:, i - 1]

    train['srp_' + str(i)] = srp_results_train[:, i - 1]
    test['srp_' + str(i)] = srp_results_test[:, i - 1]

# shape        
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))

x_train = train.drop(["y"], axis=1).values
y_train = train['y'].values
# split train data
x_train_data,x_test_data,y_train_data,y_test_data = train_test_split(x_train,y_train,test_size=0.3,random_state=23)
print(type(x_train_data),x_train_data.shape)

Shape train: (4209, 428)
Shape test: (4209, 427)
<class 'numpy.ndarray'> (2946, 427)


In [5]:
# train a base xgb regressor
import xgboost
from sklearn.metrics import r2_score
def test_base():
    m = xgboost.XGBRegressor()
    print(m)
    m.fit(x_train_data,y_train_data)
    print(r2_score(y_train,m.predict(x_train)))
    print(r2_score(y_test_data,m.predict(x_test_data)))
    pred = m.predict(test.values)
    print(pred,pred.shape)
    output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': pred})
    output.to_csv('result/add_base_xgb.csv',index=False)
test_base()

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)




0.652118347623
0.550637170345
[  91.53730011  105.78723145   78.65650177 ...,   92.05712128  114.97119904
   91.95201111] (4209,)


In [6]:
import numpy as np
from sklearn.cross_validation import KFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from copy import copy
    
class StackedGeneralizer(object):
    """Base class for stacked generalization classifier models
    """

    def __init__(self, base_models=None, blending_model=None, n_folds=5, verbose=True):
        """
        Stacked Generalizer Classifier

        Trains a series of base models using K-fold cross-validation, then combines
        the predictions of each model into a set of features that are used to train
        a high-level classifier model. 

        Parameters
        -----------
        base_models: list of classifier models
            Each model must have a .fit and .predict_proba/.predict method a'la
            sklearn
        blending_model: object
            A classifier model used to aggregate the outputs of the trained base
            models. Must have a .fit and .predict_proba/.predict method
        n_folds: int
            The number of K-folds to use in =cross-validated model training
        verbose: boolean

        Example
        -------

        from sklearn.datasets import load_digits
        from stacked_generalizer import StackedGeneralizer
        from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
        from sklearn.linear_model import LogisticRegression
        import numpy as np

        logger = Logger('test_stacked_generalizer')

        VERBOSE = True
        N_FOLDS = 5
        
        # load data and shuffle observations
        data = load_digits()

        X = data.data
        y = data.target

        shuffle_idx = np.random.permutation(y.size)

        X = X[shuffle_idx]
        y = y[shuffle_idx]

        # hold out 20 percent of data for testing accuracy
        n_train = round(X.shape[0]*.8)

        # define base models
        base_models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
                       RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
                       ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini')]

        # define blending model
        blending_model = LogisticRegression()

        # initialize multi-stage model
        sg = StackedGeneralizer(base_models, blending_model, 
                                n_folds=N_FOLDS, verbose=VERBOSE)

        # fit model
        sg.fit(X[:n_train],y[:n_train])

        # test accuracy
        pred = sg.predict(X[n_train:])
        pred_classes = [np.argmax(p) for p in pred]

        _ = sg.evaluate(y[n_train:], pred_classes)

                     precision    recall  f1-score   support

                  0       0.97      1.00      0.99        33
                  1       0.97      1.00      0.99        38
                  2       1.00      1.00      1.00        42
                  3       1.00      0.98      0.99        41
                  4       0.97      0.94      0.95        32
                  5       0.95      0.98      0.96        41
                  6       1.00      0.95      0.97        37
                  7       0.94      0.97      0.96        34
                  8       0.94      0.94      0.94        34
                  9       0.96      0.96      0.96        27

        avg / total       0.97      0.97      0.97       359
        """
        self.base_models = base_models
        self.blending_model = blending_model
        self.n_folds = n_folds
        self.verbose = verbose
        self.base_models_cv = []
    
    def fit(self, X, y):
        self.fit_base_models(X, y)
        print('fit base models done')
        self.fit_blending_model(X, y)
        print('fit blend model done')
    
    def fit_base_models(self, X, y):
        if self.verbose:
            print('Fitting Base Models...')

        kf = list(KFold(y.shape[0], self.n_folds))

        for i, model in enumerate(self.base_models):    
            for j, (train_idx, test_idx) in enumerate(kf):
                if self.verbose:
                    print('Fold %d' % (j + 1))
                # print(X.shape,min(train_idx),max(train_idx),len(train_idx),type(X))
                X_train = X[train_idx]
                y_train = y[train_idx]

                model.fit(X_train, y_train)

                # add trained model to list of CV'd models
                self.base_models_cv.append(copy(model))
        

    def fit_blending_model(self,X, y):
        if self.verbose:
            model_name = "%s" % self.blending_model.__repr__()
            print('Fitting Blending Model:\n%s' % model_name)

        predictions = []
        print('model cnts',len(self.base_models_cv))
        for m in self.base_models_cv:
            base_res = m.predict(X)
            print(base_res.shape)
            predictions.append(base_res)

        # transpose base model output as blend model input
        blend_x = np.array(predictions).transpose()
        print('blend_shape',blend_x.shape)
        self.blending_model.fit(blend_x,y)
    
    def predict(self, X):
        # perform model averaging to get predictions
        predictions = []
        for m in self.base_models_cv:
            predictions.append(m.predict(X))

        # transpose base model output as blend model input
        blend_x = np.array(predictions).transpose()
        return self.blending_model.predict(blend_x)

    def evaluate(self, y, y_pred):
        print(classification_report(y, y_pred))
        print('Confusion Matrix:')
        print(confusion_matrix(y, y_pred))
        return accuracy_score(y, y_pred)

In [7]:
# try stack generalization
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.linear_model import Ridge, LinearRegression

def stack_test():
    VERBOSE = True
    N_FOLDS = 3

    base_models = [#RandomForestRegressor(n_estimators=20),
                   #RandomForestRegressor(n_estimators=50), 
                   #ExtraTreesRegressor(),
                   #AdaBoostRegressor(),
                   #Ridge(),
                   #LinearRegression(),
                   xgboost.XGBRegressor(n_estimators=20),
                   xgboost.XGBRegressor(subsample=0.5),
                   xgboost.XGBRegressor(max_depth=10)
                  ]
    blending_model = xgboost.XGBRegressor()
    
    # initialize multi-stage model
    sg = StackedGeneralizer(base_models, blending_model, n_folds=N_FOLDS, verbose=VERBOSE)
    sg.fit(x_train_data,y_train_data)
    
    print(r2_score(y_train,sg.predict(x_train).flatten()))
    print(r2_score(y_test_data,sg.predict(x_test_data).flatten()))
    pred = sg.predict(test.values).flatten()
    print(pred,pred.shape)
    output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': pred})
    output.to_csv('result/add_stack_sklearn.csv',index=False)

stack_test()

Fitting Base Models...
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
fit base models done
Fitting Blending Model:
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)
model cnts 9
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
blend_shape (2946, 9)
fit blend model done
0.832992017236
0.457162856673
[  81.19771576  100.81538391   80.2059021  ...,   97.37402344  112.05316162
   94.9573288 ] (4209,)


In [8]:
def stack_test():
    VERBOSE = True
    N_FOLDS = 3

    base_models = [RandomForestRegressor(n_estimators=20),
                   RandomForestRegressor(n_estimators=50), 
                   ExtraTreesRegressor(),
                   AdaBoostRegressor(),
                   Ridge(),
                   LinearRegression(),
                   xgboost.XGBRegressor(n_estimators=20),
                   xgboost.XGBRegressor(subsample=0.5),
                   xgboost.XGBRegressor(max_depth=10)
                  ]
    blending_model = xgboost.XGBRegressor()
    
    # initialize multi-stage model
    sg = StackedGeneralizer(base_models, blending_model, n_folds=N_FOLDS, verbose=VERBOSE)
    sg.fit(x_train_data,y_train_data)
    
    print(r2_score(y_train,sg.predict(x_train).flatten()))
    print(r2_score(y_test_data,sg.predict(x_test_data).flatten()))
    pred = sg.predict(test.values).flatten()
    print(pred,pred.shape)
    output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': pred})
    output.to_csv('result/add_stack_more_model_sklearn.csv',index=False)

stack_test()

Fitting Base Models...
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
fit base models done
Fitting Blending Model:
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)
model cnts 27
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
blend_shape (2946, 27)
fit blend model done
0.837295571899
0.438299336661
[  80.12405396  117.0450058    78.28937531 ...,   96.78491211  110.16893005
   92.93627167] (4209,)


In [9]:
def stack_test():
    VERBOSE = True
    N_FOLDS = 3

    base_models = [RandomForestRegressor(n_estimators=20),
                   RandomForestRegressor(n_estimators=50), 
                   ExtraTreesRegressor(),
                   AdaBoostRegressor(),
                   Ridge(),
                   LinearRegression(),
                   xgboost.XGBRegressor(n_estimators=20),
                   xgboost.XGBRegressor(subsample=0.5),
                   xgboost.XGBRegressor(max_depth=10)
                  ]
    blending_model = xgboost.XGBRegressor()
    
    # initialize multi-stage model
    sg = StackedGeneralizer(base_models, blending_model, n_folds=N_FOLDS, verbose=VERBOSE)
    sg.fit(x_train_data,y_train_data)
    
    print(r2_score(y_train,sg.predict(x_train).flatten()))
    print(r2_score(y_test_data,sg.predict(x_test_data).flatten()))
    pred = sg.predict(test.values).flatten()
    print(pred,pred.shape)
    output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': pred})
    output.to_csv('result/add_stack_more_model_all_data_sklearn.csv',index=False)

stack_test()

Fitting Base Models...
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
Fold 1
Fold 2
Fold 3
fit base models done
Fitting Blending Model:
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)
model cnts 27
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
(2946,)
blend_shape (2946, 27)
fit blend model done
0.838056654323
0.443171631076
[  79.36005402  122.00016785   77.81624603 ...,  102.04833984  109.56017303
   91.28230286] (4209,)
