In [46]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder

# Load the labelled training set and the unlabelled test set.
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

# Integer-encode every string-typed column. Each encoder is fitted on the
# union of the train and test values so that a category appearing in only one
# of the two files still receives a code.
categorical_columns = [name for name in train.columns if train[name].dtype == 'object']
for name in categorical_columns:
    train_values = list(train[name].values)
    test_values = list(test[name].values)
    encoder = LabelEncoder().fit(train_values + test_values)
    train[name] = encoder.transform(train_values)
    test[name] = encoder.transform(test_values)

# Report the resulting frame sizes.
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))

Shape train: (4209, 378)
Shape test: (4209, 377)


In [47]:
# add pca
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection

n_comp = 20

# Every reducer below is fitted on the same matrix: the training frame without
# the target column, taken BEFORE any component column is appended.
train_features = train.drop(["y"], axis=1)

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(train_features)
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(train_features)
ica2_results_test = ica.transform(test)

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train_features)
tsvd_results_test = tsvd.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train_features)
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train_features)
srp_results_test = srp.transform(test)

# Append the components to both frames. Columns are interleaved per component
# index (pca_1, ica_1, tsvd_1, grp_1, srp_1, pca_2, ...) and are added in the
# same order to train and test, so the two feature matrices stay aligned.
projections = (
    ('pca', pca2_results_train, pca2_results_test),
    ('ica', ica2_results_train, ica2_results_test),
    ('tsvd', tsvd_results_train, tsvd_results_test),
    ('grp', grp_results_train, grp_results_test),
    ('srp', srp_results_train, srp_results_test),
)
for component in range(n_comp):
    suffix = str(component + 1)
    for prefix, train_proj, test_proj in projections:
        column = prefix + '_' + suffix
        train[column] = train_proj[:, component]
        test[column] = test_proj[:, component]

# Report the frame sizes after augmentation.
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))

# Full labelled matrix / target as plain numpy arrays.
y_train = train['y'].values
x_train = train.drop(["y"], axis=1).values

# Hold out 30% of the labelled rows for local validation.
x_train_data, x_test_data, y_train_data, y_test_data = train_test_split(
    x_train, y_train, test_size=0.3, random_state=230)
print(type(x_train_data),x_train_data.shape)

Shape train: (4209, 438)
Shape test: (4209, 437)
<class 'numpy.ndarray'> (2946, 437)


In [48]:
import numpy as np
from sklearn.cross_validation import KFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from copy import copy
    
class StackedGeneralizer(object):
    """Two-level stacked ensemble built from K-fold snapshots of every model.

    Level 1: each base model is fitted once per fold on that fold's training
    rows, and an independent snapshot of the fitted model is stored. This
    yields ``len(base_models) * n_folds`` base predictors.

    Level 2: the predictions of all base snapshots on the full training matrix
    become the blend features (one column per snapshot). The blending model is
    fitted once per fold on those features; ``predict`` returns the mean of
    the per-fold blending models' outputs.

    NOTE(review): the blend features are in-sample predictions (each snapshot
    predicts rows it was partly trained on), not out-of-fold predictions, so
    the blending model is trained on optimistic inputs.
    """

    def __init__(self, base_models=None, blending_model=None, n_folds=5, verbose=True):
        """
        Parameters
        -----------
        base_models: list of estimators
            Each model must provide ``.fit(X, y)`` and ``.predict(X)``
            (sklearn style).
        blending_model: estimator
            Model that combines the base predictions. Must provide
            ``.fit(X, y)`` and ``.predict(X)``.
        n_folds: int
            Number of contiguous (unshuffled) folds used for both levels.
        verbose: boolean
            When True, print per-fold progress while fitting.

        Example
        -------
        ``X`` and ``y`` must support positional array indexing
        (e.g. numpy arrays)::

            sg = StackedGeneralizer([Ridge(), LinearSVR()], LinearRegression(),
                                    n_folds=5, verbose=False)
            sg.fit(X, y)
            y_hat = sg.predict(X_new)   # 1-D array, one value per row
        """
        self.base_models = base_models
        self.blending_model = blending_model
        self.n_folds = n_folds
        self.verbose = verbose
        # fitted snapshots, ordered model-major / fold-minor
        self.base_models_cv = []
        # fitted blending-model snapshots, one per fold
        self.blending_model_cv = []

    def _fold_indices(self, n_samples):
        """Return a list of ``(train_idx, test_idx)`` pairs for ``n_folds`` contiguous folds."""
        # Imported locally so this class does not rely on the KFold name bound
        # at file level, which has the older (n, n_folds) call signature.
        from sklearn.model_selection import KFold
        return list(KFold(n_splits=self.n_folds).split(np.arange(n_samples)))

    @staticmethod
    def _snapshot(model):
        """Return a copy of a fitted model that shares no state with ``model``."""
        # A shallow copy would alias any nested mutable state; the original
        # estimator is re-fitted on the next fold, which could silently
        # overwrite what earlier snapshots learned.
        from copy import deepcopy
        return deepcopy(model)

    def fit(self, X, y):
        """Fit the base-model snapshots, then the blending-model snapshots."""
        self.fit_base_models(X, y)
        print('fit base models done')
        self.fit_blending_model(X, y)
        print('fit blend model done')

    def fit_base_models(self, X, y):
        """Fit every base model on each fold's training rows and store a snapshot per fit."""
        if self.verbose:
            print('Fitting Base Models...')

        # Start from a clean slate so a repeated fit() does not keep stale models.
        self.base_models_cv = []
        folds = self._fold_indices(y.shape[0])

        for model in self.base_models:
            for j, (train_idx, _) in enumerate(folds):
                if self.verbose:
                    print('Fold %d' % (j + 1))

                model.fit(X[train_idx], y[train_idx])

                # keep an independent copy of the model fitted on this fold
                self.base_models_cv.append(self._snapshot(model))

    def fit_blending_model(self, X, y):
        """Fit the blending model per fold on the base snapshots' predictions for ``X``."""
        if self.verbose:
            model_name = "%s" % self.blending_model.__repr__()
            print('Fitting Blending Model:\n%s' % model_name)

        # Start from a clean slate so a repeated fit() does not keep stale models.
        self.blending_model_cv = []

        print('model cnts',len(self.base_models_cv))
        predictions = [m.predict(X) for m in self.base_models_cv]

        # transpose base model output as blend model input:
        # rows = samples, columns = base snapshots
        blend_x = np.array(predictions).transpose()
        print('blend_shape',blend_x.shape)

        for j, (train_idx, _) in enumerate(self._fold_indices(y.shape[0])):
            if self.verbose:
                print('Fold %d' % (j + 1))

            self.blending_model.fit(blend_x[train_idx], y[train_idx])

            # keep an independent copy of the blender fitted on this fold
            self.blending_model_cv.append(self._snapshot(self.blending_model))

    def predict(self, X):
        """Return the mean of all blending snapshots' predictions for ``X``."""
        # base level: one prediction vector per stored base snapshot
        predictions = [m.predict(X) for m in self.base_models_cv]

        # transpose base model output as blend model input
        blend_x = np.array(predictions).transpose()
        blend_res = np.array([m.predict(blend_x) for m in self.blending_model_cv]).transpose()
        print('blend_res_shape',blend_res.shape)
        # average across the per-fold blending models
        pred_res = np.mean(blend_res,axis=1)
        print('final_res_shape',pred_res.shape)
        return pred_res

    def evaluate(self, y, y_pred):
        """Print a classification report and confusion matrix; return the accuracy.

        Uses classification metrics, so ``y`` and ``y_pred`` are expected to be
        class labels rather than continuous regression outputs.
        """
        print(classification_report(y, y_pred))
        print('Confusion Matrix:')
        print(confusion_matrix(y, y_pred))
        return accuracy_score(y, y_pred)

In [54]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.svm import LinearSVR
from sklearn.metrics import r2_score
import xgboost
def stack_test():
    """Train a 5-fold stack on the 70% split, report R^2 scores and write a submission.

    Three XGBoost regressors (subsample 0.95 / 0.75 / 0.65, learning rate
    0.04) are blended by a LinearRegression. Prints R^2 on the full labelled
    set and on the 30% hold-out, then writes the test-set predictions to
    ``result/cv_blend_<holdout R^2>.csv``.
    """
    learners = [
        xgboost.XGBRegressor(subsample=ratio, learning_rate=0.04)
        for ratio in (0.95, 0.75, 0.65)
    ]
    #blending_model = xgboost.XGBRegressor()
    stacker = StackedGeneralizer(learners, LinearRegression(), n_folds=5, verbose=False)
    stacker.fit(x_train_data, y_train_data)

    # score on ALL labelled rows (training split and hold-out together)
    all_rows_pred = stacker.predict(x_train).flatten()
    print(r2_score(y_train, all_rows_pred))

    # score on the hold-out rows only
    holdout_pred = stacker.predict(x_test_data).flatten()
    test_score = r2_score(y_test_data, holdout_pred)
    print(test_score)

    pred = stacker.predict(test.values).flatten()
    print(pred,pred.shape)

    submission = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': pred})
    submission.to_csv('result/cv_blend_{}.csv'.format(test_score), index=False)

stack_test()

fit base models done
model cnts 15
blend_shape (2946, 15)
fit blend model done
blend_res_shape (4209, 5)
final_res_shape (4209,)
0.624064504635
blend_res_shape (1263, 5)
final_res_shape (1263,)
0.588625916952
blend_res_shape (4209, 5)
final_res_shape (4209,)
[  83.9624639   101.06069488   87.36056512 ...,   90.69801331  113.18206346
   92.07387539] (4209,)


In [50]:
def stack_fold_test(fold):
    """Train a ``fold``-fold stack on the 70% split, report R^2 and write a submission.

    Three XGBoost regressors (subsample 0.95 / 0.75 / 0.65, learning rate
    0.03) are blended by a LinearRegression. Prints R^2 on the full labelled
    set and on the 30% hold-out, then writes the test-set predictions to
    ``result/cv_blend_<holdout R^2>_<fold>.csv``.

    Parameters
    ----------
    fold : int
        Number of folds handed to StackedGeneralizer.
    """
    learners = [
        xgboost.XGBRegressor(subsample=ratio, learning_rate=0.03)
        for ratio in (0.95, 0.75, 0.65)
    ]
    #blending_model = xgboost.XGBRegressor()
    stacker = StackedGeneralizer(learners, LinearRegression(), n_folds=fold, verbose=False)
    stacker.fit(x_train_data, y_train_data)

    # score on ALL labelled rows (training split and hold-out together)
    all_rows_pred = stacker.predict(x_train).flatten()
    print(r2_score(y_train, all_rows_pred))

    # score on the hold-out rows only
    holdout_pred = stacker.predict(x_test_data).flatten()
    test_score = r2_score(y_test_data, holdout_pred)
    print(test_score)

    pred = stacker.predict(test.values).flatten()
    print(pred,pred.shape)

    submission = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': pred})
    submission.to_csv('result/cv_blend_{}_{}.csv'.format(test_score, fold), index=False)

# compare 2, 3 and 4 folds
for fold_count in (2, 3, 4):
    stack_fold_test(fold_count)

fit base models done
model cnts 6
blend_shape (2946, 6)
fit blend model done
blend_res_shape (4209, 2)
final_res_shape (4209,)
0.60181543917
blend_res_shape (1263, 2)
final_res_shape (1263,)
0.597962534947
blend_res_shape (4209, 2)
final_res_shape (4209,)
[  78.61691484   98.5378823    81.30592859 ...,   93.9822691   110.95421578
   92.73007013] (4209,)
fit base models done
model cnts 9
blend_shape (2946, 9)
fit blend model done
blend_res_shape (4209, 3)
final_res_shape (4209,)
0.607711292399
blend_res_shape (1263, 3)
final_res_shape (1263,)
0.588461092634
blend_res_shape (4209, 3)
final_res_shape (4209,)
[  79.42423125   95.59267969   80.12152572 ...,   92.19116764  111.01278085
   92.81338837] (4209,)
fit base models done
model cnts 12
blend_shape (2946, 12)
fit blend model done
blend_res_shape (4209, 4)
final_res_shape (4209,)
0.610660121912
blend_res_shape (1263, 4)
final_res_shape (1263,)
0.580491699714
blend_res_shape (4209, 4)
final_res_shape (4209,)
[  77.71772634   94.86208869

In [None]:
# use all data to gen res
def all_data_stack_fold_test(fold=5):
    """Train the stack on ALL labelled rows and write a submission file.

    Three XGBoost regressors (subsample 0.95 / 0.75 / 0.65, learning rate
    0.03) are blended by a LinearRegression, fitted on the complete
    ``x_train`` / ``y_train`` arrays.

    Parameters
    ----------
    fold : int, default 5
        Number of folds handed to StackedGeneralizer. The default mirrors
        StackedGeneralizer's own ``n_folds`` default; the original call passed
        no value, so the intended setting is unknown - TODO confirm.
    """
    VERBOSE = False
    N_FOLDS = fold

    base_models = [
                   xgboost.XGBRegressor(subsample=0.95, learning_rate=0.03),
                   xgboost.XGBRegressor(subsample=0.75, learning_rate=0.03),
                   xgboost.XGBRegressor(subsample=0.65, learning_rate=0.03),
                  ]
    #blending_model = xgboost.XGBRegressor()
    blending_model = LinearRegression()

    # initialize multi-stage model
    sg = StackedGeneralizer(base_models, blending_model, n_folds=N_FOLDS, verbose=VERBOSE)
    sg.fit(x_train,y_train)

    print(r2_score(y_train,sg.predict(x_train).flatten()))
    # NOTE: x_test_data is a subset of x_train, which the model was just
    # fitted on, so this is an in-sample score and not a hold-out estimate.
    # It is kept because it is part of the output file name.
    test_score = r2_score(y_test_data,sg.predict(x_test_data).flatten())
    print(test_score)
    pred = sg.predict(test.values).flatten()
    print(pred,pred.shape)
    output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': pred})
    output.to_csv('result/all_data_cv_blend_{}_{}.csv'.format(test_score,fold),index=False)

# The original call passed no argument and raised TypeError; pass it explicitly.
all_data_stack_fold_test(fold=5)

In [51]:
def stack_sample_rate_test(rate):
    """Train a 2-fold stack of ten XGBoost regressors at one learning rate.

    The ten base models use subsample ratios 0.05 * i for i in 10..19 and all
    share ``rate`` as their learning rate; a LinearRegression blends them.
    Prints R^2 on the full labelled set and on the 30% hold-out, then writes
    the test-set predictions to ``result/cv_blend_<holdout R^2>_<rate>.csv``.

    Parameters
    ----------
    rate : float
        Learning rate passed to every XGBRegressor; it is also printed and
        embedded in the output file name.
    """
    print('rate used',rate)
    VERBOSE = False
    N_FOLDS = 2
    base_models = [
        xgboost.XGBRegressor(subsample=0.05*step, learning_rate=rate)
        for step in range(10, 20)
    ]
    #blending_model = xgboost.XGBRegressor()
    blending_model = LinearRegression()

    # initialize multi-stage model
    sg = StackedGeneralizer(base_models, blending_model, n_folds=N_FOLDS, verbose=VERBOSE)
    sg.fit(x_train_data,y_train_data)

    # score on ALL labelled rows (training split and hold-out together)
    print(r2_score(y_train,sg.predict(x_train).flatten()))
    # score on the hold-out rows only
    test_score = r2_score(y_test_data,sg.predict(x_test_data).flatten())
    print(test_score)
    pred = sg.predict(test.values).flatten()
    print(pred,pred.shape)
    output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': pred})
    output.to_csv('result/cv_blend_{}_{}.csv'.format(test_score,rate),index=False)

for i in range(10):
    # 0.36 - i*0.02 picks up binary floating-point noise (e.g.
    # 0.33999999999999997), which would end up in the log line and the output
    # file name; round to the two decimals the grid is defined on.
    stack_sample_rate_test(round(0.36 - i*0.02, 2))
    print('--------------')

rate used 0.36
fit base models done
model cnts 20
blend_shape (2946, 20)
fit blend model done
blend_res_shape (4209, 2)
final_res_shape (4209,)
0.715807309105
blend_res_shape (1263, 2)
final_res_shape (1263,)
0.562342410207
blend_res_shape (4209, 2)
final_res_shape (4209,)
[  84.29193199  108.26766319   88.70576055 ...,   97.36413124  116.95089509
   96.93272605] (4209,)
--------------
rate used 0.33999999999999997
fit base models done
model cnts 20
blend_shape (2946, 20)
fit blend model done
blend_res_shape (4209, 2)
final_res_shape (4209,)
0.707683874933
blend_res_shape (1263, 2)
final_res_shape (1263,)
0.556151700712
blend_res_shape (4209, 2)
final_res_shape (4209,)
[  85.3440809   106.69686994   86.01314138 ...,   96.65180167  117.44556325
   96.87187643] (4209,)
--------------
rate used 0.32
fit base models done
model cnts 20
blend_shape (2946, 20)
fit blend model done
blend_res_shape (4209, 2)
final_res_shape (4209,)
0.708440415751
blend_res_shape (1263, 2)
final_res_shape (1263,