In [22]:
import pandas as pd
import numpy as np
import sklearn 
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)

In [2]:
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.20.1.


In [3]:
from sklearn.model_selection import KFold

In [4]:
train_data = pd.read_csv('train_clean.csv')
test_data = pd.read_csv('test_clean.csv')

In [6]:
train_data.shape

(891, 14)

In [12]:
test_data.shape

(418, 13)

In [37]:
no_train = train_data.shape[0]
no_test = test_data.shape[0]
nfolds =5
seed = 0

In [150]:
kf = KFold(n_splits= nfolds, random_state= seed)

In [151]:
class SkHelper(object):
    def __init__(self, clf, seed = 0, params = None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)

In [152]:
def oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((no_train,))
    oof_test = np.zeros((no_test,))
    oof_test_skf = np.empty((nfolds, no_test))

    for i, (train_index, test_index) in enumerate(kf.split(train_data.values)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [153]:
params = {
    'rf': {
        'n_jobs': -1,
        'n_estimators': 500,
         'warm_start': True, 
         #'max_features': 0.2,
        'max_depth': 6,
        'min_samples_leaf': 2,
        'max_features' : 'sqrt',
        'verbose': 0
    },
    
    'et': {
        'n_jobs': -1,
        'n_estimators':500,
        #'max_features': 0.5,
        'max_depth': 8,
        'min_samples_leaf': 2,
        'verbose': 0
    },
    
    'ada': {
        'n_estimators': 500,
        'learning_rate' : 0.75
    },
    
    'gb': {
        'n_estimators': 500,
         #'max_features': 0.2,
        'max_depth': 5,
        'min_samples_leaf': 2,
        'verbose': 0
    },
    
    'svc': {
        'kernel' : 'linear',
        'C' : 0.025
    }
}

In [154]:
rf = SkHelper(clf=RandomForestClassifier, seed= seed, params= params['rf'])
et = SkHelper(clf=ExtraTreesClassifier, seed= seed, params= params['et'])
ada = SkHelper(clf=AdaBoostClassifier, seed= seed, params= params['ada'])
gb = SkHelper(clf=GradientBoostingClassifier, seed= seed, params= params['gb'])
svc = SkHelper(clf=SVC, seed= seed, params= params['svc'])

In [157]:
y_train = train_data['Survived'].ravel()
train_data = train_data.drop(['Survived'], axis=1)
x_train = train_data.values # Creates an array of the train data
x_test = test_data.values # Creats an array of the test data

In [158]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = oof(et, x_train, y_train, x_test) # Extra Trees
rf_oof_train, rf_oof_test = oof(rf,x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_tes = oof(ada, x_train, y_train, x_test) # AdaBoost 
gb_oof_train, gb_oof_test = oof(gb,x_train, y_train, x_test) # Gradient Boost
svc_oof_train, svc_oof_test = oof(svc,x_train, y_train, x_test) # Support Vector Classifier

print("Training is complete")

  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


Training is complete


In [164]:
rf_feature = rf.feature_importances(x_train,y_train)
et_feature = et.feature_importances(x_train, y_train)
ada_feature = ada.feature_importances(x_train, y_train)
gb_feature = gb.feature_importances(x_train,y_train)

  warn("Warm-start fitting without increasing n_estimators does not "


[0.05660262 0.05650975 0.11676808 0.20335274 0.02764599 0.02967261
 0.01940856 0.07093815 0.01784622 0.26342713 0.05806903 0.01519022
 0.0645689 ]
[0.0215872  0.02130417 0.11313358 0.36286868 0.02689778 0.0286179
 0.01638768 0.0544572  0.0249742  0.17932573 0.03837175 0.02252245
 0.08955166]
[0.388 0.434 0.012 0.01  0.014 0.018 0.024 0.008 0.008 0.046 0.03  0.002
 0.006]
[0.16452385 0.15211365 0.07369572 0.01058721 0.03498316 0.00602976
 0.00519684 0.03603608 0.0161009  0.37568827 0.08521811 0.00617449
 0.03365196]


In [182]:
base_train_preds = pd.DataFrame({
    'ExtraTrees': et_oof_train.ravel(),
    'RandomForrest': rf_oof_train.ravel(),
    'AdaBoost': ada_oof_train.ravel(),
    'GradientBoost': gb_oof_train.ravel(),
    'SupportVector': svc_oof_train.ravel()
})

In [183]:
base_train_preds.head()

Unnamed: 0,ExtraTrees,RandomForrest,AdaBoost,GradientBoost,SupportVector
0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,1.0,1.0
2,0.0,1.0,0.0,1.0,1.0
3,1.0,1.0,0.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0


In [190]:
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)

In [191]:
gbm1 = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm1.predict(x_test)

In [193]:
predictions.shape

(418,)

In [195]:
gender_submissions = pd.read_csv('gender_submission.csv')

In [198]:
StackingSubmission = pd.DataFrame({ 'PassengerId': gender_submissions['PassengerId'],
                            'Survived': predictions })
StackingSubmission.to_csv("StackingSubmission_test1.csv", index=False)

In [199]:
StackingSubmission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
