<center>
<img src="../../img/beeline_data_school_logo.png">
# "Прикладной анализ данных"
#### Интенсивный курс по изучению машинного обучения и анализа данных
<img src="../../img/beeline_logo.jpg" height="240" width="240">
## Автор материала: преподаватель Факультета Компьютерных Наук НИУ ВШЭ Кашницкий Юрий
</center>
Материал распространяется на условиях лицензии <a href="https://opensource.org/licenses/MS-RL">Ms-RL</a>. Можно использовать в любых целях, кроме коммерческих, но с обязательным упоминанием автора материала.

# Семинар 13. Продвинутые методы классификации и регрессии

## Часть 2. Использование API Scikit-learn

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
import sys
sys.path.append('../../scripts/')
from load_titanic_with_features import load_titanic
from sklearn import cross_validation

In [2]:
class MyBlackBox(BaseEstimator):
    def __init__(self, base_classifiers={GradientBoostingClassifier(): {},
                                        RandomForestClassifier(): {},
                                        LogisticRegression(): {}},
                                        verbose=True, n_jobs=-1, cv=5):
        self.base_classifiers = base_classifiers
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.cv = cv

    def get_params(self, deep=True):
        return BaseEstimator.get_params(self, deep=deep)

    def set_params(self, **params):
        return BaseEstimator.set_params(self, **params)

    def fit(self, X, y):
        self.clf_weights = {}
        for clf in self.base_classifiers:
            print(clf)
            params = self.base_classifiers[clf]
            current_best_clf = GridSearchCV(clf,
                                 params,
                                 verbose=self.verbose, n_jobs=self.n_jobs, cv=self.cv)
            current_best_clf.fit(X, y)
            self.clf_weights[current_best_clf.best_estimator_] = current_best_clf.best_score_
        print(self.clf_weights)

    def predict(self, X):
        final_predictions = np.zeros([X.shape[0], 1])
        sum_clf_weights = sum(self.clf_weights.values())
        clf_weights = [weight / sum_clf_weights
                       for weight in self.clf_weights.values()]

        for clf in self.clf_weights:
            final_predictions += self.clf_weights[clf] / sum_clf_weights * clf.predict(X).reshape([X.shape[0], 1])
        # print(final_predictions)
        return (final_predictions > 0.5).astype('int64')

In [3]:
%%time
X_train, y, X_test = load_titanic("../../data/titanic_train.csv",
                                "../../data/titanic_test.csv")


forest_params = {'criterion': ('gini', 'entropy'),
                 'n_estimators': list(range(50, 300, 50)),
                 'max_depth': list(range(1, 5)),
                 'min_samples_leaf': list(range(1, 5))}

gboost_params = {'learning_rate': [0.1, 0.2, 0.3],
                 'n_estimators': list(range(10, 100,20)),
                 'max_depth': list(range(1,5)),
                 'min_samples_leaf': list(range(1,5)),
                 'min_samples_split': list(range(1,5))}

log_reg_params = {'C': [0.1, 5, 10, 50]}

clf = MyBlackBox(base_classifiers={GradientBoostingClassifier(): gboost_params,
                                   RandomForestClassifier(): forest_params,
                                   LogisticRegression(): log_reg_params},
                     cv=3)

clf.fit(X_train, y)

scores = cross_validation.cross_val_score(clf, X_train,
                                          y, cv=3, scoring="accuracy")

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)
Fitting 3 folds for each of 4 candidates, totalling 12 fits
GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
Fitting 3 folds for each of 960 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 1800 jobs       | elapsed:   38.8s
[Parallel(n_jobs=-1)]: Done 2450 jobs       | elapsed:   50.0s
[Parallel(n_jobs=-1)]: Done 2874 out of 2880 | elapsed:  1.0min remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed:  1.0min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fitting 3 folds for each of 160 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done 474 out of 480 | elapsed:   29.9s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:   30.5s finished


{GradientBoostingClassifier(init=None, learning_rate=0.3, loss='deviance',
              max_depth=2, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=3, min_samples_split=1,
              min_weight_fraction_leaf=0.0, n_estimators=90,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False): 0.85185185185185186, LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0): 0.8125701459034792, RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False): 0.8316498316498

[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done 1800 jobs       | elapsed:   36.6s
[Parallel(n_jobs=-1)]: Done 2450 jobs       | elapsed:   49.9s
[Parallel(n_jobs=-1)]: Done 2874 out of 2880 | elapsed:  1.1min remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed:  1.1min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fitting 3 folds for each of 160 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done 474 out of 480 | elapsed:   38.6s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:   39.5s finished


{LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0): 0.81818181818181823, GradientBoostingClassifier(init=None, learning_rate=0.2, loss='deviance',
              max_depth=2, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=2, min_samples_split=1,
              min_weight_fraction_leaf=0.0, n_estimators=90,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False): 0.83838383838383834, RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False): 0.841750841

[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.1s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fitting 3 folds for each of 160 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:   34.2s
[Parallel(n_jobs=-1)]: Done 474 out of 480 | elapsed:   35.9s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:   36.6s finished


GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
Fitting 3 folds for each of 960 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed:   26.3s
[Parallel(n_jobs=-1)]: Done 1800 jobs       | elapsed:   41.5s
[Parallel(n_jobs=-1)]: Done 2450 jobs       | elapsed:   57.0s
[Parallel(n_jobs=-1)]: Done 2874 out of 2880 | elapsed:  1.1min remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed:  1.1min finished


{GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=4, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=3, min_samples_split=1,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False): 0.81313131313131315, LogisticRegression(C=50, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0): 0.8158783783783784, RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False): 0.81818181818

[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:   36.7s finished


GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
Fitting 3 folds for each of 960 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed:   31.2s
[Parallel(n_jobs=-1)]: Done 1800 jobs       | elapsed:   45.1s
[Parallel(n_jobs=-1)]: Done 2450 jobs       | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 2874 out of 2880 | elapsed:  1.3min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed:  1.3min finished


{LogisticRegression(C=5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0): 0.80134680134680136, RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False): 0.82659932659932656, GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=4, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=4,
              min_weight_fraction_leaf=0.0, n_estimators=90,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False): 0.82659932659

In [5]:
print(np.mean(scores))

predictions = clf.predict(X_test)

predicted_df = pd.DataFrame(predictions,
                            index = np.arange(892, 892 + X_test.shape[0]),
                            columns=["Survived"])
predicted_df.to_csv("../../output/titanic_myblackbox.csv", 
                    index_label="PassengerId")

0.830527497194
