#  Домашнее задание:

Используя функцию sklearn.model_selection.GridSearchCV определите наилучшую комбинацию параметров для разных методов. Попробуйте также sklearn.model_selection.RandomizedSearchCV. Сделайте вывод об этих двух функциях, основываясь на полученном опыте (когда каким удобнее пользоваться и почему).

* Импортируем необходимые библиотеки

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

* Считываем данные

In [2]:
# Load the dataset from a comma-separated file; the first row supplies the column names.
bioresponce = pd.read_csv('bioresponse.csv', header=0, sep=',')
# Show the first rows of the frame as the cell output.
bioresponce.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [3]:
bioresponce.shape

(3751, 1777)

* Выделяем целевую переменную и признаки, задаём схему кросс-валидации

In [4]:
# Target vector: the values of the `Activity` column.
y = bioresponce['Activity'].values
# Feature matrix: every column from position 1 onward (everything after the first column).
X = bioresponce.iloc[:, 1:]
# Five-fold stratified splitter; the search helpers in this notebook read it as a global.
cv = StratifiedKFold(n_splits=5)

## Напишем функции для проверки параметров моделей

* GridSearchCV

In [5]:
def grid_search_cv_launch(classifier, parameters_grid):
    """Run a grid search for ``classifier`` and print the best result.

    A ``GridSearchCV`` with accuracy scoring is fitted on the notebook-level
    ``X`` / ``y`` data using the notebook-level ``cv`` splitter.

    Args:
        classifier: estimator handed to ``GridSearchCV``.
        parameters_grid: parameter grid handed to ``GridSearchCV``.

    Returns:
        None. The best estimator and then its best score are printed.
    """
    search = GridSearchCV(
        estimator=classifier,
        param_grid=parameters_grid,
        scoring='accuracy',
        cv=cv,
    )
    search.fit(X, y)
    print(search.best_estimator_)
    print(search.best_score_)

* RandomizedSearchCV

In [6]:
def randomized_search_cv_launch(classifier, parameters_grid):
    """Run a randomized search for ``classifier`` and print the best result.

    A ``RandomizedSearchCV`` with accuracy scoring, ``n_iter=10`` and
    ``random_state=21`` is fitted on the notebook-level ``X`` / ``y`` data
    using the notebook-level ``cv`` splitter.

    Args:
        classifier: estimator handed to ``RandomizedSearchCV``.
        parameters_grid: parameter space handed to ``RandomizedSearchCV``.

    Returns:
        None. The best estimator and then its best score are printed.
    """
    search_options = {
        'scoring': 'accuracy',
        'cv': cv,
        'n_iter': 10,         # number of sampled parameter settings
        'random_state': 21,   # fixed seed for the sampling
    }
    search = RandomizedSearchCV(classifier, parameters_grid, **search_options)
    search.fit(X, y)
    print(search.best_estimator_)
    print(search.best_score_)

## KNeighborsClassifier

http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [7]:
classifier = KNeighborsClassifier()

In [8]:
classifier.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': 1,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [9]:
# Search space for KNeighborsClassifier: 4 * 1 * 1 * 5 * 2 * 1 = 40 combinations.
# NOTE(review): leaf_size is documented as a speed/memory setting of the tree
# structures — presumably it does not change the score; confirm it is worth sweeping.
parameters_grid = {
    'n_neighbors': range(3, 7),      # 3, 4, 5, 6
    'weights': ['uniform'],
    'algorithm': ['auto'],
    'leaf_size': range(20, 41, 5),   # 20, 25, 30, 35, 40
    'p': [1, 2],
    'n_jobs': [-1],
}

* GridSearchCV

In [10]:
%%time
grid_search_cv_launch(classifier, parameters_grid)

KNeighborsClassifier(algorithm='auto', leaf_size=20, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
           weights='uniform')
0.749400159957
Wall time: 1h 12min 14s


* RandomizedSearchCV

In [11]:
%%time
randomized_search_cv_launch(classifier, parameters_grid)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
           weights='uniform')
0.749400159957
Wall time: 18min 37s


## DecisionTreeClassifier

http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [27]:
classifier = DecisionTreeClassifier()

In [28]:
classifier.get_params()

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': None,
 'splitter': 'best'}

In [29]:
# Search space for DecisionTreeClassifier: 2 * 2 * 3 * 3 = 36 combinations.
parameters_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_features': [None, 'sqrt', 'log2'],
    'min_samples_split': range(2, 5),   # 2, 3, 4
}

* GridSearchCV

In [30]:
%%time
grid_search_cv_launch(classifier, parameters_grid)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.732338043188
Wall time: 2min 26s


* RandomizedSearchCV

In [31]:
%%time
randomized_search_cv_launch(classifier, parameters_grid)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.726206344975
Wall time: 53.4 s


## LinearSVC

http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

In [7]:
classifier = LinearSVC()

In [8]:
classifier.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': True,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'loss': 'squared_hinge',
 'max_iter': 1000,
 'multi_class': 'ovr',
 'penalty': 'l2',
 'random_state': None,
 'tol': 0.0001,
 'verbose': 0}

In [9]:
# Search space for LinearSVC: 2 * 5 * 3 * 3 = 90 combinations.
# NOTE(review): the data preview shows only 0/1 in the target, so the problem
# looks binary — confirm that searching the 'crammer_singer' multiclass
# strategy is intended.
parameters_grid = {
    'multi_class': ['ovr', 'crammer_singer'],
    'max_iter': range(600, 1401, 200),   # 600, 800, 1000, 1200, 1400
    'C': [0.5, 1.0, 2.0],
    'tol': [0.0001, 0.0005, 0.001],
}

* GridSearchCV

In [10]:
%%time
grid_search_cv_launch(classifier, parameters_grid)

LinearSVC(C=0.5, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=600,
     multi_class='crammer_singer', penalty='l2', random_state=None,
     tol=0.001, verbose=0)
0.745934417489
Wall time: 5h 36min 10s


* RandomizedSearchCV

In [11]:
%%time
randomized_search_cv_launch(classifier, parameters_grid)

LinearSVC(C=0.5, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=600,
     multi_class='crammer_singer', penalty='l2', random_state=None,
     tol=0.0001, verbose=0)
0.74540122634
Wall time: 55min 17s


## RandomForestClassifier

http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [42]:
classifier = RandomForestClassifier()

In [43]:
classifier.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [44]:
# Search space for RandomForestClassifier: 5 * 2 * 3 * 4 * 1 = 120 combinations.
parameters_grid = {
    'n_estimators': range(70, 111, 10),   # 70, 80, 90, 100, 110
    'criterion': ['gini', 'entropy'],
    'max_features': [None, 'sqrt', 'log2'],
    'min_samples_leaf': range(1, 5),      # 1, 2, 3, 4
    'n_jobs': [-1],
}

* GridSearchCV

In [45]:
%%time
grid_search_cv_launch(classifier, parameters_grid)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=90, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.803519061584
Wall time: 2h 6min 57s


* RandomizedSearchCV

In [46]:
%%time
randomized_search_cv_launch(classifier, parameters_grid)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=80, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.798986936817
Wall time: 10min 7s


## GradientBoostingClassifier

http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

In [12]:
classifier = GradientBoostingClassifier()

In [13]:
classifier.get_params()

{'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'presort': 'auto',
 'random_state': None,
 'subsample': 1.0,
 'verbose': 0,
 'warm_start': False}

In [14]:
# Search space for GradientBoostingClassifier: 2 * 5 * 3 * 3 = 90 combinations.
# NOTE(review): the notebook reports that the searches over this grid did not
# finish after 11+ hours; the 'mae' criterion is presumably the slow part —
# confirm before re-running.
parameters_grid = {
    'loss': ['deviance', 'exponential'],
    'n_estimators': range(70, 111, 10),  # 70, 80, 90, 100, 110
    'criterion': ['friedman_mse', 'mse', 'mae'],
    'max_features': [None, 'sqrt', 'log2'],
}

* GridSearchCV

In [None]:
%%time
grid_search_cv_launch(classifier, parameters_grid)

* RandomizedSearchCV

In [None]:
%%time
randomized_search_cv_launch(classifier, parameters_grid)

* Код исполнялся более 11 часов, но так и не досчитался =(

# Итог

* Был испробован подбор параметров моделей с помощью поиска по сетке и случайного поиска по сетке.


* Использованные методы - GridSearchCV и RandomizedSearchCV.


* GridSearchCV производил перебор всех возможных комбинаций из заданных параметров, поэтому время работы на многих классификаторах было очень большим. Лучший результат был показан на RandomForestClassifier (0.803519061584).


* RandomizedSearchCV запускался лишь на 10 случайных комбинациях параметров, поэтому время работы было в разы меньше, чем при полном переборе. Результат в среднем был хуже, чем при GridSearchCV, но для KNeighborsClassifier была найдена оптимальная комбинация параметров. То есть при значительно меньшем времени работы RandomizedSearchCV может выдать оптимальные параметры из заданного нами диапазона.


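* GridSearchCV, в отличие от RandomizedSearchCV, лучше использовать при небольшой выборке или маленькой сетке, когда обучение происходит быстро и полный перебор не занимает много времени. Если же сетка большая или обучение медленное, удобнее RandomizedSearchCV: он проверяет лишь заданное число случайных комбинаций и за существенно меньшее время даёт близкий к оптимальному результат.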