Задача supervised обучения
Когда есть правильные ответы, придумать правильный алгоритм, который давал мы минимум функции потерь
сумма L(a(x), y) -> min
L - функция потерь
Y = 0,1 - задача бинарной классификацции (спам/не спам)
Y = 1,2,..C - задача мультиклассовой классификации (категоризация)
Y = R - задача регрессии (прогнозирование спроса)
Задача unsupervised (найти похожих пользователей, задача классификации, задача поиска аномалий)

Переобучение - хорошо предсказываем на обучающей выборке, плохо предсказываем на новом объекте. (пример с апроксимацией полинома 8 степени, лучше сделать линейную зависимость).
Кросс-валидация - разбиваем данные на 2 части (train and test). Обучаем на train, тестим алгоритм на test. Начало переобучения - когда ошибка на train уменьшается, а ошибка на test в какой-то момент начинает расти. Подобрать параметры, которые на качестве test максимальны.
Может не повезти и разделение будет неравномерным на train and test. Можно разбить на k раз. k cross validation. Усредняется качество. 

# Обучаем первые классификаторы в sklearn

### Данные


По данным характеристикам молекулы требуется определить, будет ли дан биологический ответ (biological response).

Для демонстрации используется обучающая выборка из исходных данных bioresponse.csv, файл с данными прилагается.

### Готовим обучающую и тестовую выборки

In [1]:
import pandas as pd

bioresponce = pd.read_csv('bioresponse.csv', header=0, sep=',')

In [2]:
bioresponce.head(5)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [3]:
y = bioresponce.Activity.values
y

array([1, 1, 1, ..., 0, 1, 0])

In [4]:
X = bioresponce.iloc[:, 1:]
X.head()

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,0.243144,...,0,0,0,0,0,0,0,0,0,0
1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,0.10648,...,1,1,1,1,0,1,0,0,1,0
2,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,0.352308,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,0.208989,...,0,0,0,0,0,0,0,0,0,0
4,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,0.125177,...,0,0,0,0,0,0,0,0,0,0


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Строим модель и оцениваем качество

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
model = LogisticRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)

In [8]:
type(preds)

numpy.ndarray

In [9]:
preds == y_test

array([False,  True,  True, ..., False, False, False], dtype=bool)

In [10]:
from __future__ import division
print(sum(preds == y_test) / len(preds))

0.75605815832


In [11]:
print(sum(preds == y_test) / float(len(preds)))

0.75605815832


In [12]:
from sklearn.metrics import accuracy_score

print(accuracy_score(preds, y_test))

0.75605815832


### Качество на кросс-валидации

In [13]:
from sklearn.model_selection import cross_val_score

print(cross_val_score(model, X_train, y_train, cv=5))

[ 0.74404762  0.73956262  0.72310757  0.75099602  0.75896414]


In [14]:
print(cross_val_score(model, X_train, y_train, cv=5).mean())

0.743335594477


### Пробуем другие классификаторы

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [17]:
%%time

models = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    LinearSVC(),
    RandomForestClassifier(n_estimators=100), 
    GradientBoostingClassifier(n_estimators=100)
]

for model in models:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(accuracy_score(preds, y_test), model)

0.718901453958 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.711631663974 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.74071082391 LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
0.782714054927 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
  

## Опциональное задание:

Попробуйте разные классификаторы с разными параметрами и постарайтесь добиться максимального качества на тестовой выборке

In [16]:
from sklearn.model_selection import GridSearchCV
import tqdm
import warnings
warnings.simplefilter('ignore')

In [17]:
# models = {'KNeighborsClassifier': KNeighborsClassifier(),
#           'DecisionTreeClassifier': DecisionTreeClassifier(),
#           'LinearSVC': LinearSVC(),
#           'RandomForestClassifier': RandomForestClassifier(), 
#           'GradientBoostingClassifier': GradientBoostingClassifier()}
# parameters_grid = {'KNeighborsClassifier' : {'n_neighbors': [1,3,5,7,10], \
#                                              'weights':['uniform', 'distance'], \
#                                              'algorithm': ['auto', 'ball_tree'], \
#                                              'leaf_size': [10, 30], \
#                                              'metric': ['minkowski', 'euclidean', 'cityblock']}, 
#                     'DecisionTreeClassifier' : {'criterion': ['gini', 'entropy'], \
#                                                 'splitter':['best', 'random'], \
#                                                 'max_depth': [None, 3], \
#                                                 'min_samples_leaf': np.arange(1, 3)},
#                    'LinearSVC' : {'loss':['hinge','squared_hinge'], \
#                                    'C': [0.1, 1.0, 10], \
#                                    'max_iter': [100, 400]},
#                     'RandomForestClassifier' : {'criterion':['gini','entropy'], \
#                                                 'n_estimators': [100, 200], \
#                                                 'max_depth': [1, 5, 10]},
#                     'GradientBoostingClassifier' : {'loss' : ['deviance', 'exponential'], \
#                                                     'learning_rate': [0.1, 0.5], \
#                                                     'max_depth': [3,5,7]}}

In [18]:
models = {'KNeighborsClassifier': KNeighborsClassifier(),
          'DecisionTreeClassifier': DecisionTreeClassifier()}

In [19]:
parameters_grid = {'KNeighborsClassifier' : {'n_neighbors': [1,3,5,7,10], \
                                             'weights':['uniform', 'distance'], \
                                             'algorithm': ['auto', 'ball_tree'], \
                                             'leaf_size': [10, 30], \
                                             'metric': ['minkowski', 'euclidean', 'cityblock']}, 
                    'DecisionTreeClassifier' : {'criterion': ['gini', 'entropy'], \
                                                'splitter':['best', 'random'], \
                                                'max_depth': [None, 3], \
                                                'min_samples_leaf': [1,2,3]}}

In [21]:
%%time

for key, model in tqdm.tqdm_notebook(models.items()):
    grid_cv = GridSearchCV(model, parameters_grid[key], n_jobs=-1)
    grid_cv.fit(X_train, y_train)
# Оценим самый лучший классификатор. Возвращается модель с лучшими параметрами.
    print(grid_cv.best_estimator_)
# Оценка на лучшем наборе параметров.
    print(grid_cv.best_score_)
# Лучший набор параметров в виде dict.
    print(grid_cv.best_params_)

A Jupyter Widget








Exception in thread Thread-4:
Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Applications/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/Applications/anaconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



KNeighborsClassifier(algorithm='auto', leaf_size=10, metric='cityblock',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='distance')
0.744926382809
{'algorithm': 'auto', 'leaf_size': 10, 'metric': 'cityblock', 'n_neighbors': 10, 'weights': 'distance'}
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.759649820931
{'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'splitter': 'best'}

CPU times: user 26.1 s, sys: 485 ms, total: 26.5 s
Wall time: 22min 28s


In [22]:
from sklearn.model_selection import RandomizedSearchCV

In [23]:
%%time

for key, model in tqdm.tqdm_notebook(models.items()):
    randomized_grid = RandomizedSearchCV(model, parameters_grid[key])
    randomized_grid.fit(X_train, y_train)
    print(randomized_grid.best_score_)
    print(randomized_grid.best_params_)

A Jupyter Widget

0.736569836848
{'weights': 'distance', 'n_neighbors': 5, 'metric': 'cityblock', 'leaf_size': 30, 'algorithm': 'ball_tree'}
0.758853959411
{'splitter': 'best', 'min_samples_leaf': 1, 'max_depth': 3, 'criterion': 'entropy'}

CPU times: user 5min 3s, sys: 1.44 s, total: 5min 5s
Wall time: 5min 6s


Анализ происходит на большом датасете, параметров много. 
GridSearchCV осуществляет полный перебор по сетке и занимает много времени.
RandomizedSearchCV осуществляет случайный поиск по сетке и выдает худшее качество модели в сравнении с GridSearchCV.
Таким образом, RandomizedSearchCV лучше использовать при большом количестве параметров из-за времени.
GridSearchCV проводит более детальный и качественный анализ.