# Sklearn

## sklearn.model_selection

Документация: https://scikit-learn.org/stable/modules/cross_validation.html

In [1]:
from sklearn import datasets, model_selection
import numpy as np

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [2]:
# Load the Iris dataset; `iris.data` and `iris.target` are what the split below consumes.
iris = datasets.load_iris()

In [3]:
# Hold out 30% of the samples for testing. The fixed random_state pins the
# shuffle, so the split (and every output derived from it) is identical after
# a kernel restart.
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=0,
)

In [4]:
# Sanity check: the test split should really make up 0.3 of all samples.
len(test_labels) / len(iris.data)

0.3

In [5]:
# Report how many objects ended up in the train and test parts.
split_sizes = (len(train_data), len(test_data))
print('Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(*split_sizes))

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [6]:
# Preview the first five feature rows of each part of the split.
for caption, part in (('Обучающая выборка:\n', train_data),
                      ('Тестовая выборка:\n', test_data)):
    print(caption, part[:5])

Обучающая выборка:
 [[5.  3.4 1.6 0.4]
 [6.5 3.2 5.1 2. ]
 [6.  2.9 4.5 1.5]
 [5.1 3.8 1.6 0.2]
 [7.7 2.6 6.9 2.3]]
Тестовая выборка:
 [[5.9 3.  4.2 1.5]
 [4.8 3.  1.4 0.1]
 [5.8 2.7 5.1 1.9]
 [5.4 3.9 1.3 0.4]
 [6.8 3.  5.5 2.1]]


In [7]:
# Show the class labels that landed in each part of the split.
train_caption = 'Метки классов на обучающей выборке:\n'
test_caption = 'Метки классов на тестовой выборке:\n'
print(train_caption, train_labels)
print(test_caption, test_labels)

Метки классов на обучающей выборке:
 [0 2 1 0 2 0 1 2 0 1 2 0 0 1 1 0 0 1 2 0 1 1 0 1 0 1 2 0 1 2 0 0 0 2 1 1 0
 0 1 2 2 1 2 0 1 2 1 1 0 0 0 1 0 2 0 0 0 1 2 1 0 1 2 1 2 2 2 2 0 2 0 1 2 1
 2 2 2 2 2 1 0 2 0 0 0 2 0 2 2 0 1 0 2 1 0 2 2 0 1 2 0 1 0 0 1]
Метки классов на тестовой выборке:
 [1 0 2 0 2 1 2 1 0 2 1 1 2 2 2 1 2 1 0 1 1 1 0 1 1 0 1 1 0 0 1 2 1 0 2 1 2
 0 2 1 2 1 2 2 2]


### Стратегии проведения кросс-валидации

#### KFold

In [8]:
# 5-fold split of ten sample indices; each iteration yields a
# (train indices, test indices) pair, printed side by side.
kf = model_selection.KFold(n_splits=5)
sample_ids = np.arange(10)

for fold in kf.split(sample_ids):
    print(*fold)

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [9]:
# Same ten indices, but cut into only two folds.
kf = model_selection.KFold(n_splits=2)
folds = kf.split(np.arange(10))

for fit_idx, eval_idx in folds:
    print(fit_idx, eval_idx)

[5 6 7 8 9] [0 1 2 3 4]
[0 1 2 3 4] [5 6 7 8 9]


In [10]:
# 5-fold split with shuffling: indices are permuted before being cut into
# folds. random_state pins the permutation so the printed folds are the same
# on every run instead of changing with each execution.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)

for train_indices, test_indices in kf.split(np.arange(10)):
    print(train_indices, test_indices)

[0 1 2 5 6 7 8 9] [3 4]
[0 2 3 4 5 7 8 9] [1 6]
[0 1 2 3 4 5 6 8] [7 9]
[0 1 3 4 6 7 8 9] [2 5]
[1 2 3 4 5 6 7 9] [0 8]


In [11]:
# Shuffled two-fold split with a fixed seed.
kfold_options = dict(n_splits=2, shuffle=True, random_state=1)
kf = model_selection.KFold(**kfold_options)

for fold in kf.split(np.arange(10)):
    print(*fold)

[1 3 5 7 8] [0 2 4 6 9]
[0 2 4 6 9] [1 3 5 7 8]


#### StratifiedKFold

In [12]:
# Two classes with four samples each, stored as two contiguous runs.
target = np.repeat([0, 1], 4)
print(target)

# Stratified two-fold split: the labels are passed along with the sample
# indices so the splitter can take class membership into account.
skf = model_selection.StratifiedKFold(n_splits=2)
sample_ids = np.arange(8)
for fold in skf.split(sample_ids, target):
    print(*fold)

[0 0 0 0 1 1 1 1]
[2 3 6 7] [0 1 4 5]
[0 1 4 5] [2 3 6 7]


In [13]:
# Same two classes, now interleaved: 0, 1, 0, 1, ...
target = np.tile([0, 1], 4)
print(target)

# Shuffled, seeded stratified two-fold split over the eight sample indices.
skf = model_selection.StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
stratified_folds = skf.split(np.arange(8), target)
for fit_idx, eval_idx in stratified_folds:
    print(fit_idx, eval_idx)

[0 1 0 1 0 1 0 1]
[0 2 3 7] [1 4 5 6]
[1 4 5 6] [0 2 3 7]


#### ShuffleSplit

In [14]:
# Ten seeded random splits of ten sample indices, each holding out 20% for
# testing. As the printed output shows, test sets may repeat indices between
# splits.
shuffle_options = dict(n_splits=10, test_size=0.2, random_state=0)
ss = model_selection.ShuffleSplit(**shuffle_options)

for split in ss.split(np.arange(10)):
    print(*split)

[4 9 1 6 7 3 0 5] [2 8]
[1 2 9 8 0 6 7 4] [3 5]
[8 4 5 1 0 6 9 7] [2 3]
[9 2 7 5 8 0 3 4] [6 1]
[7 4 1 0 6 8 9 3] [5 2]
[9 5 2 6 3 7 4 0] [1 8]
[1 7 8 6 5 4 9 0] [3 2]
[2 3 5 6 7 1 8 4] [0 9]
[9 1 4 5 3 6 0 2] [7 8]
[5 0 8 3 7 9 6 4] [1 2]


#### StratifiedShuffleSplit

In [15]:
# Interleaved labels for two equally sized classes.
target = np.tile([0, 1], 4)
print(target)

# Ten seeded random splits with a 20% test share; labels are supplied so the
# splitter can stratify by class.
sss = model_selection.StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
sample_ids = np.arange(8)
for fit_idx, eval_idx in sss.split(sample_ids, target):
    print(fit_idx, eval_idx)

[0 1 0 1 0 1 0 1]
[6 1 5 4 2 3] [7 0]
[4 3 6 1 7 2] [5 0]
[7 2 5 1 0 4] [6 3]
[3 5 4 7 0 6] [2 1]
[3 4 2 7 1 0] [5 6]
[3 2 4 5 1 6] [0 7]
[4 3 0 5 2 1] [6 7]
[0 6 4 7 1 3] [5 2]
[3 1 7 4 2 6] [0 5]
[5 3 6 4 2 7] [1 0]


#### Leave-One-Out

In [16]:
# Leave-One-Out over eight sample indices: print each (train, test) pair.
loo = model_selection.LeaveOneOut()
sample_ids = np.arange(8)

for split in loo.split(sample_ids):
    print(*split)

[1 2 3 4 5 6 7] [0]
[0 2 3 4 5 6 7] [1]
[0 1 3 4 5 6 7] [2]
[0 1 2 4 5 6 7] [3]
[0 1 2 3 5 6 7] [4]
[0 1 2 3 4 6 7] [5]
[0 1 2 3 4 5 7] [6]
[0 1 2 3 4 5 6] [7]


Больше стратегий проведения кросс-валидации доступно здесь: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators