# Sklearn

## sklearn.model_selection

документация: http://scikit-learn.org/stable/modules/cross_validation.html

In [1]:
from sklearn import model_selection, datasets

import numpy as np

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [2]:
iris = datasets.load_iris()

In [3]:
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(iris.data, iris.target, 
                                                                                     test_size = 0.3)

In [4]:
#убедимся, что тестовая выборка действительно составляет 0.3 от всех данных
float(len(test_labels))/len(iris.data)

0.3

In [5]:
print('Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(len(train_data),
                                                                                            len(test_data)))

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [6]:
print('Обучающая выборка:\n', train_data[:5])
print('\n')
print('Тестовая выборка:\n', test_data[:5])

Обучающая выборка:
 [[ 5.2  4.1  1.5  0.1]
 [ 5.4  3.4  1.5  0.4]
 [ 5.   2.   3.5  1. ]
 [ 6.7  3.3  5.7  2.5]
 [ 5.8  4.   1.2  0.2]]


Тестовая выборка:
 [[ 7.4  2.8  6.1  1.9]
 [ 5.7  3.8  1.7  0.3]
 [ 6.   2.9  4.5  1.5]
 [ 7.9  3.8  6.4  2. ]
 [ 5.1  2.5  3.   1.1]]


In [7]:
print('Метки классов на обучающей выборке:\n', train_labels)
print('\n')
print('Метки классов на тестовой выборке:\n', test_labels)

Метки классов на обучающей выборке:
 [0 0 1 2 0 1 0 1 0 1 0 0 0 2 0 1 0 1 0 0 0 1 2 2 1 2 2 2 1 0 1 0 2 0 1 1 0
 0 2 1 0 0 2 0 1 0 2 0 0 2 2 1 0 0 1 1 1 1 1 2 1 2 1 2 2 1 1 0 2 0 2 1 1 2
 2 2 1 0 2 1 1 2 2 1 1 1 2 2 2 2 2 1 2 2 1 0 0 0 0 1 0 2 0 2 2]


Метки классов на тестовой выборке:
 [2 0 1 2 1 2 1 1 2 0 1 0 2 0 1 1 1 0 2 1 2 1 2 2 0 2 1 1 0 1 1 1 2 2 2 0 2
 0 0 0 0 0 2 0 0]


### Стратегии проведения кросс-валидации

In [10]:
#сгенерируем короткое подобие датасета, где элементы совпадают с порядковым номером
X = range(0,10)

#### KFold

In [12]:
kf = model_selection.KFold(n_splits = 5)
for train_indices, test_indices in kf.split(X):
    print(train_indices, test_indices)

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [22]:
kf = model_selection.KFold(n_splits = 2, shuffle = True)
for train_indices, test_indices in kf.split(X):
    print(train_indices, test_indices)

[4 5 6 7 9] [0 1 2 3 8]
[0 1 2 3 8] [4 5 6 7 9]


In [23]:
kf = model_selection.KFold(n_splits = 2, shuffle = True, random_state = 1)
for train_indices, test_indices in kf.split(X):
    print(train_indices, test_indices)

[1 3 5 7 8] [0 2 4 6 9]
[0 2 4 6 9] [1 3 5 7 8]


#### StratifiedKFold

In [33]:
y = np.array([0] * 5 + [1] * 5)
print(y)

skf = model_selection.StratifiedKFold(n_splits = 2, shuffle = True, random_state = 0)
for train_indices, test_indices in skf.split(X, y):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[3 4 8 9] [0 1 2 5 6 7]
[0 1 2 5 6 7] [3 4 8 9]


In [28]:
target = np.array([0, 1] * 5)
print(target)

skf = model_selection.StratifiedKFold(n_splits = 2,shuffle = True)
for train_indices, test_indices in skf.split(X, target):
    print(train_indices, test_indices)

[0 1 0 1 0 1 0 1 0 1]
[2 3 7 8] [0 1 4 5 6 9]
[0 1 4 5 6 9] [2 3 7 8]


#### ShuffleSplit

In [34]:
ss = model_selection.ShuffleSplit(n_splits = 10, test_size = 0.2)

for train_indices, test_indices in ss.split(X):
    print(train_indices, test_indices)

[8 1 6 5 0 3 4 9] [2 7]
[1 2 8 3 6 5 7 0] [4 9]
[4 2 1 7 3 5 9 8] [0 6]
[9 5 8 3 1 0 4 6] [7 2]
[4 7 9 0 3 8 1 5] [2 6]
[5 7 8 1 6 3 0 4] [9 2]
[7 8 4 5 6 3 0 9] [2 1]
[0 6 9 7 5 2 4 3] [1 8]
[5 6 2 0 8 1 9 7] [4 3]
[0 1 3 2 4 9 7 8] [6 5]


#### StratifiedShuffleSplit

In [36]:
target = np.array([0] * 5 + [1] * 5)
print(target)

sss = model_selection.StratifiedShuffleSplit(n_splits = 4, test_size = 0.2)
for train_indices, test_indices in sss.split(X, target):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[7 8 3 6 2 1 4 9] [0 5]
[9 1 4 3 6 7 5 2] [8 0]
[6 0 5 1 8 4 2 7] [9 3]
[7 5 8 4 3 9 0 1] [2 6]


#### Leave-One-Out

In [37]:
loo = model_selection.LeaveOneOut()

for train_indices, test_index in loo.split(X):
    print(train_indices, test_index)

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators

In [2]:
#сгенерируем короткое подобие датасета, где элементы совпадают с порядковым номером
X = range(0,50)

In [9]:
ss = model_selection.ShuffleSplit(n_splits = 10, test_size = 10)

for train_indices, test_indices in ss.split(X):
    print(train_indices, test_indices)

[24  4  1 12 43  0 38 14 33 26  8 23 15 17  5 11 29 44 22  3 39 10 31 13 19
  9 34 28 46 37 48 40 27 47  6 25 49 36 32 41] [18 21  2 45 30 42 20 35 16  7]
[ 1 26 49  9 48 20 34 29 32  7 35 27 33 47  0  4 46 28  2 25 12 23 22 10 39
 45 31  5 18 43 19 21 42 15  3 38 37 40 36 24] [17 44 41  8  6 13 16 30 11 14]
[ 1 30  4 26 11 41 45 25 27 14 16 42 22 31  0  6  9 38 28 12 35 43 29 48 34
 32 39 10 20 40  8 47 21 13 23 44 33  5 17 19] [ 2 46 24 37 15  3 49 18 36  7]
[24 23  0 37 12 25  4 38 15 46 31 26 40 44 33 27 21 42 39 43 22 45 35  9 30
 16 20 19 49  3 28 11 14  8  6  1  2 17 29 34] [41 10  7 47 18 13 32 48  5 36]
[38 16 37 48 45 14  0  4 32 10 30 43 44 28 24 27  3  8 31 18  6 17 41 33 34
  9 13 39 25 42 40  2  5 22 35 49 15 19  7 47] [11 23 36 46 20  1 29 26 21 12]
[27 29 41 14 30 48  1 47 31 44 11 20 22  5 28  7 25 32 18  6 15  9 24 36 42
 23 19 37  0  8 49 16 38 12 33 43 34 10  3 39] [45 35 13 46 26 40  4 17  2 21]
[36 33  8 45 34 23  7 46 24 42 25 47 30  4 35 26 13 31 17 40  0  3 22 

In [10]:
train_indices.shape

(40,)

In [5]:
ss

ShuffleSplit(n_splits=10, random_state=None, test_size=0.2, train_size=None)