# Sklearn

## sklearn.model_selection (ранее sklearn.cross_validation)

документация: http://scikit-learn.org/stable/modules/cross_validation.html

In [1]:
import numpy as np
from sklearn import datasets, model_selection

# model_selection is used throughout this notebook in place of the legacy
# cross_validation module that was imported here previously.
# Seed NumPy's global RNG once, up front (intended to make the randomized
# splits below repeatable).
np.random.seed(42)

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [2]:
# Load the Iris dataset; later cells read its `data` and `target` attributes.
iris = datasets.load_iris()

In [3]:
# Single random split of features and labels: 30% of the samples are held
# out for testing, the rest is kept for training.
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
)

In [4]:
# Sanity check: the test split should make up 0.3 of the whole dataset.
# One operand is cast to float so the division is not truncated to an integer.
len(test_labels) / float(len(iris.data))

0.3

In [5]:
# Report how many objects ended up in each split.
# print() is called with a single pre-formatted string so that this cell runs,
# and prints the same text, on both Python 2 and Python 3.
print('Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(
    len(train_data), len(test_data)))

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [6]:
# Preview the first five rows of the training and test feature matrices.
# Each array is formatted into the string (rather than passed as a second
# print argument) so the output has no separator space after the newline and
# is the same on Python 2 and Python 3.
print('Обучающая выборка:\n{}'.format(train_data[:5]))
print('\n')
print('Тестовая выборка:\n{}'.format(test_data[:5]))

Обучающая выборка:
[[ 5.5  2.4  3.7  1. ]
 [ 6.3  2.8  5.1  1.5]
 [ 6.4  3.1  5.5  1.8]
 [ 6.6  3.   4.4  1.4]
 [ 7.2  3.6  6.1  2.5]]


Тестовая выборка:
[[ 6.1  2.8  4.7  1.2]
 [ 5.7  3.8  1.7  0.3]
 [ 7.7  2.6  6.9  2.3]
 [ 6.   2.9  4.5  1.5]
 [ 6.8  2.8  4.8  1.4]]


In [7]:
# Show the class labels that landed in the training and test splits.
# Each array is formatted into the string (rather than passed as a second
# print argument) so the output has no separator space after the newline and
# is the same on Python 2 and Python 3.
print('Метки классов на обучающей выборке:\n{}'.format(train_labels))
print('\n')
print('Метки классов на тестовой выборке:\n{}'.format(test_labels))

Метки классов на обучающей выборке:
[1 2 2 1 2 1 2 1 0 2 1 0 0 0 1 2 0 0 0 1 0 1 2 0 1 2 0 2 2 1 1 2 1 0 1 2 0
 0 1 1 0 2 0 0 1 1 2 1 2 2 1 0 0 2 2 0 0 0 1 2 0 2 2 0 1 1 2 1 2 0 2 1 2 1
 1 1 0 1 1 0 1 2 2 0 1 2 2 0 2 0 1 2 2 1 2 1 1 2 2 0 1 2 0 1 2]


Метки классов на тестовой выборке:
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]


### Стратегии проведения кросс-валидации

#### KFold

In [8]:
# KFold with default settings (no shuffling): 10 samples, 5 folds.
# Each iteration yields the train indices and the test indices of one fold.
kf = model_selection.KFold(n_splits=5)
for train_indices, test_indices in kf.split(range(10)):
    # Single formatted argument keeps the output identical on Python 2 and 3.
    print('{} {}'.format(train_indices, test_indices))

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [9]:
# KFold with shuffling and no fixed random_state: re-running this cell
# yields a different split each time.
kf = model_selection.KFold(n_splits=5, shuffle=True)
for train_indices, test_indices in kf.split(range(10)):
    # Single formatted argument keeps the output identical on Python 2 and 3.
    print('{} {}'.format(train_indices, test_indices))

[0 2 3 4 6 7 8 9] [1 5]
[1 2 4 5 6 7 8 9] [0 3]
[0 1 3 4 5 6 8 9] [2 7]
[0 1 2 3 5 6 7 9] [4 8]
[0 1 2 3 4 5 7 8] [6 9]


In [10]:
# KFold with shuffling and a fixed random_state: re-running this cell always
# yields the same split, i.e. the result is deterministic.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
for train_indices, test_indices in kf.split(range(10)):
    # Single formatted argument keeps the output identical on Python 2 and 3.
    print('{} {}'.format(train_indices, test_indices))

[0 2 3 4 5 6 7 9] [1 8]
[1 2 3 4 6 7 8 9] [0 5]
[0 1 3 4 5 6 8 9] [2 7]
[0 1 2 3 5 6 7 8] [4 9]
[0 1 2 4 5 7 8 9] [3 6]


#### StratifiedKFold

In [11]:
# StratifiedKFold on a two-class target whose labels come in two contiguous
# runs (five 0s followed by five 1s). The target array is passed both as the
# data and as the labels, since only the labels matter for this illustration.
target = np.array([0] * 5 + [1] * 5)
print(target)
skf = model_selection.StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for train_indices, test_indices in skf.split(target, target):
    # Single formatted argument keeps the output identical on Python 2 and 3.
    print('{} {}'.format(train_indices, test_indices))

[0 0 0 0 0 1 1 1 1 1]
[3 4 8 9] [0 1 2 5 6 7]
[0 1 2 5 6 7] [3 4 8 9]


In [12]:
# StratifiedKFold on a two-class target whose labels alternate (0, 1, 0, 1, ...).
# No random_state is fixed here. The target array is passed both as the data
# and as the labels, since only the labels matter for this illustration.
target = np.array([0, 1] * 5)
print(target)
skf = model_selection.StratifiedKFold(n_splits=2, shuffle=True)
for train_indices, test_indices in skf.split(target, target):
    # Single formatted argument keeps the output identical on Python 2 and 3.
    print('{} {}'.format(train_indices, test_indices))

[0 1 0 1 0 1 0 1 0 1]
[3 4 8 9] [0 1 2 5 6 7]
[0 1 2 5 6 7] [3 4 8 9]


#### ShuffleSplit

In [13]:
# ShuffleSplit: 10 independent random splits of 10 samples, each holding out
# a 0.2 share of the samples as the test part.
ss = model_selection.ShuffleSplit(n_splits=10, test_size=0.2)
for train_indices, test_indices in ss.split(range(10)):
    # Single formatted argument keeps the output identical on Python 2 and 3.
    print('{} {}'.format(train_indices, test_indices))

[1 9 7 6 3 5 4 8] [0 2]
[7 4 8 5 0 3 6 9] [1 2]
[1 6 5 2 0 4 3 8] [7 9]
[4 6 9 8 3 1 2 0] [5 7]
[7 9 2 1 5 8 3 0] [6 4]
[4 3 5 1 8 2 0 7] [9 6]
[9 7 1 5 2 8 0 4] [3 6]
[4 0 9 2 8 3 6 5] [7 1]
[0 8 2 5 1 3 4 7] [6 9]
[1 9 6 8 7 3 2 5] [4 0]


#### StratifiedShuffleSplit

In [14]:
# StratifiedShuffleSplit: 4 random splits of a two-class target (five 0s and
# five 1s), each holding out a 0.2 share of the samples as the test part.
# The target array is passed both as the data and as the labels.
target = np.array([0] * 5 + [1] * 5)
print(target)
sss = model_selection.StratifiedShuffleSplit(n_splits=4, test_size=0.2)
for train_indices, test_indices in sss.split(target, target):
    # Single formatted argument keeps the output identical on Python 2 and 3.
    print('{} {}'.format(train_indices, test_indices))

[0 0 0 0 0 1 1 1 1 1]
[6 1 8 7 3 4 0 9] [5 2]
[7 3 9 8 5 4 0 2] [6 1]
[0 6 7 8 1 3 2 9] [4 5]
[2 1 4 6 9 7 8 0] [5 3]


#### Leave-One-Out

In [15]:
# Leave-One-Out over 10 samples: every iteration yields the train indices and
# the index array for the held-out test part.
loo = model_selection.LeaveOneOut()
for train_indices, test_index in loo.split(range(10)):
    # Single formatted argument keeps the output identical on Python 2 and 3.
    print('{} {}'.format(train_indices, test_index))

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators

# Все:)