/
nested_cv.py
executable file
·145 lines (123 loc) · 5.08 KB
/
nested_cv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python
# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the PyMVPA package for the
# copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""
Nested Cross-Validation
=======================
.. index:: model selection, cross-validation
Often it is desired to explore multiple models (classifiers,
parameterizations) but it becomes an easy trap for introducing an
optimistic bias into generalization estimate. The easiest but
computationally intensive solution to overcome such a bias is to carry
model selection by estimating the same (or different) performance
characteristic while operating only on training data. If such
performance is a cross-validation, then it leads to the so called
"nested cross-validation" procedure.
This example will demonstrate on how to implement such nested
cross-validation while selecting the best performing classifier from
the warehouse of available within PyMVPA.
"""
from mvpa2.suite import *
# increase verbosity a bit for now
verbose.level = 3
# pre-seed RNG if you want to investigate the effects, thus
# needing reproducible results
#mvpa2.seed(3)
"""
For this simple example lets generate some fresh random data with 2
relevant features and low SNR.
"""
dataset = normal_feature_dataset(perlabel=24, nlabels=2, nchunks=3,
nonbogus_features=[0, 1],
nfeatures=100, snr=3.0)
"""
For the demonstration of model selection benefit, lets first compute
cross-validated error using simple and popular kNN.
"""
clf_sample = kNN()
cv_sample = CrossValidation(clf_sample, NFoldPartitioner())
verbose(1, "Estimating error using a sample classifier")
error_sample = np.mean(cv_sample(dataset))
"""
For the convenience lets define a helpful function which we will use
twice -- once within cross-validation, and once on the whole dataset
"""
def select_best_clf(dataset_, clfs):
"""Select best model according to CVTE
Helper function which we will use twice -- once for proper nested
cross-validation, and once to see how big an optimistic bias due
to model selection could be if we simply provide an entire dataset.
Parameters
----------
dataset_ : Dataset
clfs : list of Classifiers
Which classifiers to explore
Returns
-------
best_clf, best_error
"""
best_error = None
for clf in clfs:
cv = CrossValidation(clf, NFoldPartitioner())
# unfortunately we don't have ability to reassign clf atm
# cv.transerror.clf = clf
try:
error = np.mean(cv(dataset_))
except LearnerError, e:
# skip the classifier if data was not appropriate and it
# failed to learn/predict at all
continue
if best_error is None or error < best_error:
best_clf = clf
best_error = error
verbose(4, "Classifier %s cv error=%.2f" % (clf.descr, error))
verbose(3, "Selected the best out of %i classifiers %s with error %.2f"
% (len(clfs), best_clf.descr, best_error))
return best_clf, best_error
"""
First lets select a classifier within cross-validation, thus
eliminating model-selection bias
"""
best_clfs = {}
confusion = ConfusionMatrix()
verbose(1, "Estimating error using nested CV for model selection")
partitioner = NFoldPartitioner()
splitter = Splitter('partitions')
for isplit, partitions in enumerate(partitioner.generate(dataset)):
verbose(2, "Processing split #%i" % isplit)
dstrain, dstest = list(splitter.generate(partitions))
best_clf, best_error = select_best_clf(dstrain, clfswh['!gnpp'])
best_clfs[best_clf.descr] = best_clfs.get(best_clf.descr, 0) + 1
# now that we have the best classifier, lets assess its transfer
# to the testing dataset while training on entire training
tm = TransferMeasure(best_clf, splitter,
postproc=BinaryFxNode(mean_mismatch_error,
space='targets'),
enable_ca=['stats'])
tm(partitions)
confusion += tm.ca.stats
"""
And for comparison, lets assess what would be the best performance if
we simply explore all available classifiers, providing all the data at
once
"""
verbose(1, "Estimating error via fishing expedition (best clf on entire dataset)")
cheating_clf, cheating_error = select_best_clf(dataset, clfswh['!gnpp'])
print """Errors:
sample classifier (kNN): %.2f
model selection within cross-validation: %.2f
model selection via fishing expedition: %.2f with %s
""" % (error_sample, 1 - confusion.stats['ACC'],
cheating_error, cheating_clf.descr)
print "# of times following classifiers were selected within " \
"nested cross-validation:"
for c, count in sorted(best_clfs.items(), key=lambda x:x[1], reverse=True):
print " %i times %s" % (count, c)
print "\nConfusion table for the nested cross-validation results:"
print confusion