forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bench_sgd_covertype.py
195 lines (161 loc) · 6.03 KB
/
bench_sgd_covertype.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
"""
================================
Covertype dataset with dense SGD
================================
Benchmark stochastic gradient descent (SGD), Liblinear, and Naive Bayes on
the forest covertype dataset of Blackard, Jock, and Dean [1]. The dataset
comprises 581,012 samples. It is low-dimensional with 54 features and a
sparsity of approx. 23%. Here, we consider the task of predicting class 1
(spruce/fir). The classification performance of SGD is competitive with
Liblinear while being two orders of magnitude faster to train::
[..]
Classification performance:
===========================
Classifier train-time test-time error-rate
--------------------------------------------
Liblinear 9.4471s 0.0184s 0.2305
GNB 2.5426s 0.1725s 0.3633
SGD 0.2137s 0.0047s 0.2300
The same task has been used in a number of papers including:
* `"SVM Optimization: Inverse Dependence on Training Set Size"
<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.139.2112>`_
S. Shalev-Shwartz, N. Srebro - In Proceedings of ICML '08.
* `"Pegasos: Primal estimated sub-gradient solver for svm"
<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.74.8513>`_
S. Shalev-Shwartz, Y. Singer, N. Srebro - In Proceedings of ICML '07.
* `"Training Linear SVMs in Linear Time"
<www.cs.cornell.edu/People/tj/publications/joachims_06a.pdf>`_
T. Joachims - In SIGKDD '06
[1] http://archive.ics.uci.edu/ml/datasets/Covertype
To run this example use your favorite python shell::
% ipython examples/sgd/covertype_dense_sgd.py
"""
from __future__ import division
print __doc__
# Author: Peter Prettenhoer <peter.prettenhofer@gmail.com>
# License: BSD Style.
# $Id$
from time import time
import os
import numpy as np
from scikits.learn.svm import LinearSVC
from scikits.learn.linear_model import SGDClassifier
from scikits.learn.naive_bayes import GNB
from scikits.learn import metrics
######################################################################
## Download the data, if not already on disk
if not os.path.exists('covtype.data.gz'):
# Download the data
import urllib
print "Downloading data, Please Wait (11MB)..."
opener = urllib.urlopen(
'http://archive.ics.uci.edu/ml/'
'machine-learning-databases/covtype/covtype.data.gz')
open('covtype.data.gz', 'wb').write(opener.read())
######################################################################
## Load dataset
print("Loading dataset...")
import gzip
f = gzip.open('covtype.data.gz')
X = np.fromstring(f.read().replace(",", " "), dtype=np.float64, sep=" ",
count=-1)
X = X.reshape((581012, 55))
f.close()
# class 1 vs. all others.
y = np.ones(X.shape[0]) * -1
y[np.where(X[:, -1] == 1)] = 1
X = X[:, :-1]
######################################################################
## Create train-test split (as [Joachims, 2006])
print("Creating train-test split...")
idx = np.arange(X.shape[0])
np.random.seed(13)
np.random.shuffle(idx)
train_idx = idx[:522911]
test_idx = idx[522911:]
X_train = X[train_idx]
y_train = y[train_idx]
X_test = X[test_idx]
y_test = y[test_idx]
# free memory
del X
del y
######################################################################
## Standardize first 10 features (the numerical ones)
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)
mean[10:] = 0.0
std[10:] = 1.0
X_train = (X_train-mean) / std
X_test = (X_test-mean) / std
######################################################################
## Print dataset statistics
print("")
print("Dataset statistics:")
print("===================")
print("%s %d" % ("number of features:".ljust(25),
X_train.shape[1]))
print("%s %d" % ("number of classes:".ljust(25),
np.unique(y_train).shape[0]))
print("%s %d (%d, %d)" % ("number of train samples:".ljust(25),
X_train.shape[0], np.sum(y_train==1),
np.sum(y_train==-1)))
print("%s %d (%d, %d)" % ("number of test samples:".ljust(25),
X_test.shape[0], np.sum(y_test==1),
np.sum(y_test==-1)))
print("")
print("Training classifiers...")
print("")
######################################################################
## Benchmark classifiers
def benchmark(clf):
t0 = time()
clf.fit(X_train, y_train)
train_time = time() - t0
t0 = time()
pred = clf.predict(X_test)
test_time = time() - t0
err = metrics.zero_one(y_test, pred) / float(pred.shape[0])
return err, train_time, test_time
######################################################################
## Train Liblinear model
liblinear_parameters = {
'loss': 'l2',
'penalty': 'l2',
'C': 1000,
'dual': False,
'tol': 1e-3,
}
liblinear_res = benchmark(LinearSVC(**liblinear_parameters))
liblinear_err, liblinear_train_time, liblinear_test_time = liblinear_res
######################################################################
## Train GNB model
gnb_err, gnb_train_time, gnb_test_time = benchmark(GNB())
######################################################################
## Train SGD model
sgd_parameters = {
'alpha': 0.001,
'n_iter': 2,
}
sgd_err, sgd_train_time, sgd_test_time = benchmark(SGDClassifier(
**sgd_parameters))
######################################################################
## Print classification performance
print("")
print("Classification performance:")
print("===========================")
print("")
def print_row(clf_type, train_time, test_time, err):
print("%s %s %s %s" % (clf_type.ljust(12),
("%.4fs" % train_time).center(10),
("%.4fs" % test_time).center(10),
("%.4f" % err).center(10)))
print("%s %s %s %s" % ("Classifier ", "train-time", "test-time",
"error-rate"))
print("-" * 44)
print_row("Liblinear", liblinear_train_time, liblinear_test_time,
liblinear_err)
print_row("GNB", gnb_train_time, gnb_test_time, gnb_err)
print_row("SGD", sgd_train_time, sgd_test_time, sgd_err)
print("")
print("")