"""This module implements a loader and dumper for the svmlight format
This format is a text-based format, with one sample per line. It does
not store zero valued features hence is suitable for sparse dataset.
The first element of each line can be used to store a target variable to
predict.
This format is used as the default format for both svmlight and the
libsvm command line programs.
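
A line in this format looks like (the qid field is optional)::

    <label> qid:<integer> <index1>:<value1> <index2>:<value2> ...

For example, ``1 qid:3 4:0.5 7:2.0`` encodes a sample with label 1,
query id 3, and non-zero values 0.5 and 2.0 at features 4 and 7.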
"""
# Authors: Mathieu Blondel <mathieu@mblondel.org>
# Lars Buitinck
# Olivier Grisel <olivier.grisel@ensta.org>
# License: BSD 3 clause
from contextlib import closing
import io
import os.path
import numpy as np
import scipy.sparse as sp
from ._svmlight_format import _load_svmlight_file
from .. import __version__
from ..externals import six
from ..externals.six import u, b
from ..externals.six.moves import range, zip
from ..utils import check_array


def load_svmlight_file(f, n_features=None, dtype=np.float64,
                       multilabel=False, zero_based="auto", query_id=False,
                       offset=0, length=-1):
"""Load datasets in the svmlight / libsvm format into sparse CSR matrix
This format is a text-based format, with one sample per line. It does
not store zero valued features hence is suitable for sparse dataset.
The first element of each line can be used to store a target variable
to predict.
This format is used as the default format for both svmlight and the
libsvm command line programs.
Parsing a text based source can be expensive. When working on
repeatedly on the same dataset, it is recommended to wrap this
loader with joblib.Memory.cache to store a memmapped backup of the
CSR results of the first call and benefit from the near instantaneous
loading of memmapped structures for the subsequent calls.
In case the file contains a pairwise preference constraint (known
as "qid" in the svmlight format) these are ignored unless the
query_id parameter is set to True. These pairwise preference
constraints can be used to constraint the combination of samples
when using pairwise loss functions (as is the case in some
learning to rank problems) so that only pairs with the same
query_id value are considered.
This implementation is written in Cython and is reasonably fast.
However, a faster API-compatible loader is also available at:
https://github.com/mblondel/svmlight-loader
Parameters
----------
f : {str, file-like, int}
(Path to) a file to load. If a path ends in ".gz" or ".bz2", it will
be uncompressed on the fly. If an integer is passed, it is assumed to
be a file descriptor. A file-like or file descriptor will not be closed
by this function. A file-like object must be opened in binary mode.
n_features : int or None
The number of features to use. If None, it will be inferred. This
argument is useful to load several files that are subsets of a
bigger sliced dataset: each subset might not have examples of
every feature, hence the inferred shape might vary from one
slice to another.
n_features is only required if ``offset`` or ``length`` are passed a
non-default value.
dtype : numpy data type, default np.float64
Data type of dataset to be loaded. This will be the data type of the
output numpy arrays ``X`` and ``y``.
multilabel : boolean, optional, default False
Samples may have several labels each (see
http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)
zero_based : boolean or "auto", optional, default "auto"
Whether column indices in f are zero-based (True) or one-based
(False). If column indices are one-based, they are transformed to
zero-based to match Python/NumPy conventions.
If set to "auto", a heuristic check is applied to determine this from
the file contents. Both kinds of files occur "in the wild", but they
are unfortunately not self-identifying. Using "auto" or True should
always be safe when no ``offset`` or ``length`` is passed.
If ``offset`` or ``length`` are passed, the "auto" mode falls back
to ``zero_based=True`` to avoid having the heuristic check yield
inconsistent results on different segments of the file.
query_id : boolean, default False
If True, will return the query_id array for each file.
offset : integer, optional, default 0
Ignore the offset first bytes by seeking forward, then
discarding the following bytes up until the next new line
character.
length : integer, optional, default -1
If strictly positive, stop reading any new line of data once the
position in the file has reached the (offset + length) bytes threshold.

    Returns
    -------
    X : scipy.sparse matrix of shape (n_samples, n_features)

    y : ndarray of shape (n_samples,), or a list of tuples of length
        n_samples in the multilabel case.

    query_id : array of shape (n_samples,)
        query_id for each sample. Only returned when query_id is set to
        True.

See also
--------
load_svmlight_files: similar function for loading multiple files in this
format, enforcing the same number of features/columns on all of them.

    Examples
    --------
    To use joblib.Memory to cache the svmlight file::

        from sklearn.externals.joblib import Memory
        from sklearn.datasets import load_svmlight_file
        mem = Memory("./mycache")

        @mem.cache
        def get_data():
            data = load_svmlight_file("mysvmlightfile")
            return data[0], data[1]

        X, y = get_data()
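
    To read only a byte range of a large file (e.g. from parallel
    workers), pass ``offset`` and ``length`` together with an explicit
    ``n_features``; this is a sketch, and the file name and feature
    count are hypothetical::

        X_chunk, y_chunk = load_svmlight_file(
            "mysvmlightfile", n_features=100,
            offset=2 ** 20, length=2 ** 20)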
"""
return tuple(load_svmlight_files([f], n_features, dtype, multilabel,
zero_based, query_id, offset, length))


def _gen_open(f):
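    # Open a path for binary reading, decompressing ".gz" / ".bz2" on the
    # fly; integers are treated as file descriptors (and left open, since
    # closefd=False).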
if isinstance(f, int): # file descriptor
return io.open(f, "rb", closefd=False)
elif not isinstance(f, six.string_types):
raise TypeError("expected {str, int, file-like}, got %s" % type(f))
_, ext = os.path.splitext(f)
if ext == ".gz":
import gzip
return gzip.open(f, "rb")
elif ext == ".bz2":
from bz2 import BZ2File
return BZ2File(f, "rb")
else:
return open(f, "rb")


def _open_and_load(f, dtype, multilabel, zero_based, query_id,
                   offset=0, length=-1):
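    # File-like objects are handed to the Cython parser as-is; paths and
    # file descriptors are wrapped by _gen_open first, and the wrapper is
    # closed once parsing is done.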
if hasattr(f, "read"):
actual_dtype, data, ind, indptr, labels, query = \
_load_svmlight_file(f, dtype, multilabel, zero_based, query_id,
offset, length)
# XXX remove closing when Python 2.7+/3.1+ required
else:
with closing(_gen_open(f)) as f:
actual_dtype, data, ind, indptr, labels, query = \
_load_svmlight_file(f, dtype, multilabel, zero_based, query_id,
offset, length)
# convert from array.array, give data the right dtype
if not multilabel:
labels = np.frombuffer(labels, np.float64)
data = np.frombuffer(data, actual_dtype)
indices = np.frombuffer(ind, np.intc)
indptr = np.frombuffer(indptr, dtype=np.intc) # never empty
query = np.frombuffer(query, np.int64)
data = np.asarray(data, dtype=dtype) # no-op for float{32,64}
return data, indices, indptr, labels, query


def load_svmlight_files(files, n_features=None, dtype=np.float64,
                        multilabel=False, zero_based="auto", query_id=False,
                        offset=0, length=-1):
"""Load dataset from multiple files in SVMlight format
This function is equivalent to mapping load_svmlight_file over a list of
files, except that the results are concatenated into a single, flat list
and the samples vectors are constrained to all have the same number of
features.
In case the file contains a pairwise preference constraint (known
as "qid" in the svmlight format) these are ignored unless the
query_id parameter is set to True. These pairwise preference
constraints can be used to constraint the combination of samples
when using pairwise loss functions (as is the case in some
learning to rank problems) so that only pairs with the same
query_id value are considered.
Parameters
----------
files : iterable over {str, file-like, int}
(Paths of) files to load. If a path ends in ".gz" or ".bz2", it will
be uncompressed on the fly. If an integer is passed, it is assumed to
be a file descriptor. File-likes and file descriptors will not be
closed by this function. File-like objects must be opened in binary
mode.
n_features : int or None
The number of features to use. If None, it will be inferred from the
maximum column index occurring in any of the files.
This can be set to a higher value than the actual number of features
in any of the input files, but setting it to a lower value will cause
an exception to be raised.
dtype : numpy data type, default np.float64
Data type of dataset to be loaded. This will be the data type of the
output numpy arrays ``X`` and ``y``.
multilabel : boolean, optional
Samples may have several labels each (see
http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)
zero_based : boolean or "auto", optional
Whether column indices in f are zero-based (True) or one-based
(False). If column indices are one-based, they are transformed to
zero-based to match Python/NumPy conventions.
If set to "auto", a heuristic check is applied to determine this from
the file contents. Both kinds of files occur "in the wild", but they
are unfortunately not self-identifying. Using "auto" or True should
always be safe when no offset or length is passed.
If offset or length are passed, the "auto" mode falls back
to zero_based=True to avoid having the heuristic check yield
inconsistent results on different segments of the file.
query_id : boolean, defaults to False
If True, will return the query_id array for each file.
offset : integer, optional, default 0
Ignore the offset first bytes by seeking forward, then
discarding the following bytes up until the next new line
character.
length : integer, optional, default -1
If strictly positive, stop reading any new line of data once the
position in the file has reached the (offset + length) bytes threshold.

    Returns
    -------
    [X1, y1, ..., Xn, yn]
    where each (Xi, yi) pair is the result from load_svmlight_file(files[i]).

    If query_id is set to True, this will return instead [X1, y1, q1,
    ..., Xn, yn, qn] where (Xi, yi, qi) is the result from
    load_svmlight_file(files[i]).

Notes
-----
When fitting a model to a matrix X_train and evaluating it against a
matrix X_test, it is essential that X_train and X_test have the same
number of features (X_train.shape[1] == X_test.shape[1]). This may not
be the case if you load the files individually with load_svmlight_file.

    See also
    --------
    load_svmlight_file
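
    Examples
    --------
    A minimal usage sketch; the file names here are hypothetical::

        X_train, y_train, X_test, y_test = load_svmlight_files(
            ("train.svm", "test.svm"))
        # Both matrices share the same column count, so a model fit on
        # X_train can be evaluated on X_test.
        assert X_train.shape[1] == X_test.shape[1]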
"""
if (offset != 0 or length > 0) and zero_based == "auto":
# disable heuristic search to avoid getting inconsistent results on
# different segments of the file
zero_based = True
if (offset != 0 or length > 0) and n_features is None:
raise ValueError(
"n_features is required when offset or length is specified.")
r = [_open_and_load(f, dtype, multilabel, bool(zero_based), bool(query_id),
offset=offset, length=length)
for f in files]
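    # Shift to zero-based indices when the files are one-based, either
    # explicitly or as detected by the "auto" heuristic (every file has a
    # strictly positive minimum column index).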
    if (zero_based is False or
            (zero_based == "auto" and all(len(tmp[1]) and np.min(tmp[1]) > 0
                                          for tmp in r))):
for _, indices, _, _, _ in r:
indices -= 1
n_f = max(ind[1].max() if len(ind[1]) else 0 for ind in r) + 1
if n_features is None:
n_features = n_f
elif n_features < n_f:
raise ValueError("n_features was set to {},"
" but input file contains {} features"
.format(n_features, n_f))
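    # Build one CSR matrix per input file; all matrices share the same
    # n_features so their columns line up.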
result = []
for data, indices, indptr, y, query_values in r:
shape = (indptr.shape[0] - 1, n_features)
X = sp.csr_matrix((data, indices, indptr), shape)
X.sort_indices()
result += X, y
if query_id:
result.append(query_values)
return result


def _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id):
X_is_sp = int(hasattr(X, "tocsr"))
y_is_sp = int(hasattr(y, "tocsr"))
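    # Choose printf-style patterns: plain integers for integral dtypes,
    # otherwise 16 significant digits.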
if X.dtype.kind == 'i':
value_pattern = u("%d:%d")
else:
value_pattern = u("%d:%.16g")
if y.dtype.kind == 'i':
label_pattern = u("%d")
else:
label_pattern = u("%.16g")
line_pattern = u("%s")
if query_id is not None:
line_pattern += u(" qid:%d")
line_pattern += u(" %s\n")
if comment:
f.write(b("# Generated by dump_svmlight_file from scikit-learn %s\n"
% __version__))
f.write(b("# Column indices are %s-based\n"
% ["zero", "one"][one_based]))
f.write(b("#\n"))
f.writelines(b("# %s\n" % line) for line in comment.splitlines())
for i in range(X.shape[0]):
if X_is_sp:
span = slice(X.indptr[i], X.indptr[i + 1])
row = zip(X.indices[span], X.data[span])
else:
nz = X[i] != 0
row = zip(np.where(nz)[0], X[i, nz])
s = " ".join(value_pattern % (j + one_based, x) for j, x in row)
if multilabel:
if y_is_sp:
nz_labels = y[i].nonzero()[1]
else:
nz_labels = np.where(y[i] != 0)[0]
labels_str = ",".join(label_pattern % j for j in nz_labels)
else:
if y_is_sp:
labels_str = label_pattern % y.data[i]
else:
labels_str = label_pattern % y[i]
if query_id is not None:
feat = (labels_str, query_id[i], s)
else:
feat = (labels_str, s)
f.write((line_pattern % feat).encode('ascii'))


def dump_svmlight_file(X, y, f, zero_based=True, comment=None, query_id=None,
                       multilabel=False):
"""Dump the dataset in svmlight / libsvm file format.
This format is a text-based format, with one sample per line. It does
not store zero valued features hence is suitable for sparse dataset.
The first element of each line can be used to store a target variable
to predict.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
    y : {array-like, sparse matrix}, shape = [n_samples (, n_labels)]
        Target values. Class labels must be an integer or float, or
        array-like objects of integers or floats for multilabel
        classification.
f : string or file-like in binary mode
If string, specifies the path that will contain the data.
If file-like, data will be written to f. f should be opened in binary
mode.
zero_based : boolean, optional
Whether column indices should be written zero-based (True) or one-based
(False).
comment : string, optional
Comment to insert at the top of the file. This should be either a
Unicode string, which will be encoded as UTF-8, or an ASCII byte
string.
If a comment is given, then it will be preceded by one that identifies
the file as having been dumped by scikit-learn. Note that not all
tools grok comments in SVMlight files.
query_id : array-like, shape = [n_samples]
Array containing pairwise preference constraints (qid in svmlight
format).
    multilabel : boolean, optional
        Samples may have several labels each (see
        http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)

        .. versionadded:: 0.17
           parameter *multilabel* to support multilabel datasets.
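
    Examples
    --------
    A minimal round-trip sketch; the file name is hypothetical::

        import numpy as np
        from sklearn.datasets import dump_svmlight_file, load_svmlight_file

        X = np.array([[0., 1.], [2., 0.]])
        y = np.array([0, 1])
        dump_svmlight_file(X, y, "data.svm", zero_based=True)
        X2, y2 = load_svmlight_file("data.svm", zero_based=True)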
"""
if comment is not None:
# Convert comment string to list of lines in UTF-8.
# If a byte string is passed, then check whether it's ASCII;
# if a user wants to get fancy, they'll have to decode themselves.
# Avoid mention of str and unicode types for Python 3.x compat.
if isinstance(comment, bytes):
comment.decode("ascii") # just for the exception
else:
comment = comment.encode("utf-8")
if six.b("\0") in comment:
raise ValueError("comment string contains NUL byte")
yval = check_array(y, accept_sparse='csr', ensure_2d=False)
if sp.issparse(yval):
if yval.shape[1] != 1 and not multilabel:
raise ValueError("expected y of shape (n_samples, 1),"
" got %r" % (yval.shape,))
else:
if yval.ndim != 1 and not multilabel:
raise ValueError("expected y of shape (n_samples,), got %r"
% (yval.shape,))
Xval = check_array(X, accept_sparse='csr')
if Xval.shape[0] != yval.shape[0]:
raise ValueError("X.shape[0] and y.shape[0] should be the same, got"
" %r and %r instead." % (Xval.shape[0], yval.shape[0]))
# We had some issues with CSR matrices with unsorted indices (e.g. #1501),
# so sort them here, but first make sure we don't modify the user's X.
# TODO We can do this cheaper; sorted_indices copies the whole matrix.
if yval is y and hasattr(yval, "sorted_indices"):
y = yval.sorted_indices()
else:
y = yval
if hasattr(y, "sort_indices"):
y.sort_indices()
if Xval is X and hasattr(Xval, "sorted_indices"):
X = Xval.sorted_indices()
else:
X = Xval
if hasattr(X, "sort_indices"):
X.sort_indices()
if query_id is not None:
query_id = np.asarray(query_id)
if query_id.shape[0] != y.shape[0]:
raise ValueError("expected query_id of shape (n_samples,), got %r"
% (query_id.shape,))
one_based = not zero_based
if hasattr(f, "write"):
_dump_svmlight(X, y, f, multilabel, one_based, comment, query_id)
else:
with open(f, "wb") as f:
_dump_svmlight(X, y, f, multilabel, one_based, comment, query_id)