forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mlcomp.py
111 lines (88 loc) · 3.99 KB
/
mlcomp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Copyright (c) 2010 Olivier Grisel <olivier.grisel@ensta.org>
# License: BSD 3 clause
"""Glue code to load http://mlcomp.org data as a scikit.learn dataset"""
import os
import numbers
from sklearn.datasets.base import load_files
from sklearn.utils import deprecated
def _load_document_classification(dataset_path, metadata, set_=None, **kwargs):
if set_ is not None:
dataset_path = os.path.join(dataset_path, set_)
return load_files(dataset_path, metadata.get('description'), **kwargs)
LOADERS = {
'DocumentClassification': _load_document_classification,
# TODO: implement the remaining domain formats
}
@deprecated("since the http://mlcomp.org/ website will shut down "
"in March 2017, the load_mlcomp function was deprecated "
"in version 0.19 and will be removed in 0.21.")
def load_mlcomp(name_or_id, set_="raw", mlcomp_root=None, **kwargs):
"""Load a datasets as downloaded from http://mlcomp.org
Parameters
----------
name_or_id : the integer id or the string name metadata of the MLComp
dataset to load
set_ : select the portion to load: 'train', 'test' or 'raw'
mlcomp_root : the filesystem path to the root folder where MLComp datasets
are stored, if mlcomp_root is None, the MLCOMP_DATASETS_HOME
environment variable is looked up instead.
**kwargs : domain specific kwargs to be passed to the dataset loader.
Read more in the :ref:`User Guide <datasets>`.
Returns
-------
data : Bunch
Dictionary-like object, the interesting attributes are:
'filenames', the files holding the raw to learn, 'target', the
classification labels (integer index), 'target_names',
the meaning of the labels, and 'DESCR', the full description of the
dataset.
Note on the lookup process: depending on the type of name_or_id,
will choose between integer id lookup or metadata name lookup by
looking at the unzipped archives and metadata file.
TODO: implement zip dataset loading too
"""
if mlcomp_root is None:
try:
mlcomp_root = os.environ['MLCOMP_DATASETS_HOME']
except KeyError:
raise ValueError("MLCOMP_DATASETS_HOME env variable is undefined")
mlcomp_root = os.path.expanduser(mlcomp_root)
mlcomp_root = os.path.abspath(mlcomp_root)
mlcomp_root = os.path.normpath(mlcomp_root)
if not os.path.exists(mlcomp_root):
raise ValueError("Could not find folder: " + mlcomp_root)
# dataset lookup
if isinstance(name_or_id, numbers.Integral):
# id lookup
dataset_path = os.path.join(mlcomp_root, str(name_or_id))
else:
# assume name based lookup
dataset_path = None
expected_name_line = "name: " + name_or_id
for dataset in os.listdir(mlcomp_root):
metadata_file = os.path.join(mlcomp_root, dataset, 'metadata')
if not os.path.exists(metadata_file):
continue
with open(metadata_file) as f:
for line in f:
if line.strip() == expected_name_line:
dataset_path = os.path.join(mlcomp_root, dataset)
break
if dataset_path is None:
raise ValueError("Could not find dataset with metadata line: " +
expected_name_line)
# loading the dataset metadata
metadata = dict()
metadata_file = os.path.join(dataset_path, 'metadata')
if not os.path.exists(metadata_file):
raise ValueError(dataset_path + ' is not a valid MLComp dataset')
with open(metadata_file) as f:
for line in f:
if ":" in line:
key, value = line.split(":", 1)
metadata[key.strip()] = value.strip()
format = metadata.get('format', 'unknow')
loader = LOADERS.get(format)
if loader is None:
raise ValueError("No loader implemented for format: " + format)
return loader(dataset_path, metadata, set_=set_, **kwargs)