grid_search_text_feature_extraction.py
"""
==========================================================
Sample pipeline for text feature extraction and evaluation
==========================================================
The dataset used in this example is the 20 newsgroups dataset which will be
automatically downloaded and then cached and reused for the document
classification example.
You can adjust the number of categories by giving there name to the dataset
loader or setting them to None to get the 20 of them.
Here is a sample output of a run on a quad-core machine::

  Loading 20 newsgroups dataset for categories:
  ['alt.atheism', 'talk.religion.misc']
  1427 documents
  2 categories

  Performing grid search...
  pipeline: ['vect', 'tfidf', 'clf']
  parameters:
  {'clf__alpha': (1.0000000000000001e-05, 9.9999999999999995e-07),
   'clf__n_iter': (10, 50, 80),
   'clf__penalty': ('l2', 'elasticnet'),
   'tfidf__use_idf': (True, False),
   'vect__max_n': (1, 2),
   'vect__max_df': (0.5, 0.75, 1.0),
   'vect__max_features': (None, 5000, 10000, 50000)}
  done in 1737.030s

  Best score: 0.940
  Best parameters set:
      clf__alpha: 9.9999999999999995e-07
      clf__n_iter: 50
      clf__penalty: 'elasticnet'
      tfidf__use_idf: True
      vect__max_n: 2
      vect__max_df: 0.75
      vect__max_features: 50000
"""
print __doc__

# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Mathieu Blondel <mathieu@mblondel.org>
# License: Simplified BSD

from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

###############################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]
# Uncomment the following to do the analysis on all the categories
#categories = None
print "Loading 20 newsgroups dataset for categories:"
print categories
data = fetch_20newsgroups(subset='train', categories=categories)
print "%d documents" % len(data.filenames)
print "%d categories" % len(data.target_names)
print
###############################################################################
# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
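
# How the pieces fit together: CountVectorizer turns the raw documents into
# token-count vectors, TfidfTransformer rescales those counts, and
# SGDClassifier fits a linear model on the result. In the grid below, each
# hyper-parameter is addressed as '<step name>__<parameter name>', so
# 'vect__max_df' targets the max_df parameter of the CountVectorizer step.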
parameters = {
    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}
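
# Note: the uncommented grid above already spans 3 x 2 x 2 x 2 = 24 parameter
# combinations, and GridSearchCV cross-validates the full pipeline on every
# one of them, so the search time grows multiplicatively with each extra
# parameter that is uncommented.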

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
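    # n_jobs=-1 spreads the cross-validation fits across all available CPU
    # cores; verbose=1 prints progress messages while the search runs.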
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print "Performing grid search..."
    print "pipeline:", [name for name, _ in pipeline.steps]
    print "parameters:"
    pprint(parameters)
    t0 = time()
    grid_search.fit(data.data, data.target)
    print "done in %0.3fs" % (time() - t0)
    print

    print "Best score: %0.3f" % grid_search.best_score_
    print "Best parameters set:"
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print "\t%s: %r" % (param_name, best_parameters[param_name])