# Pipeline for extracting and evaluating text

### [Banafsheh Hassani](https://www.linkedin.com/in/banafsheh-hassani-7b063a129/)

### [More Projects](https://github.com/BanafshehHassani)

# Data: `sklearn.datasets.fetch_20newsgroups`

* The 20 newsgroups dataset is automatically downloaded, cached, and reused for document classification.
* By default all 20 categories are loaded; the user can pass a list of category names to restrict the dataset to a subset of them.

[Reference](https://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html)

In [0]:
%matplotlib inline

In [0]:
from pprint import pprint
from time import time
import logging
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [0]:
# Print the module-level docstring (whatever __doc__ is bound to in this namespace).
print(__doc__)

# Configure logging to display progress messages

In [0]:
# Configure the root logger: report INFO and above, with every record
# prefixed by its timestamp and severity name.
log_format = '%(asctime)s %(levelname)s %(message)s'
logging.basicConfig(format=log_format, level=logging.INFO)

# Load a couple of categories from the training set

In [0]:
# Two-newsgroup subset, useful for a quick run on a small slice of the data.
categories = ['alt.atheism', 'talk.religion.misc']

# Use all categories for the analysis

In [0]:
# Override the two-category list assigned earlier: None is what gets passed
# as `categories` to fetch_20newsgroups below, i.e. no category filter.
categories = None

In [0]:
# Announce which category selection is about to be loaded (header line,
# then the selection itself on the following line).
print("Loading 20 newsgroups dataset for categories:", categories, sep="\n")

In [0]:
# Fetch the training split for the selected categories, then report how many
# documents and how many distinct target names came back.
data = fetch_20newsgroups(categories=categories, subset='train')
n_documents = len(data.filenames)
n_categories = len(data.target_names)
print("%d documents" % n_documents)
print("%d categories" % n_categories)
print()

# Create a pipeline combining the text feature extractor with the classifier

In [0]:
# Three chained steps: token counts -> tf-idf weighting -> SGD classifier.
# The step names ('vect', 'tfidf', 'clf') are the prefixes used by the
# "<step>__<param>" keys of the grid-search parameter dictionary.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# Define the parameter grid (uncommenting more parameters gives better exploring power but increases the search time)

In [0]:
# Grid-search space, keyed as "<pipeline step>__<parameter name>"; each value
# is the tuple of candidates to try for that parameter.
#
# Further options that are currently disabled (enabling any of them widens
# the search and multiplies the number of candidate combinations):
#   'vect__max_features': (None, 5000, 10000, 50000)
#   'tfidf__use_idf': (True, False)
#   'tfidf__norm': ('l1', 'l2')
#   'clf__max_iter': (10, 50, 80)
parameters = dict(
    vect__max_df=(0.5, 0.75, 1.0),
    vect__ngram_range=((1, 1), (1, 2)),  # unigrams or bigrams
    clf__max_iter=(20,),
    clf__alpha=(1e-5, 1e-6),
    clf__penalty=('l2', 'elasticnet'),
)

# Find the best parameters for both the feature extraction and the classifier

In [0]:
if __name__ == "__main__":
    # Exhaustive search over `parameters`, using every available core
    # (n_jobs=-1) and printing progress (verbose=1).
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    step_names = [step_name for step_name, _ in pipeline.steps]
    print("Performing grid search...")
    print("pipeline:", step_names)
    print("parameters:")
    pprint(parameters)

    # Time the fit over all candidate parameter combinations.
    started = time()
    grid_search.fit(data.data, data.target)
    elapsed = time() - started
    print("done in %0.3fs" % elapsed)
    print()

    # Report the winning score, then the chosen value of each searched
    # parameter as read back from the best estimator.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        chosen_value = best_parameters[param_name]
        print("\t%s: %r" % (param_name, chosen_value))