In [1]:
from __future__ import print_function

import codecs
import re

from pprint import pprint
from time import time
import logging

from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [2]:
def load_speaker(path="../data/raw/corpus.tache1.learn.utf8"):
    corpus = []
    classes = []
    f = codecs.open(path, "r", "utf-8")  # pour régler le codage
    while True:
        texte = f.readline()
        if (len(texte)) < 5:
            break
        label = re.sub(r"<\d*:\d*:(.)>.*", "\\1", texte)
        texte = re.sub(r"<\d*:\d*:.>(.*)", "\\1", texte)
        if label.count("M") > 0:
            classes.append(-1)
        else:
            classes.append(1)
        corpus.append(texte)
    return corpus, classes


In [3]:
X, y = load_speaker()

In [4]:
print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

print("%d documents" % len(X))
print("%d categories" % len(y))
print()

Automatically created module for IPython interactive environment
57413 documents
57413 categories



In [43]:
# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline(
    [
        (
            "vect",
            CountVectorizer(
                max_features=100000,
                ngram_range=(1, 2),
                max_df=0.5,
                lowercase=False,
                strip_accents=None,
                binary=True
            ),
        ),
        (
            "tfidf",
            TfidfTransformer(
                use_idf=True, norm="l2", smooth_idf=True, sublinear_tf=False
            ),
        ),
        ("nb", MultinomialNB()),
    ]
)

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    "nb__alpha": (0.28, 0.3, 0.32),
    "nb__fit_prior": (True, False),
}


if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(
        pipeline, parameters, n_jobs=-1, verbose=1, scoring="f1_macro"
    )

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(X, y)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))


Performing grid search...
pipeline: ['vect', 'tfidf', 'nb']
parameters:
{'nb__alpha': (0, 0.28, 0.3, 0.32),
 'nb__fit_prior': (True, False),
 'nb__force_alpha': (True, False)}
Fitting 5 folds for each of 16 candidates, totalling 80 fits


ValueError: Invalid parameter force_alpha for estimator MultinomialNB(alpha=0). Check the list of available parameters with `estimator.get_params().keys()`.

Raw BoW: best results:
alpha=0.25, fit_prior=False

In [39]:
# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline(
    [
        (
            "vect",
            CountVectorizer(
                max_df=0.5,
                stop_words=stopwords.words("french"),
                lowercase=False,
                strip_accents=None,
                max_features=10000,
                ngram_range=(1,2),
                binary=True),
        ),
        (
            "tfidf",
            TfidfTransformer(norm=None, use_idf=False, sublinear_tf=False,smooth_idf=False),
        ),
        ("nb", MultinomialNB()),
    ]
)

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {    
    #"vect__min_df": (0.01, 0.05, 0.15, 0.30, 1),
    #"vect__max_df": (0.5, 0.75, 0.9, 1.0),
    #"vect__max_features": (None, 10000, 50000, 100000),
    #"vect__stop_words": (None, stopwords.words("french")),
    #"vect__ngram_range": ((1, 1), (1, 2), (2, 2), (2,3), (3,3)),  # unigrams or bigrams
    #"vect__binary": (True, False),
    #"tfidf__use_idf": (True, False),
    #"tfidf__norm": (None, "l1", "l2"),
    #"tfidf__smooth_idf": (True, False),
    #"tfidf__sublinear_tf": (False, True),
    "nb__alpha": (0, 0.25, 0.5, 0.75, 1.0, 10),
    "nb__fit_prior": (True, False),
    }


if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(
        pipeline, parameters, n_jobs=-1, verbose=1, scoring="f1_macro"
    )

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(X, y)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))


Performing grid search...
pipeline: ['vect', 'tfidf', 'nb']
parameters:
{'nb__alpha': (0, 0.25, 0.5, 0.75, 1.0, 10), 'nb__fit_prior': (True, False)}
Fitting 5 folds for each of 12 candidates, totalling 60 fits
done in 27.678s

Best score: 0.751
Best parameters set:
	nb__alpha: 0.25
	nb__fit_prior: True
