In [1]:
from __future__ import print_function

import os

from pprint import pprint
from time import time
import logging

from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [2]:
def load_movies(path="../data/raw/"):
    """Load a text corpus stored as one sub-directory per class.

    Every entry of ``path`` is treated as a class directory and every file
    inside it as one document. Classes are numbered 0, 1, 2, ... following
    the alphabetical order of the directory names, so the mapping between
    directory and label is the same on every run.

    Parameters
    ----------
    path : str
        Root directory containing one sub-directory per class. A trailing
        path separator is accepted but not required.

    Returns
    -------
    corpus : list of str
        Full text of each document.
    classes : list of int
        Label of each document, aligned with ``corpus``.

    Raises
    ------
    OSError
        If ``path`` or one of its entries cannot be listed (for example a
        plain file sitting next to the class directories) or a document
        cannot be opened.
    """
    corpus = []
    classes = []
    # os.listdir returns entries in arbitrary order; sorting keeps both the
    # label assignment and the document order reproducible.
    for label, cl in enumerate(sorted(os.listdir(path))):
        class_dir = os.path.join(path, cl)
        for f in sorted(os.listdir(class_dir)):
            # Context manager so the file handle is released immediately
            # instead of waiting for garbage collection.
            with open(os.path.join(class_dir, f)) as fh:
                corpus.append(fh.read())
            classes.append(label)
    return corpus, classes


In [12]:
# Read the documents (X) and their integer class labels (y) from the default
# data directory used by load_movies.
X, y = load_movies()

In [29]:
# -----------------------------------------------------------------------------
# Classification pipeline: binary uni-/bi-gram indicators -> L2 row
# normalisation (IDF weighting switched off) -> multinomial naive Bayes.
pipeline = Pipeline(
    steps=[
        (
            "vect",
            CountVectorizer(
                binary=True,
                lowercase=False,
                max_df=0.75,
                max_features=20000,
                ngram_range=(1, 2),
                stop_words=stopwords.words("english"),
                strip_accents=None,
            ),
        ),
        (
            "tfidf",
            TfidfTransformer(
                use_idf=False, norm="l2", smooth_idf=True, sublinear_tf=False
            ),
        ),
        ("nb", MultinomialNB(alpha=0.6, fit_prior=True)),
    ]
)

# Candidate values for the search. Every entry is currently disabled, so the
# grid is empty and only the pipeline exactly as configured above is scored.
# Re-enabling entries widens the search, but the number of fits grows
# multiplicatively with each one.
parameters = {
    # --- vectoriser ---
    # "vect__min_df": (0.05, 0.15, 1),
    # "vect__max_df": (0.5, 0.75, 1.0),
    # "vect__max_features": (None, 20000, 50000, 200000, 500000),
    # "vect__lowercase": (False, True),
    # "vect__strip_accents": (None, "unicode"),
    # "vect__stop_words": (None, stopwords.words("english")),
    # "vect__ngram_range": ((1, 1),(1, 2),(2, 2),(2, 3),(3, 3),),  # unigrams or bigrams
    # "vect__binary": (True, False),
    # --- tf-idf transformer ---
    # "tfidf__use_idf": (True, False),
    # "tfidf__norm": (None, "l1", "l2"),
    # "tfidf__smooth_idf": (True, False),
    # "tfidf__sublinear_tf": (False, True),
    # --- naive Bayes ---
    # "nb__alpha": (0.55, 0.6, 0.65, 0.7, 0.75),
    # "nb__fit_prior": (True, False),
}


if __name__ == "__main__":
    # The parallel search (n_jobs=-1) starts worker processes; keeping it
    # under the __main__ guard is what multiprocessing requires.

    # Cross-validated search over feature-extraction and classifier
    # settings, ranked by ROC AUC.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        scoring="roc_auc",
        n_jobs=-1,
        verbose=1,
    )

    print("Performing grid search...")
    print("pipeline:", [step[0] for step in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    t0 = time()
    grid_search.fit(X, y)
    elapsed = time() - t0
    print("done in %0.3fs" % elapsed)
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    # Report only the parameters that were part of the search grid.
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))


Performing grid search...
pipeline: ['vect', 'tfidf', 'nb']
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits
done in 5.641s

Best score: 0.929
Best parameters set:


Raw BoW: best results:
alpha=0.25, fit_prior=False

In [28]:
# -----------------------------------------------------------------------------
# Classification pipeline: binary uni-/bi-gram indicators -> TF-IDF weighting
# with L2 row normalisation -> multinomial naive Bayes.
pipeline = Pipeline(
    steps=[
        (
            "vect",
            CountVectorizer(
                binary=True,
                lowercase=False,
                max_df=0.5,
                max_features=200000,
                ngram_range=(1, 2),
                stop_words=stopwords.words("english"),
                strip_accents=None,
            ),
        ),
        (
            "tfidf",
            TfidfTransformer(
                use_idf=True, norm="l2", smooth_idf=True, sublinear_tf=False
            ),
        ),
        ("nb", MultinomialNB(alpha=0.65, fit_prior=True)),
    ]
)

# Candidate values for the search. Every entry is currently disabled, so the
# grid is empty and only the pipeline exactly as configured above is scored.
# Re-enabling entries widens the search, but the number of fits grows
# multiplicatively with each one.
parameters = {
    # --- vectoriser ---
    # "vect__min_df": (0.05, 0.15, 1),
    # "vect__max_df": (0.5, 0.75, 1.0),
    # "vect__max_features": (None, 20000, 50000, 200000, 500000),
    # "vect__lowercase": (False, True),
    # "vect__strip_accents": (None, "unicode"),
    # "vect__stop_words": (None, stopwords.words("english")),
    # "vect__ngram_range": ((1, 1),(1, 2),(2, 2),(2, 3),(3, 3),),  # unigrams or bigrams
    # "vect__binary": (True, False),
    # --- tf-idf transformer ---
    # "tfidf__use_idf": (True, False),
    # "tfidf__norm": (None, "l1", "l2"),
    # "tfidf__smooth_idf": (True, False),
    # "tfidf__sublinear_tf": (False, True),
    # --- naive Bayes ---
    # "nb__alpha": (0.55, 0.6, 0.65, 0.7, 0.75),
    # "nb__fit_prior": (True, False),
}


if __name__ == "__main__":
    # The parallel search (n_jobs=-1) starts worker processes; keeping it
    # under the __main__ guard is what multiprocessing requires.

    # Cross-validated search over feature-extraction and classifier
    # settings, ranked by ROC AUC.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        scoring="roc_auc",
        n_jobs=-1,
        verbose=1,
    )

    print("Performing grid search...")
    print("pipeline:", [step[0] for step in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    t0 = time()
    grid_search.fit(X, y)
    elapsed = time() - t0
    print("done in %0.3fs" % elapsed)
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    # Report only the parameters that were part of the search grid.
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))


Performing grid search...
pipeline: ['vect', 'tfidf', 'nb']
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits
done in 5.760s

Best score: 0.930
Best parameters set:


In [38]:
# #############################################################################
# Define a pipeline combining a text feature extractor with a logistic
# regression classifier: binary uni-/bi-gram indicators -> TF-IDF weighting
# with L2 normalisation -> LogisticRegression fitted with the "saga" solver.
pipeline = Pipeline(
    [
        (
            "vect",
            CountVectorizer(
                stop_words=stopwords.words("english"),
                max_df=0.5,
                lowercase=False,
                ngram_range=(1, 2),
                strip_accents=None,
                max_features=200000,
                binary=True,
            ),
        ),
        (
            "tfidf",
            TfidfTransformer(
                norm="l2", smooth_idf=True, sublinear_tf=False, use_idf=True
            ),
        ),
        # "saga" shuffles the training data, so the seed is pinned to make the
        # cross-validated score reproducible from one run to the next.
        ("lr", LogisticRegression(max_iter=10000, solver="saga", random_state=0)),
    ]
)

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    # "vect__min_df": (0.05, 0.15, 1),
    # "vect__max_df": (0.5, 0.75, 1.0),
    # "vect__max_features": (None, 20000, 50000, 200000, 500000),
    # "vect__lowercase": (False, True),
    # "vect__strip_accents": (None, "unicode"),
    # "vect__stop_words": (None, stopwords.words("english")),
    # "vect__ngram_range": ((1, 1),(1, 2),(2, 2),(2, 3),(3, 3),),  # unigrams or bigrams
    # "vect__binary": (True, False),
    # "tfidf__use_idf": (True, False),
    # "tfidf__norm": (None, "l1", "l2"),
    # "tfidf__smooth_idf": (True, False),
    # "tfidf__sublinear_tf": (False, True),
    # The "nb__*" and "svm__*" entries below only apply when the final step
    # of the pipeline is named "nb" / "svm"; this pipeline has no such step,
    # so enabling them here makes the search fail with an invalid-parameter
    # error.
    # "nb__alpha": (0.55, 0.6, 0.65, 0.7, 0.75),
    # "nb__fit_prior": (True, False),
    # "svm__penalty": ("l1", "l2"),
    # "svm__class_weight": (None, "balanced"),
    # Regularisation strength of the logistic regression is "C", not
    # "penalty"; "penalty" selects the norm. "elasticnet" additionally needs
    # "lr__l1_ratio" to be set, otherwise those fits fail.
    # "lr__C": (.001, .01, .1, 1, 10, 100, 1000),
    # "lr__penalty": ("l1", "l2"),
}


if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(
        pipeline, parameters, n_jobs=-1, verbose=1, scoring="roc_auc"
    )

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(X, y)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    # Only the parameters that were part of the search grid are reported.
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))


Performing grid search...
pipeline: ['vect', 'tfidf', 'lr']
parameters:
{'lr__penalty': (None, 'l2', 'l1', 'elasticnet')}
Fitting 5 folds for each of 4 candidates, totalling 20 fits


10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Hyperbeast\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Hyperbeast\anaconda3\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\Hyperbeast\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Hyperbeast\anaconda3\lib\site-packages\sklearn\linear_model\_log

done in 18.862s

Best score: 0.940
Best parameters set:
	lr__penalty: 'l2'
