In [1]:
%matplotlib inline


# Sample pipeline for text feature extraction and evaluation

The dataset used in this example is the 20 newsgroups dataset which will be
automatically downloaded and then cached and reused for the document
classification example.

You can adjust the number of categories by giving their names to the dataset
loader or setting them to None to get the 20 of them.

Here is a sample output of a run on a quad-core machine::

  Loading 20 newsgroups dataset for categories:
  ['alt.atheism', 'talk.religion.misc']
  1427 documents
  2 categories

  Performing grid search...
  pipeline: ['vect', 'tfidf', 'clf']
  parameters:
  {'clf__alpha': (1.0000000000000001e-05, 9.9999999999999995e-07),
   'clf__max_iter': (10, 50, 80),
   'clf__penalty': ('l2', 'elasticnet'),
   'tfidf__use_idf': (True, False),
   'vect__max_n': (1, 2),
   'vect__max_df': (0.5, 0.75, 1.0),
   'vect__max_features': (None, 5000, 10000, 50000)}
  done in 1737.030s

  Best score: 0.940
  Best parameters set:
      clf__alpha: 9.9999999999999995e-07
      clf__max_iter: 50
      clf__penalty: 'elasticnet'
      tfidf__use_idf: True
      vect__max_n: 2
      vect__max_df: 0.75
      vect__max_features: 50000


In [9]:
# https://stackoverflow.com/questions/50285973/pipeline-multiple-classifiers
from sklearn.base import BaseEstimator
class ClfSwitcher(BaseEstimator):

    def __init__(
        self, 
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """ 

        self.estimator = estimator


    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self


    def predict(self, X, y=None):
        return self.estimator.predict(X)


    def predict_proba(self, X):
        return self.estimator.predict_proba(X)


    def score(self, X, y):
        return self.estimator.score(X, y)

In [12]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Mathieu Blondel <mathieu@mblondel.org>
# License: BSD 3 clause
import pandas as pd
from pprint import pprint
from time import time
import logging

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
# https://stackoverflow.com/questions/20186344/importing-an-ipynb-file-from-another-ipynb-file
from ipynb.fs.full.feature_analysis import *

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

train_data = pd.read_csv("../datasets/Train.csv", sep=',')
train_tweets = train_data[['text']].values[:, 0]
train_sentiments = train_data[['sentiment']].values[:, 0]
test_data = pd.read_csv("../datasets/Test.csv", sep=',')

# #############################################################################
# Load some categories from the training set
categories = train_sentiments

print("Categories:")
print(set(categories))

# data = fetch_20newsgroups(subset="train", categories=categories)
data = train_tweets
print(type(data))
print("%d documents" % len(data))
print("%d categories" % len(set(categories)))
print()

# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", ClfSwitcher()),
    ]
)

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = [
    {
        "vect__max_df": (0.5, 0.75, 1.0),
        "vect__ngram_range": ((1, 1), (1, 2), (1, 3)),  # unigrams or bigrams
        "clf__estimator": [SGDClassifier()],
        "clf__estimator__max_iter": (20,),
        "clf__estimator__alpha": (0.00001, 0.000001),
        "clf__estimator__penalty": ("l2", "elasticnet"),
    },
    {
        "vect__max_df": (0.5, 0.75, 1.0),
        "vect__ngram_range": ((1, 1), (1, 2), (1, 3)),  # unigrams or bigrams
        'clf__estimator': [GaussianNB()],
        "clf__estimator__alpha": (0.00001, 0.000001),
    },
    {
        "vect__max_df": (0.5, 0.75, 1.0),
        "vect__ngram_range": ((1, 1), (1, 2), (1, 3)),  # unigrams or bigrams
        'clf__estimator': [MultinomialNB()],
        "clf__estimator__alpha": (0.00001, 0.000001),
    },
    {
        "vect__max_df": (0.5, 0.75, 1.0),
        "vect__ngram_range": ((1, 1), (1, 2), (1, 3)),  # unigrams or bigrams
        'clf__estimator': [BernoulliNB()],
        "clf__estimator__alpha": (0.00001, 0.000001),
    },
]

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(data, categories)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    pprint(best_parameters)

Categories:
{'positive', 'neutral', 'negative'}
<class 'numpy.ndarray'>
21802 documents
3 categories

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
[{'clf__estimator': [SGDClassifier()],
  'clf__estimator__alpha': (1e-05, 1e-06),
  'clf__estimator__max_iter': (20,),
  'clf__estimator__penalty': ('l2', 'elasticnet'),
  'vect__max_df': (0.5, 0.75, 1.0),
  'vect__ngram_range': ((1, 1), (1, 2), (1, 3))},
 {'clf__estimator': [GaussianNB()],
  'clf__estimator__alpha': (1e-05, 1e-06),
  'vect__max_df': (0.5, 0.75, 1.0),
  'vect__ngram_range': ((1, 1), (1, 2), (1, 3))},
 {'clf__estimator': [MultinomialNB()],
  'clf__estimator__alpha': (1e-05, 1e-06),
  'vect__max_df': (0.5, 0.75, 1.0),
  'vect__ngram_range': ((1, 1), (1, 2), (1, 3))},
 {'clf__estimator': [BernoulliNB()],
  'clf__estimator__alpha': (1e-05, 1e-06),
  'vect__max_df': (0.5, 0.75, 1.0),
  'vect__ngram_range': ((1, 1), (1, 2), (1, 3))}]
Fitting 5 folds for each of 90 candidates, totalling 450 fits
