In [1]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import numpy as np
import os
import json
import sys
import traceback


# Directory the parameter JSON files are resolved against. The head part of
# './' is '.', i.e. the notebook's current working directory.
SCRIPT_DIR = os.path.split('./')[0]

class Checker(object):
    """Scores a JSON-described text-classification pipeline on 20 Newsgroups.

    The constructor loads four ``rec.*`` newsgroups once (``fetch_20newsgroups``
    may download the dataset on first use) and every :meth:`check` call reuses
    that data.
    """

    def __init__(self):
        # Keep only the four "rec.*" topics and strip headers, footers and
        # quoted replies from every post.
        self.data = fetch_20newsgroups(
            subset='all', 
            categories=[
                'rec.autos',
                'rec.motorcycles',
                'rec.sport.baseball',
                'rec.sport.hockey'
            ], 
            remove=('headers', 'footers', 'quotes')
        )

    def check(self, params_path):
        """Evaluate the pipeline described by a JSON parameter file.

        Parameters
        ----------
        params_path : str or path-like
            Path to a JSON file holding an object with the keys
            ``'count_vectorizer_params'``, ``'tfidf_transformer_params'`` and
            ``'logistic_regression_params'``. Each value is a dict of keyword
            arguments forwarded to ``CountVectorizer``, ``TfidfTransformer``
            and ``LogisticRegression`` respectively.

        Returns
        -------
        float or None
            Mean accuracy over 3-fold cross-validation, or ``None`` when
            loading the parameters, building the pipeline or scoring it raised
            an ordinary exception (the traceback is printed in that case).

        Notes
        -----
        ``KeyboardInterrupt`` and ``SystemExit`` are deliberately *not*
        swallowed, so a long-running evaluation can still be interrupted.
        """
        try:
            with open(params_path, 'r') as f:
                params = json.load(f)

            pipeline = make_pipeline(
                CountVectorizer(**params['count_vectorizer_params']), 
                TfidfTransformer(**params['tfidf_transformer_params']), 
                LogisticRegression(**params['logistic_regression_params'])
            )
            score = np.mean(cross_val_score(
                pipeline, 
                self.data.data, 
                self.data.target,
                scoring='accuracy', 
                cv=3
            ))
        except Exception:
            # Best-effort scoring: report the failure and signal it with None.
            # `except Exception` (rather than a bare `except:`) lets
            # KeyboardInterrupt / SystemExit propagate to the caller.
            traceback.print_exc()
            score = None
        
        return score


# Score the submitted parameter file and print the resulting mean accuracy
# (None is printed if the check failed).
params_file = SCRIPT_DIR + '/text_classification_params_nikolaev.json'
checker = Checker()
print(checker.check(params_file))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


0.8698143102815296


In [2]:
# Load the same four "rec.*" newsgroups that Checker uses (subset='all'),
# with headers, footers and quoted replies removed from each post.
newsgroup_names = [
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
]
data = fetch_20newsgroups(
    subset='all',
    categories=newsgroup_names,
    remove=('headers', 'footers', 'quotes'),
)

In [12]:
%%time

param_grid = {
    'countvectorizer__ngram_range' : [(1, 1), (1, 2), (1, 3)],
    'countvectorizer__min_df' : [1, 2],
    'countvectorizer__stop_words': ['english'],
    'tfidftransformer__norm' : ['l2'],
    'tfidftransformer__sublinear_tf': [True],
    'tfidftransformer__smooth_idf': [0,1],
    'logisticregression__penalty': ['l2'],
    'logisticregression__C' : [15,14,13,12],
    'logisticregression__class_weight' : [None],
}

pipe = make_pipeline(CountVectorizer(),
                     TfidfTransformer(),
                     LogisticRegression())

grid_pipe = GridSearchCV(pipe, param_grid, 
                         scoring='accuracy',
                         n_jobs=-1, cv=3,
                         verbose=10).fit(data.data, data.target)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=15, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True 
[CV] countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=15, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True 
[CV] countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=15, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True 
[CV] countvectorizer__min_df=1, countvect

[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.3s


[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=14, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8642533936651584, total=   1.7s
[CV] countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=13, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True 
[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=14, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True, score=0.8763197586726998, total=   1.8s
[CV] countvectorizer_

[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.4s


[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=14, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8740573152337858, total=   1.8s
[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=13, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True, score=0.8657616892911011, total=   1.8s
[CV] countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=12, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True 
[CV] countvectorizer_

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    5.0s


[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=12, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True, score=0.8665158371040724, total=   1.3s
[CV] countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=15, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True 
[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=13, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8650075414781297, total=   1.2s
[CV] countvectorizer_

[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   12.8s


[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=15, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True, score=0.8672699849170438, total=   5.4s
[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=15, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8665158371040724, total=   4.9s
[CV] countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=14, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True 
[CV] countvectorizer_

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.4s


[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=14, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8907309721175584, total=   5.1s
[CV] countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=12, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True 
[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=14, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8665158371040724, total=   5.1s
[CV] countvectorizer_

[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   25.4s


[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=12, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True, score=0.8733031674208145, total=   5.7s
[CV] countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 3), countvectorizer__stop_words=english, logisticregression__C=14, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True 
[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=12, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8702865761689291, total=   5.1s
[CV] countvectorizer_

[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   35.7s


[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 3), countvectorizer__stop_words=english, logisticregression__C=14, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True, score=0.8710407239819005, total=   8.5s
[CV] countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 3), countvectorizer__stop_words=english, logisticregression__C=13, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True 
[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 3), countvectorizer__stop_words=english, logisticregression__C=14, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8710407239819005, total=   8.5s
[CV] countvectorizer_

[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 3), countvectorizer__stop_words=english, logisticregression__C=12, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8862094951017332, total=   9.0s
[CV] countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=15, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True 


[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   55.6s


[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 3), countvectorizer__stop_words=english, logisticregression__C=12, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8702865761689291, total=   8.9s
[CV] countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=14, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True 
[CV]  countvectorizer__min_df=1, countvectorizer__ngram_range=(1, 3), countvectorizer__stop_words=english, logisticregression__C=12, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8680241327300151, total=   9.1s
[CV] countvectorizer_

[CV]  countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=14, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8733031674208145, total=   1.6s
[CV] countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=12, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True 
[CV]  countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=13, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True, score=0.8733986435568952, total=   1.4s
[CV] countvectorizer_

[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  1.0min


[CV]  countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=13, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True, score=0.8740573152337858, total=   1.8s
[CV] countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=12, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True 
[CV]  countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 1), countvectorizer__stop_words=english, logisticregression__C=13, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8733986435568952, total=   1.6s
[CV] countvectorizer_

[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  1.1min


[CV]  countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=15, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True, score=0.8717948717948718, total=   2.6s
[CV] countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=14, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True 
[CV]  countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=15, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8634992458521871, total=   3.2s
[CV]  countvectorizer

[CV]  countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=13, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True, score=0.8650075414781297, total=   3.2s
[CV] countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=12, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True 


[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.2min


[CV]  countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=13, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8634992458521871, total=   2.5s
[CV] countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 3), countvectorizer__stop_words=english, logisticregression__C=15, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True 
[CV]  countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 2), countvectorizer__stop_words=english, logisticregression__C=12, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True, score=0.8786737000753579, total=   2.6s
[CV] countvectorizer_

[CV]  countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 3), countvectorizer__stop_words=english, logisticregression__C=15, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8702865761689291, total=   4.1s
[CV] countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 3), countvectorizer__stop_words=english, logisticregression__C=13, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True 
[CV]  countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 3), countvectorizer__stop_words=english, logisticregression__C=14, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True, score=0.8786737000753579, total=   4.8s
[CV] countvectorizer_

[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  1.4min


[CV]  countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 3), countvectorizer__stop_words=english, logisticregression__C=13, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=0, tfidftransformer__sublinear_tf=True, score=0.8786737000753579, total=   4.0s
[CV]  countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 3), countvectorizer__stop_words=english, logisticregression__C=14, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True, score=0.8642533936651584, total=   4.5s
[CV] countvectorizer__min_df=2, countvectorizer__ngram_range=(1, 3), countvectorizer__stop_words=english, logisticregression__C=13, logisticregression__class_weight=None, logisticregression__penalty=l2, tfidftransformer__norm=l2, tfidftransformer__smooth_idf=1, tfidftransformer__sublinear_tf=True 
[CV] countvectorizer_

[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:  1.5min finished


CPU times: user 3.96 s, sys: 237 ms, total: 4.2 s
Wall time: 1min 32s


In [13]:
# Best mean cross-validated accuracy found by the grid search, together with
# the parameter combination that achieved it.
(grid_pipe.best_score_, grid_pipe.best_params_)

(0.8771048002010555,
 {'countvectorizer__min_df': 1,
  'countvectorizer__ngram_range': (1, 2),
  'countvectorizer__stop_words': 'english',
  'logisticregression__C': 13,
  'logisticregression__class_weight': None,
  'logisticregression__penalty': 'l2',
  'tfidftransformer__norm': 'l2',
  'tfidftransformer__smooth_idf': 0,
  'tfidftransformer__sublinear_tf': True})