In [5]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import numpy as np
import os
import json
import sys
import traceback


SCRIPT_DIR = os.path.dirname('./')

class Checker(object):
    def __init__(self):
        self.data = fetch_20newsgroups(
            subset='all', 
            categories=[
                'rec.autos',
                'rec.motorcycles',
                'rec.sport.baseball',
                'rec.sport.hockey'
            ], 
            remove=('headers', 'footers', 'quotes')
        )

    def check(self, params_path):
        try:
            with open(params_path, 'r') as f:
                params = json.load(f)

            pipeline = make_pipeline(
                CountVectorizer(**params['count_vectorizer_params']), 
                TfidfTransformer(**params['tfidf_transformer_params']), 
                LogisticRegression(**params['logistic_regression_params'])
            )
            score = np.mean(cross_val_score(
                pipeline, 
                self.data.data, 
                self.data.target,
                scoring='accuracy', 
                cv=3
            ))
        except:
            traceback.print_exception(*sys.exc_info())
            score = None
        
        return score


print(Checker().check(SCRIPT_DIR + '/text_classification_params_nikolaev.json'))

0.867803249447


In [6]:
data = fetch_20newsgroups(
    subset='all', 
    categories=[
        'rec.autos',
        'rec.motorcycles',
        'rec.sport.baseball',
        'rec.sport.hockey'
    ], 
    remove=('headers', 'footers', 'quotes')
)

In [None]:
%%time

param_grid = {
    'countvectorizer__ngram_range' : [(1, 1), (1, 2), (1, 3)],
    'countvectorizer__min_df' : [1, 2, 3],
    'countvectorizer__stop_words': ['english'],
    'tfidftransformer__norm' : ['l2'],
    'tfidftransformer__sublinear_tf': [True, False],
    'tfidftransformer__smooth_idf': [0,0.5,1],
    'logisticregression__penalty': ['l2'],
    'logisticregression__C' : [100,10,1],
    'logisticregression__class_weight' : [None, 'balanced'],
    'logisticregression__n_jobs' : [-1],
}

pipe = make_pipeline(CountVectorizer(),
                     TfidfTransformer(),
                     LogisticRegression())

grid_pipe = GridSearchCV(pipe, param_grid, 
                         scoring='accuracy',
                         n_jobs=-1, cv=3,
                         verbose=1).fit(data.data, data.target)

Fitting 3 folds for each of 324 candidates, totalling 972 fits


[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:  1.1min


In [None]:
grid_pipe.best_score_, grid_pipe.best_params_