In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
train_news = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

X_train = train_news.data
y_train = train_news.target

In [3]:
test_news = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

X_test = test_news.data
y_test = test_news.target

In [4]:
print(len(X_train), len(X_test))

11314 7532


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [6]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=700)),
    ('lr_clf', LogisticRegression())
])

params = {
    # 'tfidf_vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    # 'tfidf_vect__max_df': [100, 300, 700],
    'lr_clf__C': [1, 5, 10]
}

In [7]:
grid_cv_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv_pipe.fit(X_train, y_train)
print(grid_cv_pipe.best_params_, grid_cv_pipe.best_score_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 37.3min finished
{'lr_clf__C': 10} 0.7548175838024161


In [10]:
pred = grid_cv_pipe.predict(X_test)
print('accuracy:', accuracy_score(y_test, pred))

accuracy: 0.7023366967604886
