In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as st

import warnings
warnings.simplefilter('ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.svm import LinearSVC

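Воспользуемся датасетом 20 Newsgroups (загружается функцией `fetch_20newsgroups`). В нём собраны сообщения из 20 новостных групп, то есть перед нами задача классификации текстов на 20 классов.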

Поделим датасет на обучающую и тестовую выборки (70% / 30%):

In [3]:
# Fetch the whole corpus (subset='all') with headers, footers and quoted
# replies stripped from every post.
# NOTE(review): the name `two_groups_data` is misleading — no `categories`
# filter is passed, so every newsgroup is loaded, not two.
two_groups_data = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# Hold out 30% of the posts for evaluation; the fixed random_state keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    two_groups_data.data,
    two_groups_data.target,
    test_size=0.30,
    random_state=18,
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Для удобной работы векторизуем текст:

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Bag-of-n-grams features: uni-, bi- and tri-grams, capped at 70000 features.
# The vocabulary is learned from the training texts only and then reused,
# unchanged, to encode the test texts.
CV = CountVectorizer(ngram_range=(1, 3), max_features=70000)
x_train_vectorized = CV.fit_transform(x_train, y=y_train)
x_test_vectorized = CV.transform(x_test)

In [6]:
# Baseline 1: linear classifier trained with SGD on the raw n-gram counts.
# NOTE(review): alpha=1e-20 makes the regularisation term negligible, and per
# the scikit-learn docs alpha also feeds the 'optimal' learning-rate schedule —
# confirm this value is intentional.
sgd = SGDClassifier(random_state=124, n_jobs=-1, alpha=1e-20)
sgd.fit(X=x_train_vectorized, y=y_train)

# Share of test posts whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=sgd.predict(x_test_vectorized))

0.6315882561018747

In [7]:
# Baseline 2: logistic regression on the same count features.
# The fit call stays the last expression so the cell still displays the
# fitted estimator.
logit = LogisticRegression(random_state=124)
logit.fit(X=x_train_vectorized, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=124, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
# Hold-out accuracy of the logistic-regression baseline.
accuracy_score(y_true=y_test, y_pred=logit.predict(x_test_vectorized))

0.6699681641315882

In [9]:
from sklearn.pipeline import Pipeline

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer

In [11]:
from sklearn.model_selection import cross_val_score

Чтобы не выполнять векторизацию и обучение отдельными шагами, объединим их в один объект `Pipeline`.

In [12]:
# Text -> counts -> TF-IDF -> logistic regression, chained in one estimator.
# The vectorizer keeps 1- to 3-grams seen in at least 3 documents and drops
# English stop words.
pipelineLR = Pipeline(steps=[
    ("vectorizer", CountVectorizer(ngram_range=(1, 3), stop_words='english', min_df=3)),
    ("tfidf", TfidfTransformer()),
    ("logit", LogisticRegression(random_state=124)),
])

In [13]:
# Fit every pipeline step on the raw (unvectorized) training texts.
pipelineLR.fit(X=x_train, y=y_train)

Use os.path.join(memory.location, 'joblib') attribute instead.
  if hasattr(memory, 'cachedir') and memory.cachedir is None:
Use os.path.join(memory.location, 'joblib') attribute instead.
  if hasattr(memory, 'cachedir') and memory.cachedir is None:
Use os.path.join(memory.location, 'joblib') attribute instead.
  if hasattr(memory, 'cachedir') and memory.cachedir is None:
Use os.path.join(memory.location, 'joblib') attribute instead.
  if hasattr(memory, 'cachedir') and memory.cachedir is None:


Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
  ...lty='l2', random_state=124, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [14]:
# Hold-out accuracy of the TF-IDF + logistic-regression pipeline.
# accuracy_score is declared as (y_true, y_pred): pass the true labels first.
# The value is the same either way for accuracy, but the correct order keeps
# this call safe to reuse with asymmetric metrics.
accuracy_score(y_test, pipelineLR.predict(x_test))

0.745136186770428

In [15]:
# Text -> counts -> TF-IDF -> SGD-trained linear classifier.
# The vectorizer keeps 1- and 2-grams seen in at least 3 documents; no
# stop-word list is applied here.
# NOTE(review): alpha=1e-20 makes the regularisation term negligible, and per
# the scikit-learn docs alpha also feeds the 'optimal' learning-rate schedule —
# confirm this value is intentional.
pipelineSGD = Pipeline(steps=[
    ("vectorizer", CountVectorizer(ngram_range=(1, 2), min_df=3)),
    ("tfidf", TfidfTransformer()),
    ("classifier", SGDClassifier(random_state=124, n_jobs=-1, alpha=1e-20)),
])

In [16]:
# Fit every pipeline step on the raw (unvectorized) training texts.
pipelineSGD.fit(X=x_train, y=y_train)

Use os.path.join(memory.location, 'joblib') attribute instead.
  if hasattr(memory, 'cachedir') and memory.cachedir is None:
Use os.path.join(memory.location, 'joblib') attribute instead.
  if hasattr(memory, 'cachedir') and memory.cachedir is None:
Use os.path.join(memory.location, 'joblib') attribute instead.
  if hasattr(memory, 'cachedir') and memory.cachedir is None:
Use os.path.join(memory.location, 'joblib') attribute instead.
  if hasattr(memory, 'cachedir') and memory.cachedir is None:


Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
       ...y='l2', power_t=0.5, random_state=124,
       shuffle=True, tol=None, verbose=0, warm_start=False))])

In [17]:
# Hold-out accuracy of the TF-IDF + SGD pipeline.
# accuracy_score is declared as (y_true, y_pred): pass the true labels first.
# The value is the same either way for accuracy, but the correct order keeps
# this call safe to reuse with asymmetric metrics.
accuracy_score(y_test, pipelineSGD.predict(x_test))

0.6954368588609834

In [18]:
# Text -> counts -> TF-IDF -> logistic regression, chained in one estimator.
# NOTE(review): this definition is identical to the `pipelineLR` built earlier
# in the notebook; rebinding the name here discards the already fitted
# pipeline, so the fit and score cells are repeated too.
pipelineLR = Pipeline(steps=[
    ("vectorizer", CountVectorizer(ngram_range=(1, 3), stop_words='english', min_df=3)),
    ("tfidf", TfidfTransformer()),
    ("logit", LogisticRegression(random_state=124)),
])

In [20]:
# Fit every pipeline step on the raw (unvectorized) training texts.
pipelineLR.fit(X=x_train, y=y_train)

Use os.path.join(memory.location, 'joblib') attribute instead.
  if hasattr(memory, 'cachedir') and memory.cachedir is None:
Use os.path.join(memory.location, 'joblib') attribute instead.
  if hasattr(memory, 'cachedir') and memory.cachedir is None:
Use os.path.join(memory.location, 'joblib') attribute instead.
  if hasattr(memory, 'cachedir') and memory.cachedir is None:
Use os.path.join(memory.location, 'joblib') attribute instead.
  if hasattr(memory, 'cachedir') and memory.cachedir is None:


Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
  ...lty='l2', random_state=124, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [21]:
# Hold-out accuracy of the TF-IDF + logistic-regression pipeline.
# accuracy_score is declared as (y_true, y_pred): pass the true labels first.
# The value is the same either way for accuracy, but the correct order keeps
# this call safe to reuse with asymmetric metrics.
accuracy_score(y_test, pipelineLR.predict(x_test))

0.745136186770428