In [1]:
import re
import pandas as pd
import numpy as np
from typing import Dict, Tuple
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [2]:
from sklearn.datasets import fetch_20newsgroups

categories = ["rec.sport.baseball", "rec.autos", "sci.space", "talk.politics.guns"]
newsgroups = fetch_20newsgroups(subset='train', categories=categories)
train = newsgroups['data']

In [3]:

def accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray) -> Dict[int, float]:
    """
    Вычисление метрики accuracy для каждого класса
    y_true - истинные значения классов
    y_pred - предсказанные значения классов
    Возвращает словарь: ключ - метка класса, 
    значение - Accuracy для данного класса
    """
    # Для удобства фильтрации сформируем Pandas DataFrame 
    d = {'t': y_true, 'p': y_pred}
    df = pd.DataFrame(data=d)
    # Метки классов
    classes = np.unique(y_true)
    # Результирующий словарь
    res = dict()
    # Перебор меток классов
    for c in classes:
        # отфильтруем данные, которые соответствуют 
        # текущей метке класса в истинных значениях
        temp_data_flt = df[df['t']==c]
        # расчет accuracy для заданной метки класса
        temp_acc = accuracy_score(
            temp_data_flt['t'].values, 
            temp_data_flt['p'].values)
        # сохранение результата в словарь
        res[c] = temp_acc
    return res

In [4]:
def print_accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray):
    """
    Вывод метрики accuracy для каждого класса
    """
    accs = accuracy_score_for_classes(y_true, y_pred)
    if len(accs)>0:
        print('Метка \t Accuracy')
    for i in accs:
        print('{} \t {}'.format(i, accs[i]))

### Мой вариант:
####    Классификатор №1: LogisticRegression
####    Классификатор №2: Multinomial Naive Bayes (MNB)

In [5]:
vocabVect = CountVectorizer()
vocabVect.fit(train)
corpusVocab = vocabVect.vocabulary_
print('Количество сформированных признаков - {}'.format(len(corpusVocab)))

Количество сформированных признаков - 36320


In [10]:
for i in list(corpusVocab)[1:10]:
    print('{}:     \t{}'.format(i, corpusVocab[i]))

looper:     	21139
cco:     	9458
caltech:     	9116
edu:     	13479
mark:     	21794
subject:     	31556
re:     	27442
command:     	10375
loss:     	21171


In [11]:
test_features = vocabVect.transform(train)
test_features

<2330x36320 sparse matrix of type '<class 'numpy.int64'>'
	with 373978 stored elements in Compressed Sparse Row format>

In [12]:
test_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
# Размер нулевой строки
len(test_features.todense()[0].getA1())

36320

In [14]:
# Непустые значения нулевой строки
[i for i in test_features.todense()[0].getA1() if i>0]
print()




In [15]:
vocabVect.get_feature_names()[10000:10020]

['clarke',
 'clarkson',
 'clarku',
 'clas',
 'clash',
 'class',
 'classes',
 'classic',
 'classical',
 'classification',
 'classified',
 'classify',
 'classroom',
 'claudio',
 'clause',
 'clauses',
 'claw',
 'clay',
 'clayco',
 'claypigeon']

In [16]:
def VectorizeAndClassify(vectorizers_list, classifiers_list):
    for v in vectorizers_list:
        for c in classifiers_list:
            pipeline1 = Pipeline([("vectorizer", v), ("classifier", c)])
            score = cross_val_score(pipeline1, newsgroups['data'], newsgroups['target'], scoring='accuracy', cv=3).mean()
            print('Векторизация - {}'.format(v))
            print('Модель для классификации - {}'.format(c))
            print('Accuracy = {}'.format(score))
            print('===========================')

In [17]:
vectorizers_list = [CountVectorizer(vocabulary = corpusVocab), TfidfVectorizer(vocabulary = corpusVocab)]
classifiers_list = [LogisticRegression(), MultinomialNB()]
VectorizeAndClassify(vectorizers_list, classifiers_list)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Векторизация - CountVectorizer(vocabulary={'00': 0, '000': 1, '0000': 2, '00000': 3,
                            '000000': 4, '000021': 5, '000062david42': 6,
                            '000152': 7, '00041032': 8, '0004136': 9,
                            '0004246': 10, '0004422': 11, '00044513': 12,
                            '0004847546': 13, '0005': 14, '0005111312': 15,
                            '0005111312na3em': 16, '000601': 17, '000710': 18,
                            '00090711': 19, '000mi': 20, '000miles': 21,
                            '000s': 22, '000th': 23, '001': 24, '0010': 25,
                            '0012': 26, '001211': 27, '001319': 28,
                            '001428': 29, ...})
Модель для классификации - LogisticRegression()
Accuracy = 0.9613722706506213
Векторизация - CountVectorizer(vocabulary={'00': 0, '000': 1, '0000': 2, '00000': 3,
                            '000000': 4, '000021': 5, '000062david42': 6,
                            '000152': 7,