In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import *
from nltk import word_tokenize
import itertools

## Выгрузка данных из датасета

In [2]:
categories = ['comp.os.ms-windows.misc', 'comp.sys.mac.hardware', 'sci.space']
remove = ['headers', 'footers', 'quotes']
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories=categories, remove=remove)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories=categories, remove=remove)

In [3]:
twenty_train = pd.DataFrame(twenty_train, columns=['data', 'target']).replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True)
twenty_test = pd.DataFrame(twenty_test, columns=['data', 'target']).replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True)

In [4]:
def stemming(data):
    porter_stemmer = PorterStemmer()
    nltk_tokens = word_tokenize(data)
    line = ''
    for word in nltk_tokens:
        line += ' ' + porter_stemmer.stem(word)
    return line

twenty_train.insert(loc=1, column='data_stemmed', value=twenty_train['data'].apply(lambda text: stemming(text)))
twenty_test.insert(loc=1, column='data_stemmed', value=twenty_test['data'].apply(lambda text: stemming(text)))

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics

from sklearn.utils._testing import ignore_warnings 
from sklearn.exceptions import FitFailedWarning, ConvergenceWarning 

In [6]:
%%time
parameters = {
    'KNeighborsClassifier': {
        'vect__max_features': (1000,5000,10000),
        'vect__stop_words': ('english', None),
        'tfidf__use_idf': (True, False),
        'clf__n_neighbors': (1, 3, 5, 10),
        'clf__p': (1, 2)
    },
    'DecisionTreeClassifier': {
        'vect__max_features': (1000,5000,10000),
        'vect__stop_words': ('english', None),
        'tfidf__use_idf': (True, False),
        'clf__criterion': ('gini', 'entropy'),
        'clf__max_depth': [*range(1,5,1), *range(5,101,20)]
    },
    'LinearSVC': [{
        'vect__max_features': (1000,5000,10000),
        'vect__stop_words': ('english', None),
        'tfidf__use_idf': (True, False),
        'clf__loss': ['squared_hinge'],
        'clf__penalty': ('l1', 'l2')
    },
        {
        'vect__max_features': (1000,5000,10000),
        'vect__stop_words': ('english', None),
        'tfidf__use_idf': (True, False),
        'clf__loss': ['hinge'],
        'clf__penalty': ['l2']
    }],
}

gs = {}
for clf, param in parameters.items():
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', eval(clf)())
    ])
    gs[clf] = GridSearchCV(text_clf, param, n_jobs=-1, error_score=0.0)
    gs[clf].fit(X = twenty_train['data'], y = twenty_train['target'])

60 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to 0.0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\_classes.py", line 257, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\_base.py", line 1185, in _fit_liblinear
   

Wall time: 5min 45s


In [7]:
for clf, param in parameters.items():
    predicted = gs[clf].predict(twenty_test['data'])
    print(metrics.classification_report(twenty_test.target, predicted, target_names=categories))

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


                         precision    recall  f1-score   support

comp.os.ms-windows.misc       0.64      0.37      0.46       394
  comp.sys.mac.hardware       0.61      0.28      0.38       385
              sci.space       0.44      0.86      0.58       394

               accuracy                           0.50      1173
              macro avg       0.56      0.50      0.48      1173
           weighted avg       0.56      0.50      0.48      1173

                         precision    recall  f1-score   support

comp.os.ms-windows.misc       0.83      0.68      0.75       394
  comp.sys.mac.hardware       0.80      0.66      0.72       385
              sci.space       0.66      0.90      0.77       394

               accuracy                           0.75      1173
              macro avg       0.77      0.75      0.75      1173
           weighted avg       0.77      0.75      0.75      1173

                         precision    recall  f1-score   support

comp.os.ms-windows

In [8]:
r = {}
def highlight_max(x, color):

    return np.where(x == np.nanmax(x.to_numpy()), f"color: {color};", None)

total_style = pd.Series("font-weight: bold;", index=[1])

for clf, param in parameters.items():
    predicted = gs[clf].predict(twenty_test['data'])
    
    pd.DataFrame(gs[clf].cv_results_).to_excel('all' + clf + '.xlsx')
    pd.DataFrame(classification_report(predicted, twenty_test.target, output_dict=True)).to_excel('best' + clf + '.xlsx')
    

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
