In [4]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [3]:
# Fetch the 20 Newsgroups corpus, restricted to the two categories that the
# classifier below has to tell apart.
newsgroups = datasets.fetch_20newsgroups(
    categories=['alt.atheism', 'sci.space'],
    subset='all',
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [5]:
# Raw documents (X) and their class labels (y), as provided by the dataset.
X, y = newsgroups.data, newsgroups.target

In [6]:
# Fit a TF-IDF vectorizer on the documents and transform them in one step.
vectorizer = TfidfVectorizer()
# NOTE: despite its name, X_vectorizer is the transformed feature matrix
# returned by fit_transform, not a vectorizer object.
X_vectorizer = vectorizer.fit_transform(X)

In [7]:
# Vocabulary terms of the fitted vectorizer; features[j] is looked up by
# feature index in the cells below.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(), so prefer the new method and fall back to
# the old one only on releases that do not provide it. Both results support
# integer indexing, which is all the later cells rely on.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out()
else:
    features = vectorizer.get_feature_names()

In [8]:
# Select the best value of the parameter C for a linear SVC by grid search.

# Candidate values of C: powers of ten from 10^-5 up to 10^5.
grid = {'C': 10.0 ** np.arange(-5, 6)}

# Splitter that shuffles the data and divides it into 5 folds for
# cross-validation (seeded so the split is reproducible).
cv = KFold(shuffle=True, n_splits=5, random_state=241)

# The classifier itself.
clf = SVC(random_state=241, kernel='linear')

# Search object: evaluates every C from the grid with the cross-validation
# splitter above, scoring each candidate by accuracy.
gs = GridSearchCV(estimator=clf, param_grid=grid, cv=cv, scoring='accuracy')

# Run the search (trains the classifier for every candidate and fold).
gs.fit(X_vectorizer, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
       error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=241,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [9]:
gs.best_params_ # attribute holding the best C found by the grid search

{'C': 1.0}

In [10]:
# best_estimator_ is the classifier refitted with the best parameters;
# coef_ holds the weights assigned to the features.
# Note that best_estimator_ is an attribute of GridSearchCV, whereas coef_ is
# an attribute of the underlying SVC.
results = gs.best_estimator_.coef_

In [16]:
# `results` is handled as a sparse matrix here (getrow / toarray are
# sparse-matrix methods): take its first row, densify it and flatten it into a
# 1-D vector of per-feature weights.
row = results.getrow(0).toarray().ravel()

# Rank features by the magnitude of their weight (sign ignored). argsort is
# ascending, so the last ten positions are the ten largest absolute weights.
top_ten_indicies = np.argsort(np.abs(row))[-10:]
top_ten_values = row[top_ten_indicies]

# Show the corresponding words, weakest of the ten first.
for idx in top_ten_indicies:
    print(features[idx])

sci
keith
bible
religion
sky
moon
atheists
atheism
god
space


In [17]:
# Echo each selected feature index next to its word.
for a in top_ten_indicies:
    print(a, ' ', features[a])

# Collect the same words and sort them alphabetically.
# The array is built in a single step: calling np.append inside the loop
# copies the whole array on every iteration, and seeding it with an empty list
# creates an empty float array that then has to be coerced to strings.
answer = np.sort(np.array([features[a] for a in top_ten_indicies], dtype=str))
print(answer)

22936   sci
15606   keith
5776   bible
21850   religion
23673   sky
17802   moon
5093   atheists
5088   atheism
12871   god
24019   space
['atheism' 'atheists' 'bible' 'god' 'keith' 'moon' 'religion' 'sci' 'sky'
 'space']
