In [110]:
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer as TV
from sklearn.model_selection import GridSearchCV
from sklearn import svm
import numpy as np
from sklearn.model_selection import KFold

In [111]:
# Restrict the 20 Newsgroups corpus to the two topics being contrasted and
# take every available post (subset='all').
CATEGORIES = ['alt.atheism', 'sci.space']
newsgroups = datasets.fetch_20newsgroups(subset='all', categories=CATEGORIES)

#### Vectorizing the text corpus

In [124]:
# Turn the raw posts into a TF-IDF document-term matrix (X) and keep the
# category labels as the target vector (y).
vectorizer = TV()
corpus, y = newsgroups.data, newsgroups.target
X = vectorizer.fit_transform(corpus)
# Same two printouts as before: the matrix first, then the labels.
print(X, y, sep='\n')


  (0, 7135)	0.09177291452449737
  (0, 4228)	0.09732962706472328
  (0, 7730)	0.02575006576343587
  (0, 10229)	0.07116022661373511
  (0, 19790)	0.13853387972005007
  (0, 15344)	0.05308005187089393
  (0, 20738)	0.06612072788446571
  (0, 23930)	0.025879967583209106
  (0, 7156)	0.05954839972168062
  (0, 17180)	0.08346610253642223
  (0, 21711)	0.09732962706472328
  (0, 26651)	0.04831465306608014
  (0, 23767)	0.09427154747892712
  (0, 18131)	0.04803011953672212
  (0, 27676)	0.0271693788925225
  (0, 5591)	0.07750026888717974
  (0, 16195)	0.04900248039624845
  (0, 4594)	0.029268293013519193
  (0, 21315)	0.08966035046275332
  (0, 5542)	0.1724324039685429
  (0, 19013)	0.052035390940301
  (0, 18071)	0.029078607779420725
  (0, 13384)	0.09427154747892712
  (0, 9776)	0.05697744713041564
  (0, 27786)	0.09732962706472328
  :	:
  (1785, 27676)	0.04497597829354177
  (1785, 18071)	0.024068250464378153
  (1785, 28273)	0.02370385618179079
  (1785, 18954)	0.016313452448975245
  (1785, 12908)	0.06033099767146

#### Searching for the optimal C for the SVC via GridSearchCV

In [113]:
# Candidate values for C: one per decade from 1e-5 up to 1e5.
grid = {'C': 10.0 ** np.arange(-5, 6)}
# Shuffled 5-fold split with a fixed seed so the folds are the same on every run.
cv = KFold(n_splits=5, shuffle=True, random_state=241)
clf = svm.SVC(kernel='linear', random_state=241)
# Score each candidate C by cross-validated accuracy.
gs = GridSearchCV(estimator=clf, param_grid=grid, scoring='accuracy', cv=cv)
gs.fit(X, y)


GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=241, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [114]:
# Show the value of C that achieved the best cross-validated accuracy in the grid search.
gs.best_params_


{'C': 1.0}

#### Training with the found parameter

In [115]:
# Train the final linear SVM on the full data set with the C chosen by the
# grid search. Reading it from gs.best_params_ (instead of relying on SVC's
# default C) keeps this cell correct if the search result ever changes.
clf = svm.SVC(kernel='linear', random_state=241, C=gs.best_params_['C'])
clf.fit(X, y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=241, shrinking=True, tol=0.001,
    verbose=False)

#### From the trained classifier we get a matrix containing the weights of the features (words). First we find the indexes of the top 10 features by absolute weight

In [116]:
# The classifier was fitted on a sparse matrix, so coef_ is sparse as well:
# densify it and take its first row to get one weight per feature.
coefs_matrix = clf.coef_
# Rank features by magnitude regardless of sign; np.abs is vectorised, so no
# Python-level loop over every feature is needed.
coef_arr = np.abs(coefs_matrix.toarray()[0])


def find_indexes_top_x(array, x=10):
    """Return the positions of the ``x`` largest values in ``array``.

    Parameters
    ----------
    array : sequence or 1-D numpy array of comparable values
        The values to rank.
    x : int, default 10
        How many positions to return.

    Returns
    -------
    list of int
        Indexes into ``array`` ordered from the largest value downwards.
        Equal values are reported in order of increasing index, so each
        position appears at most once (the previous value-lookup approach
        returned the first occurrence repeatedly for ties).
    """
    values = np.asarray(array)
    # A stable ascending sort of the *reversed* data, flipped back afterwards,
    # gives descending order with ties resolved to the lower original index.
    # This avoids negating the values, which would fail for unsigned or
    # boolean input.
    reversed_order = np.argsort(values[::-1], kind="stable")
    order = (values.size - 1 - reversed_order)[::-1]
    return order[:x].tolist()

# Positions of the ten features with the largest absolute weight.
indexes = find_indexes_top_x(coef_arr, x=10)
indexes
    
    

[24019, 12871, 5088, 5093, 17802, 23673, 21850, 5776, 15606, 22936]

#### Decoding the found indexes into words via the vectorizer

In [117]:
# Column index -> word lookup. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out()
# when the installed version provides it and fall back otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_mapping = vectorizer.get_feature_names_out()
else:
    feature_mapping = vectorizer.get_feature_names()

In [118]:
# Translate each top-weight column index into the word it stands for.
words = [feature_mapping[idx] for idx in indexes]
words

['space',
 'god',
 'atheism',
 'atheists',
 'moon',
 'sky',
 'religion',
 'bible',
 'keith',
 'sci']

In [119]:
# Put the selected words into alphabetical order for the final answer.
words = sorted(words)
words


['atheism',
 'atheists',
 'bible',
 'god',
 'keith',
 'moon',
 'religion',
 'sci',
 'sky',
 'space']