### Support Vector Machine (SVM): determining the main topic of a news post

In [1]:
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
import numpy as np 
import pandas as pd

# Restrict the 20 Newsgroups corpus to the two topics being told apart,
# taking every available post (subset='all').
NEWS_CATEGORIES = ['alt.atheism', 'sci.space']
newsgroups = datasets.fetch_20newsgroups(subset='all', categories=NEWS_CATEGORIES)

In [2]:
# Raw post texts and the class label of each post
X = newsgroups.data
y = newsgroups.target

# Convert the texts into TF-IDF features
vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)

# Search for the best regularization parameter C among 10^-5 ... 10^5,
# scoring each candidate by accuracy under shuffled 5-fold cross-validation.
RANDOM_STATE = 241
grid = {'C': 10.0 ** np.arange(-5, 6)}
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
clf = SVC(kernel='linear', random_state=RANDOM_STATE)
gs = GridSearchCV(estimator=clf, param_grid=grid, scoring='accuracy', cv=cv)
gs.fit(X_vec, y)

In [3]:
# Best parameter C found by the grid search (dict of the form {'C': value})
C_best = gs.best_params_

# GridSearchCV refits by default (refit=True): after the search it trains a
# clone of `clf` with the best parameters on the whole dataset and exposes it
# as `best_estimator_`. Reusing it avoids training an identical linear SVC
# a second time.
clf_best = gs.best_estimator_

# Display the selected model
clf_best

SVC(kernel='linear', random_state=241)

In [40]:
# Find the 10 words whose SVM coefficients have the largest absolute value.

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement `get_feature_names_out()` and fall back to the old
# method only on releases that do not provide the new one.
if hasattr(vectorizer, 'get_feature_names_out'):
    names_arr = np.asarray(vectorizer.get_feature_names_out())
else:
    names_arr = np.array(vectorizer.get_feature_names())

# The coefficients are read through the `.data` / `.indices` attributes of
# `coef_`, so only the stored entries are kept; `.indices` maps each stored
# weight back to its word.
coef_mat = pd.Series(clf_best.coef_.data, index=names_arr[clf_best.coef_.indices],
                     name='Coefficients')

# Rank by absolute weight, keep the top 10, then order the words alphabetically.
words = coef_mat.abs().sort_values(ascending=False).head(10)
words = words.index.sort_values(ascending=True).to_numpy()

# Optional: save the answer as a space-separated list of words.
# with open('answer_text.txt', 'w') as txt_file:
#     txt_file.write(' '.join(words))

words
