# AI 실습 및 응용 - SVM Topic Classification

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
categories = ['comp.graphics', 'sci.space']
data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42)

In [3]:
categories

['comp.graphics', 'sci.space']

In [4]:
data_train

{'data': ['From: ab@nova.cc.purdue.edu (Allen B)\nSubject: Re: thining algorithm\nOrganization: Purdue University\nLines: 15\n\nIn article <1q7615INNmi@shelley.u.washington.edu> kshin@stein.u.washington.edu  \n(Kevin Shin) writes:\n> I am trying obtain program to preprocess handwriting characters.\n> Like thining algorithm, graph alogrithm.\n> Do anyone know where I can obtain those?\n\nI usually use "Algorithms for graphics and image processing" by\nTheodosios Pavlidis, but other people here got them same idea and now\n3 of 4 copies in the libraries have been stolen!\n\nAnother reference is "Digital Image Processing" by Gonzalez and\nWintz/Wood, which is widely available but a little expensive ($55\nhere- I just checked today).\n\nab\n',
  "From: stephens@geod.emr.ca (Dave Stephenson)\nSubject: Re: Clementine Science Team Selected\nNntp-Posting-Host: ngis.geod.emr.ca\nOrganization: Dept. of Energy, Mines, and Resources, Ottawa\nLines: 32\n\nnickh@CS.CMU.EDU (Nick Haines) writes:\n\n>I

In [5]:
data_test

{'data': ['From: teezee@netcom.com (TAMOOR A. ZAIDI)\nSubject: Hall Generators from USSR\nKeywords: hall generators,thrusters,USSR,JPL\nOrganization: NETCOM On-line Communication Services (408 241-9760 guest)\nLines: 21\n\nHi Folks,\n\n              Last year America bought two  "Hall Generators" which are\nused as thrusters for space vehicles from former USSR,if I could recall\ncorrectly these devices were sent to JPL,Pasadena labs for testing and\nevaluation.\n     \n              I am just curious to know  how these devices work and what\nwhat principle is involved .what became of them.There was also some\ncontroversy that the Russian actually cheated,sold inferior devices and\nnot the one they use in there space vehicles.\n\nAny info will be appreciated...\n  ok   {                         Thank{ in advance...\nTamoor A Zaidi\nLockheed Commercial Aircraft Center\nNorton AFB,San Bernardino\n\nteezee@netcom.com\nde244@cleveland.freenet.edu\n\n',
  'From: henry@zoo.toronto.edu (Henry 

In [6]:
import nltk
nltk.download('names')
nltk.download('wordnet')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\82103\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\82103\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer

In [8]:
all_names = set(names.words())
lemmatizer = WordNetLemmatizer()

In [9]:
def clean_text(docs):
    clean_docs = []
    for doc in docs:
        lemmatized_list = [lemmatizer.lemmatize(word.lower())
                          for word in doc.split()
                          if word.isalpha() and word not in all_names]
        clean_docs.append(''.join(lemmatized_list))
    return clean_docs

In [10]:
cleanded_train = clean_text(data_train.data)
label_train = data_train.target

In [11]:
cleaned_test = clean_text(data_test.data)
label_test = data_test.target

In [12]:
len(label_train), len(label_test)

(1177, 783)

In [13]:
from collections import Counter
Counter(label_train)

Counter({0: 584, 1: 593})

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(sublinear_tf = True, max_df=0.5, stop_words = 'english', max_features=8000)
term_docs_train = tfidf_vectorizer.fit_transform(cleanded_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)

In [15]:
from sklearn.svm import SVC
svm = SVC(kernel = 'linear', C = 1.0, random_state = 42)
svm.fit(term_docs_train, label_train)

SVC(kernel='linear', random_state=42)

In [16]:
accuracy = svm.score(term_docs_test, label_test)
print('The accuracy on testing set is : {0:.1f}%'.format(accuracy*100))

The accuracy on testing set is : 50.3%
