# 20 뉴스그룹 분류

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all', random_State = 2021)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


## 데이터 탐색

In [3]:
news.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
from sklearn.datasets import load_iris
iris = load_iris()
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [6]:
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [8]:
pd.Series(news.target).value_counts().sort_index()

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64

In [9]:
len(news.data)

18846

In [10]:
print(news.data[0])

From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




## 훈련/테스트용 데이터 추출

In [19]:
train_news = fetch_20newsgroups(
    subset='train', random_state = 2021,
    remove = ('headers', 'footers', 'quotes')
)
len(train_news.data)

11314

In [20]:
test_news = fetch_20newsgroups(
    subset='test', random_state = 2021,
    remove = ('headers', 'footers', 'quotes')
)
len(test_news.data)

7532

In [15]:
print(train_news.data)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## 텍스트 데이터에 대해서 전처리

In [21]:
train_df = pd.DataFrame({'article': train_news.data})
test_df = pd.DataFrame({'article': test_news.data})

- train dataset

In [22]:
# 특수문자 제거
train_df['article'] = train_df.article.str.replace('[^A-Za-z]', ' ')
train_df['article'][1]

' Is it possible to do a  wheelie  on a motorcycle with shaft drive   yes  '

In [24]:
# 길이가 3 이하인 단어 제거
train_df['article'] = train_df.article.apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
train_df.article[1]

'possible wheelie motorcycle with shaft drive'

In [25]:
# 소문자로 변환
train_df['article'] = train_df.article.apply(lambda x: x.lower())

- 테스트 데이터셋

In [26]:
# 특수문자 제거
test_df['article'] = test_df.article.str.replace('[^A-Za-z]', ' ')
test_df['article'][1]
# 길이가 3 이하인 단어 제거
test_df['article'] = test_df.article.apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
test_df.article[1]
# 소문자로 변환
test_df['article'] = test_df.article.apply(lambda x: x.lower())

## 텍스트 변환

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
tvect = TfidfVectorizer(stop_words='english')
tvect.fit(train_df.article)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [31]:
X_train = tvect.transform(train_df.article)
X_test = tvect.transform(test_df.article)
X_train.shape, X_test.shape

((11314, 64133), (7532, 64133))

In [32]:
y_train = train_news.target
y_test = test_news.target

## 훈련/예측/평가

In [33]:
# Support Vector Machine의 Classfier 사용
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [34]:
pred = svc.predict(X_test)

In [36]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.6488316516197558