# 20 뉴스 그룹 분류

In [3]:
import numpy as np 
import pandas as pd

In [4]:
from sklearn.datasets import fetch_20newsgroups

In [5]:
# Load the 20 newsgroups corpus with subset='all', i.e. the train and test
# portions together; random_state pins the seed used when the posts are shuffled.
news_data = fetch_20newsgroups(subset='all', random_state=2021)

In [6]:
# Show the description text that ships with the dataset.
print(news_data.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

### 데이터 탐색

In [7]:
# List the fields available on the returned dataset object.
news_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [8]:
# The same field names are also exposed as attributes of the object.
dir(news_data)

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [9]:
# Newsgroup names; an integer class label is an index into this list.
news_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [10]:
# value_counts()는 어떤 컬럼/Series의 unique value들을 count해주는 함수입니다
# 결과는 인덱스가 unique value들이고, 값은 count가 들어가 있는 Series입니다.

In [11]:
# Count the posts for each class label and order the result by label id.
pd.Series(news_data.target).value_counts().sort_index()

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64

In [12]:
# Inspect one raw post; header lines and quoted reply text are still present here.
print(news_data.data[0])

From: dagibbs@quantum.qnx.com (David Gibbs)
Subject: Re: Countersteering sans Hands
Organization: QNX Software Systems, Ltd.
Lines: 22

In article <1993Apr20.203344.8417@cs.cornell.edu> karr@cs.cornell.edu (David Karr) writes:
>In article <Clarke.6.735328328@bdrc.bd.com> Clarke@bdrc.bd.com (Richard Clarke) writes:
>>So how do I steer when my hands aren't on the bars? (Open Budweiser in left 
>>hand, Camel cigarette in the right, no feet allowed.) 
>
>>If I lean, and the 
>>bike turns, am I countersteering?
>
>No, the bars would turn only *toward* the direction of turn in
>no-hands steering.

Just in case the original poster was looking for a serious answer,
I'll supply one.

Yes, even when steering no hands you do something quite similar
to countersteering.  Basically to turn left, you to a quick wiggle
of the bike to the right first, causing a counteracting lean to
occur to the left.  It is a lot more difficult to do on a motorcycle
than a bicycle though, because of the extra weight. 

### 훈련/테스트용 데이터 추출

In [13]:
# Training split, requested with headers, footers and quoted replies removed
# from every post.
train_news = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    random_state=2021,
)

# Raw texts and their integer class labels.
X_train, y_train = train_news.data, train_news.target

In [14]:
# Label id of the first training post and the newsgroup name it maps to.
train_news.target[0] ,train_news.target_names[train_news.target[0]]

(19, 'talk.religion.misc')

In [15]:
# Test split, requested with the same header/footer/quote removal as the
# training split so both are preprocessed identically.
test_news = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    random_state=2021,
)

# Raw texts and their integer class labels.
X_test, y_test = test_news.data, test_news.target

In [16]:
# Number of documents in the training and test splits.
len(X_train), len(X_test)

(11314, 7532)

### 피쳐 벡터화 변환과 머신러닝 모델 학습/예측/평가

- Case1. CountVectorizer + LogisticRegression

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training documents only, then encode both
# splits with that same vocabulary (no further train/test splitting happens here).
cvect = CountVectorizer().fit(X_train)
X_train_cv, X_test_cv = (cvect.transform(docs) for docs in (X_train, X_test))

In [18]:
# (n_documents, vocabulary_size) per split; the column count matches because
# both matrices come from the vectorizer fitted on the training texts.
X_train_cv.shape, X_test_cv.shape

((11314, 101631), (7532, 101631))

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
# Case 1: logistic regression trained directly on the raw term-count matrix.
# NOTE(review): the recorded run of this cell stopped at the iteration limit
# (max_iter=300) before the solver converged, so the accuracy reported for this
# case comes from a non-converged model. Consider a higher max_iter or scaled
# features.
lr = LogisticRegression(max_iter=300)
lr.fit(X_train_cv, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=300)

In [21]:
from sklearn.metrics import accuracy_score

In [22]:
# Predict the test split with the Case 1 model and report its accuracy.
pred = lr.predict(X_test_cv)
accuracy_score(y_test, pred)

0.5966542750929368

In [23]:
# Compare the first five true labels with the first five predictions.
y_test[:5], pred[:5]

(array([13, 11,  9,  6, 19]), array([13, 12,  9,  6, 13]))

- Case2: Tfidf Vectorizer

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vocabulary and weights on the training documents only, then
# encode both splits with the fitted vectorizer.
tvect = TfidfVectorizer().fit(X_train)
X_train_tf, X_test_tf = (tvect.transform(docs) for docs in (X_train, X_test))

In [25]:
from sklearn.linear_model import LogisticRegression
# Case 2 classifier: logistic regression with all default settings.
lf = LogisticRegression()

In [26]:
# Train the Case 2 classifier on the TF-IDF training matrix.
lf.fit(X_train_tf, y_train)

LogisticRegression()

In [27]:
# Predict labels for the TF-IDF test matrix.
y_test_pred = lf.predict(X_test_tf)

In [28]:
from sklearn.metrics import accuracy_score
# Accuracy of the Case 2 (TF-IDF + default logistic regression) model.
accuracy_score(y_test, y_test_pred)
# roughly 67% accuracy

0.6736590546999469

In [29]:
# Vectorizer도 하이퍼파라미터..

- Case3: stop_words filtering, ngram_range=(1,2), max_df = 300

In [30]:
# Case 3 vectorizer: English stop words removed, unigrams plus bigrams, and
# max_df=300 (an integer max_df is an absolute document-count cutoff).
tvect2 = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_df=300,
).fit(X_train)

# Encode both splits with the vocabulary learned from the training texts.
X_train_tf2, X_test_tf2 = (tvect2.transform(docs) for docs in (X_train, X_test))

In [31]:
# Case 3 classifier: rebinds `lr` to a new model trained on the Case 3 TF-IDF features.
lr = LogisticRegression(max_iter=300)
lr.fit(X_train_tf2, y_train)

LogisticRegression(max_iter=300)

In [32]:
# Test-set predictions for Case 3 (this overwrites the earlier `pred`).
pred = lr.predict(X_test_tf2)

In [33]:
# Accuracy of the Case 3 model on the test split.
accuracy_score(y_test, pred)

0.6922464152947424

- Case4. Case3에서 LogisticRegression C값을 10으로

In [34]:
#C값조절

# lr = LogisticRegression(max_iter=300, C=10)
# lr.fit(X_train_tf2, y_train)
# pred2 = lr.predict(X_test_tf2)
# accuracy_score(y_test, pred2)

### Pipeline과 GridSearchCV를 통한 하이퍼 파라미터 튜닝

In [35]:
from sklearn.pipeline import Pipeline 

# Chain the vectorizer and the classifier so a single fit()/predict() call runs
# both steps on raw text. The step names ('tvect', 'lr') are the prefixes used
# in the `<step>__<param>` keys of the hyper-parameter grid.
pipeline = Pipeline([
    ('tvect', TfidfVectorizer(stop_words='english')),
    # max_iter=300 matches the standalone Case 3 model; with the library default
    # the grid-search scores would not be comparable to it and the solver is
    # more likely to stop before converging.
    ('lr', LogisticRegression(max_iter=300)),
])

In [36]:
# Hyper-parameter grid for the pipeline: each keyword is `<step name>__<parameter>`,
# each value the list of candidates to try (2 x 2 x 2 = 8 combinations).
params = dict(
    tvect__ngram_range=[(1, 1), (1, 2)],
    tvect__max_df=[300, 700],
    lr__C=[1, 10],
)

In [37]:
from sklearn.model_selection import GridSearchCV

# 3-fold cross-validated search over `params`, scored by accuracy.
# n_jobs=-1 lets the candidate fits run in parallel; verbose=1 prints progress.
grid_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [38]:
# Run the search on the raw training texts; vectorization happens inside the pipeline.
# NOTE(review): the recorded run of this cell was interrupted (KeyboardInterrupt),
# so the saved notebook holds no completed search. It must run to completion
# before the evaluation cell that follows can work.
grid_pipe.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


KeyboardInterrupt: 

In [2]:
# Evaluate the tuned pipeline on the held-out test texts.
# `best_estimator_` is the refit best pipeline, an estimator object rather than
# a function, so predictions must go through its .predict() method.
# This cell depends on `grid_pipe` having been fitted in the current kernel
# session; the recorded NameError came from running it on a fresh kernel.
pred = grid_pipe.best_estimator_.predict(X_test)
accuracy_score(y_test, pred)

NameError: name 'grid_pipe' is not defined