# 1. 데이터 읽어 오기

In [1]:
from sklearn.datasets import fetch_20newsgroups

# Load the complete corpus (subset='all') with a fixed random_state.
news_data = fetch_20newsgroups(
    subset='all',
    random_state=10,
)
# List the fields available on the loaded dataset object.
print(news_data.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [2]:
# Display the on-disk file path recorded for each document.
# NOTE(review): the rendered output contains absolute local paths (including
# the OS user name); consider clearing this output before sharing the notebook.
news_data.filenames

array(['C:\\Users\\kys05\\scikit_learn_data\\20news_home\\20news-bydate-train\\comp.os.ms-windows.misc\\9549',
       'C:\\Users\\kys05\\scikit_learn_data\\20news_home\\20news-bydate-test\\comp.sys.mac.hardware\\52163',
       'C:\\Users\\kys05\\scikit_learn_data\\20news_home\\20news-bydate-test\\comp.sys.ibm.pc.hardware\\61041',
       ...,
       'C:\\Users\\kys05\\scikit_learn_data\\20news_home\\20news-bydate-test\\rec.sport.hockey\\54273',
       'C:\\Users\\kys05\\scikit_learn_data\\20news_home\\20news-bydate-train\\comp.sys.mac.hardware\\51734',
       'C:\\Users\\kys05\\scikit_learn_data\\20news_home\\20news-bydate-test\\comp.os.ms-windows.misc\\10065'],
      dtype='<U95')

In [4]:
# Print the list of newsgroup category names.
print(news_data.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [5]:
# Print the integer class label stored for each document.
print(news_data.target)

[ 2  4  3 ... 10  4  2]


In [6]:
# Print the dataset's bundled description text.
print(news_data.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

Classes                     20
Samples total            18846
Dimensionality               1
Features                  text

# 2. 데이터 전처리

## 2.1 데이터 확인

In [9]:
import pandas as pd

# Count documents per integer label and order the result by label value.
label_counts = pd.Series(news_data.target).value_counts()
label_counts = label_counts.sort_index()
print(label_counts)
# Print the category names alongside the counts for reference.
print(news_data.target_names)

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
Name: count, dtype: int64
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [10]:
# Print the raw text of the first document, headers and signature included.
print(news_data.data[0])

From: carlf@panix.com (Carl Fink)
Subject: Re: Panasonic KX-P1091i Driver?
Organization: PANIX Public Access Unix, NYC
Lines: 13

In <1993Apr16.040946.26896@netnews.noc.drexel.edu> brzyckmj@dunx1.ocs.drexel.edu (Mike) writes:

>Does anyone out there know if there are print drivers for Windows for the
>Panasonic KX-P1091i 9-pin dot matrix printer?

  I've been told that Panasonic has uploaded some to Compu$erve, but I
don't have a CIS account.  I just use the Epson FX-80 driver myself,
and it comes out very pretty (if very slowly) on my 1080i.
-- 
Carl Fink          carlf@panix.com, C.FINK4(GEnie), or CF427620I@LIUVAX.BITNET
  "Facts are stubborn things; and whatever may be our wishes, our
  inclinations, or the dictates of our passions, they cannot alter
  the state of facts and evidence"     --      John Adams



## 2.2 데이터 분류

In [5]:
from sklearn.datasets import fetch_20newsgroups
# Training split with headers, footers and quoted replies removed from each
# post, so only the message body remains.
train_news = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    random_state=10,
)
X_train, y_train = train_news.data, train_news.target
print(type(X_train))

<class 'list'>


In [13]:
# Inspect the first training document after header/footer/quote removal.
X_train[0]

'Someone in Canada asked me to send him some public domain DES file\nencryption code I have.  Is it legal for me to send it?\n\nThanx.\n--\nEschew Obfuscation\n\nRob deFriesse                    Mail:  rj@ri.cadre.com\nCadre Technologies Inc.          Phone:  (401) 351-5950\n222 Richmond St.                 Fax:    (401) 351-7380\nProvidence, RI  02903'

In [6]:
# Test split, loaded with the same removal settings as the training split.
test_news = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    random_state=10,
)
X_test, y_test = test_news.data, test_news.target

print(type(X_test))

<class 'list'>


In [17]:
# Number of documents in the training and test splits, as a (train, test) tuple.
len(X_train), len(X_test)

(11314, 7532)

## 2.3 데이터 벡터화

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training texts only, then encode both splits
# as term-count matrices using that same vocabulary.
cnt_vect = CountVectorizer()
X_train_cnt_vect = cnt_vect.fit_transform(X_train)
X_test_cnt_vect = cnt_vect.transform(X_test)

# (number of training documents, vocabulary size)
X_train_cnt_vect.shape

(11314, 101631)

# 3. 모델 생성

## 3.1 모델 만들기

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline classifier trained on raw term counts; fit() returns the estimator
# itself, so construction and training can be chained.
lr_clf = LogisticRegression(max_iter=1000).fit(X_train_cnt_vect, y_train)
lr_clf

## 3.2 예측

In [22]:
# Score the count-based model on the held-out test documents.
pred = lr_clf.predict(X_test_cnt_vect)
accuracy_score(y_true=y_test, y_pred=pred)

0.5966542750929368

# 4. TF-IDF 기반 모델 생성(Quiz)

## 4.1 벡터화

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with default settings: vocabulary and IDF weights are
# learned from the training texts and reused to encode the test texts.
tfidf_vect = TfidfVectorizer()
X_train_tfidf_vect = tfidf_vect.fit_transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

## 4.2 모델 만들기

In [24]:
# Same classifier settings as the count-based baseline, trained on TF-IDF features.
lr_clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf_vect, y_train)
lr_clf

## 4.3 예측

In [25]:
# Test-set accuracy of the TF-IDF model.
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_true=y_test, y_pred=pred)

0.6744556558682953

# 5. 전처리 후 모델 생성

## 5.1 데이터 벡터화

In [26]:
# TF-IDF with English stop words removed, unigrams plus bigrams, and terms
# that appear in more than 300 documents dropped (integer max_df is a count).
tfidf_vect = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_df=300,
)
X_train_tfidf_vect = tfidf_vect.fit_transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

## 5.2 모델 만들기

In [27]:
# Retrain on the filtered unigram+bigram TF-IDF features.
lr_clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf_vect, y_train)
lr_clf

In [28]:
# Test-set accuracy after the vectorizer changes.
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_true=y_test, y_pred=pred)

0.6918481147105683

# 6. 하이퍼 파라미터

## 6.1 하이퍼 파라미터 적용

In [29]:
# C is the inverse regularization strength: C=0.01 regularizes heavily.
# NOTE(review): C values in this section are compared on the test split;
# a validation split or cross-validation would avoid tuning on test data.
lr_clf = LogisticRegression(C=0.01, max_iter=1000).fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_true=y_test, y_pred=pred)

0.4877854487519915

In [30]:
# Train with C=0.1 and score on the test split.
lr_clf = LogisticRegression(C=0.1, max_iter=1000).fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_true=y_test, y_pred=pred)

0.6448486457780138

In [31]:
# Train with C=1 and score on the test split.
lr_clf = LogisticRegression(C=1, max_iter=1000).fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_true=y_test, y_pred=pred)

0.6918481147105683

In [32]:
# Train with C=5 and score on the test split.
lr_clf = LogisticRegression(C=5, max_iter=1000).fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_true=y_test, y_pred=pred)

0.6992830589484864

In [33]:
# Train with C=10 and score on the test split.
lr_clf = LogisticRegression(C=10, max_iter=1000).fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_true=y_test, y_pred=pred)

0.7014073287307488

In [34]:
# Train with C=20 and score on the test split.
lr_clf = LogisticRegression(C=20, max_iter=1000).fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_true=y_test, y_pred=pred)

0.7003451938396177

In [35]:
# Train with C=100 (weakest regularization tried) and score on the test split.
lr_clf = LogisticRegression(C=100, max_iter=1000).fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_true=y_test, y_pred=pred)

0.7008762612851832

In [37]:
# Explicit L2 penalty with C=10; no solver is specified, so the default is used.
lr_clf = LogisticRegression(penalty='l2', C=10, max_iter=1000).fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_true=y_test, y_pred=pred)

0.7014073287307488

In [None]:
# L1 penalty with C=10, paired with the saga solver.
lr_clf = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=10,
    max_iter=1000,
).fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_true=y_test, y_pred=pred)

In [39]:
# Elastic-net penalty with an even L1/L2 mix (l1_ratio=0.5), using the saga solver.
lr_clf = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.5,
    C=10,
    max_iter=1000,
).fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_true=y_test, y_pred=pred)

> 기본 규제는 l2이며, solver(최적화 알고리즘)를 따로 지정하지 않아도 기본 solver(lbfgs)로 사용할 수 있다.

> l1 규제는 liblinear 또는 saga, ElasticNet 규제는 saga solver를 명시적으로 지정해야 사용할 수 있다.

> liblinear : 작은 데이터셋에 적합한 solver로, 기본적으로 이진 분류용이다 (다중 분류에는 one-vs-rest 방식이 필요).

> saga : 대용량 데이터셋에 적합한 solver (l1, l2, ElasticNet 규제를 모두 지원).

## 6.2 예측

In [None]:
# Score whichever model is currently bound to lr_clf (the most recently
# executed training cell) on the test split.
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_true=y_test, y_pred=pred)

# 7. 파이프라인(Pipeline)

> 지금까지의 작업 내용을 간략하게 정리하면 다음과 같다.
1. 데이터 전처리
2. 모델 생성
3. 예측

> 이러한 일련의 과정들을 하나의 파이프에서 물이 흘러가듯 표현하는 방식이 파이프라인이다.

## 7.1 기본 파이프라인

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Chain the vectorizer and the classifier so raw text can be passed straight
# to fit/predict; the settings match the ones used in the earlier cells.
pipeline_steps = [
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=300)),
    ('lr_clf', LogisticRegression(max_iter=1000, C=10)),
]
pipeline = Pipeline(steps=pipeline_steps)

# fit() returns the pipeline itself, so training and prediction can be chained.
predict = pipeline.fit(X_train, y_train).predict(X_test)

In [8]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the pipeline's predictions.
accuracy_score(y_test, predict)

0.7014073287307488

## 7.2 하이퍼파라미터 추가

In [None]:
from sklearn.model_selection import GridSearchCV

# Pipeline whose vectorizer and classifier settings are tuned jointly below.
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression(max_iter=1000))
])

# Grid keys use the "<step name>__<parameter name>" form so that each value
# is routed to the matching pipeline step.
params = {
    'tfidf_vect__ngram_range' : [(1,1),(1,2),(1,3)],
    'tfidf_vect__max_df':[100,200,300],
    'lr_clf__C' : [1,5,10]
}

# 3 x 3 x 3 = 27 parameter combinations, each evaluated with 3-fold
# cross-validation on the training data only (81 fits plus the final refit).
grid_cv = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv.fit(X_train, y_train)

print(grid_cv.best_params_)
# The fitted search exposes the best mean CV score as `best_score_` (singular);
# `best_scores_` does not exist and raised AttributeError after the search ran.
print(grid_cv.best_score_)