In [1]:
# News classification
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global random state.
np.random.seed(2022)

In [2]:
# Data Load
# Fetch the 20 Newsgroups corpus through scikit-learn, using the loader's default arguments.
from sklearn.datasets import fetch_20newsgroups

newsgroup = fetch_20newsgroups()

In [3]:
# Pull the raw documents and their class labels out of the loaded dataset.
data = newsgroup["data"]
target = newsgroup["target"]

In [4]:
# Inspect the data: print the first raw document.
print(data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [5]:
# Class label of the first document.
target[0]

7

In [6]:
# Newsgroup names. Labels appear to index into this list
# (label 7 -> 'rec.autos', which matches the car post printed earlier).
newsgroup["target_names"]

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [7]:
# Total number of newsgroups (classes).
len(newsgroup["target_names"])

20

In [8]:
# Wrap the documents and labels as named pandas Series.
# Note: `target` is rebound here from the loader's label array to a Series.
text = pd.Series(data, name="text")
target = pd.Series(target, name="target")

In [9]:
# Combine the text and target Series column-wise into a single DataFrame.
# `axis` is passed by keyword: the positional form is deprecated in pandas 1.x
# and raises a TypeError in pandas 2.x, where it is keyword-only.
df = pd.concat([text, target], axis=1)

In [10]:
# Display the combined frame (columns: text, target).
df

Unnamed: 0,text,target
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14
...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,4
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1


In [11]:
# Number of documents per class label, listed in label order.
label_counts = df["target"].value_counts()
label_counts.sort_index()

0     480
1     584
2     591
3     590
4     578
5     593
6     585
7     594
8     598
9     597
10    600
11    595
12    591
13    594
14    593
15    599
16    546
17    564
18    465
19    377
Name: target, dtype: int64

In [12]:
# Preview the rows whose label lies in the inclusive range 16-19.
df.query("16 <= target <= 19")

Unnamed: 0,text,target
5,From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)\...,16
11,From: david@terminus.ericsson.se (David Bold)\...,19
33,From: ayr1@cunixa.cc.columbia.edu (Amir Y Rose...,17
34,From: joec@hilbert.cyprs.rain.com ( Joe Cipale...,18
39,From: bressler@iftccu.ca.boeing.com (Rick Bres...,16
...,...,...
11277,From: bob1@cos.com (Bob Blackshaw)\nSubject: R...,17
11280,From: jake@bony1.bony.com (Jake Livni)\nSubjec...,17
11299,From: 2120788@hydra.maths.unsw.EDU.AU ()\nSubj...,17
11304,From: Pegasus@aaa.uoregon.edu (Pegasus)\nSubje...,19


In [13]:
# Keep only the documents from the last four newsgroups (labels 16-19).
df_sample = df.query("16 <= target <= 19")

In [14]:
# Rebind `data` / `target` to the four-class subset; from this point on they
# no longer refer to the full corpus.
data, target = df_sample["text"], df_sample["target"]

In [15]:
# Number of documents left after the subset filter.
np.array(data).shape

(1952,)

In [16]:
# Data Split
# Hold out 30% of the subset for testing; random_state is fixed so the split is repeatable.
from sklearn.model_selection import train_test_split

train_data, test_data, train_target, test_target = train_test_split(
    data, target, train_size=0.7, random_state=2021
)

In [17]:
# Count Vectorize
# Natural-language text must be converted to numbers before a model can be trained on it.
import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# word_tokenize relies on the Punkt tokenizer data. NLTK 3.9+ looks it up under
# the "punkt_tab" resource, while older releases use "punkt", so fetch both to
# keep the notebook runnable on either. "punkt" is downloaded last so the value
# displayed by this cell is unchanged.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sclab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
# Build the vocabulary from every token that appears in the training documents,
# tokenizing with NLTK's word_tokenize.
cnt_vectorizer = CountVectorizer(tokenizer=word_tokenize)
cnt_vectorizer.fit(train_data)



CountVectorizer(tokenizer=<function word_tokenize at 0x0000022096C8E9D0>)

In [19]:
# Vocabulary size: number of distinct tokens learned by the vectorizer.
len(cnt_vectorizer.vocabulary_)

32012

In [20]:
# Refit with min_df=10: as an integer, min_df is a document-frequency floor, so
# tokens that occur in fewer than 10 training documents are dropped from the vocabulary.
cnt_vectorizer = CountVectorizer(tokenizer=word_tokenize, min_df=10)
cnt_vectorizer.fit(train_data)

CountVectorizer(min_df=10,
                tokenizer=<function word_tokenize at 0x0000022096C8E9D0>)

In [21]:
# Vocabulary size after the min_df=10 filter.
len(cnt_vectorizer.vocabulary_)

4244

In [22]:
# Encode both splits as token-count matrices with the vectorizer that was
# fitted on the training documents (train first, then test).
train_matrix, test_matrix = map(cnt_vectorizer.transform, (train_data, test_data))

In [25]:
# XGBoost
# Classifier created with the library's default hyperparameters.
import xgboost as xgb

xgb_clf = xgb.XGBClassifier()

In [26]:
# Train
# NOTE(review): train_target still carries the original newsgroup labels (16-19).
# Newer xgboost releases reportedly expect class labels 0..n_classes-1 — confirm
# against the installed version and label-encode the target if fit() rejects them.
xgb_clf.fit(train_matrix, train_target)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [27]:
# Predict labels for the training and the held-out matrices, in that order.
xgb_train_pred, xgb_test_pred = map(xgb_clf.predict, (train_matrix, test_matrix))

In [28]:
# Evaluate: fraction of correctly predicted labels on each split.
from sklearn.metrics import accuracy_score

xgb_train_acc = accuracy_score(train_target, xgb_train_pred)
xgb_test_acc = accuracy_score(test_target, xgb_test_pred)

In [29]:
# Report XGBoost accuracy on both splits, to four decimal places.
print(f"XGBoost Train accuracy is {xgb_train_acc:.4f}")
print(f"XGBoost Test accuracy is {xgb_test_acc:.4f}")

XGBoost Train accuracy is 1.0000
XGBoost Test accuracy is 0.9198


In [30]:
# Light GBM
# Classifier created with the library's default hyperparameters.
import lightgbm as lgb

lgb_clf = lgb.LGBMClassifier()

In [31]:
# Inspect the training matrix before fitting (no training happens in this cell).
train_matrix

<1366x4244 sparse matrix of type '<class 'numpy.int64'>'
	with 237328 stored elements in Compressed Sparse Row format>

In [32]:
# Dense array form of the same training counts.
train_matrix.toarray()

array([[1, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 6, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 2, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [33]:
# Train LightGBM on the densified count matrix.
lgb_clf.fit(train_matrix.toarray(), train_target)

LGBMClassifier()

In [34]:
# Predict on the densified training and held-out matrices, in that order.
lgb_train_pred, lgb_test_pred = (
    lgb_clf.predict(matrix.toarray()) for matrix in (train_matrix, test_matrix)
)

In [35]:
# Evaluate: fraction of correctly predicted labels on each split.
lgb_train_acc = accuracy_score(train_target, lgb_train_pred)
lgb_test_acc = accuracy_score(test_target, lgb_test_pred)

In [36]:
# Report LightGBM accuracy on both splits, to four decimal places.
print(f"Light Boost train accuracy is {lgb_train_acc:.4f}")
print(f"Light Boost test accuracy is {lgb_test_acc:.4f}")

Light Boost train accuracy is 1.0000
Light Boost test accuracy is 0.9283


In [37]:
# CatBoost
# Classifier created with the library's default hyperparameters.
import catboost as cb

cb_clf = cb.CatBoostClassifier()

In [38]:
# Train; verbose=False is passed to silence per-iteration logging.
cb_clf.fit(train_matrix, train_target, verbose=False)

<catboost.core.CatBoostClassifier at 0x2209ae12f70>

In [39]:
# Predict labels for the training and the held-out matrices, in that order.
cb_train_pred, cb_test_pred = map(cb_clf.predict, (train_matrix, test_matrix))

In [40]:
# Evaluate: fraction of correctly predicted labels on each split.
cb_train_acc = accuracy_score(train_target, cb_train_pred)
cb_test_acc = accuracy_score(test_target, cb_test_pred)

In [41]:
# Report CatBoost accuracy on both splits, to four decimal places.
print(f"Cat Boost train accuracy is {cb_train_acc:.4f}")
print(f"Cat Boost test accuracy is {cb_test_acc:.4f}")

Cat Boost train accuracy is 1.0000
Cat Boost test accuracy is 0.9420


In [42]:
# Comparison: held-out accuracy of the three boosted models side by side.
# The printed messages previously misspelled "accuracy" as "accuray".
print(f"XGBoost test accuracy is {xgb_test_acc:.4f}")
print(f"Light Boost test accuracy is {lgb_test_acc:.4f}")
print(f"Cat Boost test accuracy is {cb_test_acc:.4f}")

XGBoost test accuray is 0.9198
Light Boost test accuray is 0.9283
Cat Boost test accuray is 0.9420
