# Text classification using Naive Bayes Classifier
- Reference: https://wikidocs.net/22892

## Import

In [10]:
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import CountVectorizer #text를 입력받아 BoW생성 후 리턴
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB # 다항분포 나이브 베이즈 모델
from sklearn.metrics import accuracy_score #정확도 계산

## Load dataset

In [3]:
# Load only the training split of the 20 Newsgroups corpus.
newsdata = fetch_20newsgroups(subset="train")
# List the fields available on the returned container.
print(newsdata.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [6]:
# Length of each field of the training bunch, reported as one tuple.
bunch_fields = ("data", "filenames", "target_names", "target", "DESCR")
tuple(len(newsdata[field]) for field in bunch_fields)

(11314, 11314, 20, 11314, 9535)

In [7]:
# Category (newsgroup) names. A document's integer target is an index into
# this list.
newsdata.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [9]:
# Inspect the first training document: raw text, integer label, label name.
first_label = newsdata.target[0]
newsdata.data[0], first_label, newsdata.target_names[first_label]

("From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 7,
 'rec.autos')

## Text preprocessing

In [11]:
# Build the bag-of-words document-term matrix. The vocabulary is learned from
# the training documents here and reused later via `dtmvector.transform`.
dtmvector = CountVectorizer()
x_train_dtm = dtmvector.fit_transform(newsdata.data)
print(x_train_dtm.shape)  # (number of samples (documents), number of words)

(11314, 130107)


In [15]:
# Print the sparse count matrix: each line is (document index, word index)
# followed by how many times that word occurs in that document.
print(x_train_dtm)

  (0, 4605)	1
  (0, 16574)	1
  (0, 18299)	1
  (0, 26073)	1
  (0, 27436)	1
  (0, 28615)	2
  (0, 32311)	1
  (0, 34181)	1
  (0, 34995)	1
  (0, 35187)	1
  (0, 35612)	1
  (0, 35983)	1
  (0, 37433)	1
  (0, 37565)	1
  (0, 37780)	5
  (0, 40998)	1
  (0, 42876)	1
  (0, 45295)	1
  (0, 48618)	1
  (0, 48620)	1
  (0, 50111)	1
  (0, 50527)	2
  (0, 51730)	1
  (0, 51793)	1
  (0, 56979)	3
  :	:
  (11313, 89860)	1
  (11313, 90252)	1
  (11313, 90379)	1
  (11313, 90946)	1
  (11313, 92218)	2
  (11313, 94291)	1
  (11313, 94524)	1
  (11313, 95162)	1
  (11313, 101950)	1
  (11313, 105818)	1
  (11313, 106209)	1
  (11313, 106271)	1
  (11313, 107339)	1
  (11313, 109661)	1
  (11313, 110796)	2
  (11313, 111322)	1
  (11313, 111695)	1
  (11313, 113435)	1
  (11313, 113812)	1
  (11313, 115621)	1
  (11313, 116027)	1
  (11313, 117033)	1
  (11313, 119714)	1
  (11313, 124103)	1
  (11313, 124370)	1


In [13]:
# Re-weight the raw term counts into TF-IDF scores. The transformer is fitted
# on the training count matrix and reused later for the test documents.
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(x_train_dtm)
# Same (documents, words) layout as the count matrix it was built from.
print(tfidf_matrix.shape)

(11314, 130107)


In [16]:
# Print the sparse TF-IDF matrix: each line is (document index, word index)
# followed by that word's TF-IDF weight in that document.
print(tfidf_matrix)

  (0, 128420)	0.04278499079283093
  (0, 128402)	0.05922294083277842
  (0, 128026)	0.060622095889758885
  (0, 124931)	0.08882569909852546
  (0, 124031)	0.10798795154169122
  (0, 123989)	0.08207027465330353
  (0, 123984)	0.036854292634593756
  (0, 123796)	0.049437556160455476
  (0, 123292)	0.14534718515938805
  (0, 123162)	0.2597090245735688
  (0, 118983)	0.037085978050619146
  (0, 118280)	0.2118680720828169
  (0, 115475)	0.042472629883573
  (0, 114731)	0.14447275512784058
  (0, 114688)	0.06214070986309586
  (0, 114579)	0.03671830826216751
  (0, 114455)	0.12287762616208957
  (0, 114428)	0.05511105154696676
  (0, 113986)	0.17691750674853082
  (0, 111322)	0.01915671802495043
  (0, 109581)	0.10809248404447917
  (0, 109271)	0.10844724822064673
  (0, 108252)	0.07526015712540636
  (0, 106116)	0.09869734624201922
  (0, 104813)	0.08462829788929047
  :	:
  (11313, 62696)	0.06213004660468942
  (11313, 60910)	0.34638730155641734
  (11313, 60803)	0.07995422310508192
  (11313, 56979)	0.03970306835789

## Train naive bayes classifier

In [18]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF features, using the
# integer newsgroup ids as labels.
mod = MultinomialNB()
mod.fit(tfidf_matrix, newsdata.target)
# alpha=1.0, so Laplace smoothing is applied

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Evaluate

In [19]:
# Load the held-out test split.
newsdata_test = fetch_20newsgroups(subset='test', shuffle=True)
# Only `transform` is called (no re-fitting), so the test documents are encoded
# with the vectorizer and TF-IDF transformer that were fitted on the training
# data.
X_test_dtm = dtmvector.transform(newsdata_test.data)
tfidfv_test = tfidf_transformer.transform(X_test_dtm)

In [20]:
# (number of test documents, number of words); the word dimension should match
# the training matrix because the same fitted vectorizer was used.
tfidfv_test.shape

(7532, 130107)

In [21]:
# Predict a newsgroup id for every test document and compare against the true
# labels. The printed label "정확도" is Korean for "accuracy".
predicted = mod.predict(tfidfv_test)
test_accuracy = accuracy_score(newsdata_test.target, predicted)
print(f"정확도: {test_accuracy}")

정확도: 0.7738980350504514
