In [1]:
"""
표제어(Lemmatization) 추출: 서로 형태는 다르지만, root 단어를 가지로 비교해서, 
                            전체적으로 단어의 개수를 줄이자
                            am, are, is, was, were... => be(표제어)
                            
형태소: stem(어간: 단어의 의미), affix(접사: 부가적 의미)
형태소 파싱: 어간, 접사를 분리하는 작업
dog(독립형태소)
dogs=dog(어간)+s(접사)

WordNetLemmatizer: NLTK에 표제어 추출 도구
"""
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [5]:
# wnl.lemmatize('watched') #watched
wnl.lemmatize('watched','v')
wnl.lemmatize('has','v')
wnl.lemmatize('dies','v')

'die'

In [6]:
#어간 추출
text = "Python is an interpreted, high-level, general-purpose programming language."
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [7]:
ps = PorterStemmer()
words = word_tokenize(text)
print(words)

['Python', 'is', 'an', 'interpreted', ',', 'high-level', ',', 'general-purpose', 'programming', 'language', '.']


In [9]:
print([ps.stem(w) for w in words])

['python', 'is', 'an', 'interpret', ',', 'high-level', ',', 'general-purpos', 'program', 'languag', '.']


In [12]:
print(ps.stem("electricical"))
print(ps.stem("formalize"))
#구글: 마틴 포터 or 토퍼스태머 검색

electric
formal


In [14]:
ps.stem('going') #go
ps.stem('gone') #gone
from nltk.stem import LancasterStemmer

In [18]:
ls = LancasterStemmer()
ls.stem('going') #going
ls.stem('gone') #gon
ls.stem('dies') #die

'die'

In [19]:
#불용어:stopwords
from nltk.corpus import stopwords

In [22]:
sw = stopwords.words('english')
ex = "Family is not an important thing. It's everything."
wt = word_tokenize(ex)
res=[]
for w in wt:
    if w not in sw: #stopwords가 아니라면
        res.append(w)
print(wt)
print(res)

['Family', 'is', 'not', 'an', 'important', 'thing', '.', 'It', "'s", 'everything', '.']
['Family', 'important', 'thing', '.', 'It', "'s", 'everything', '.']


In [26]:
# www.ranks.nl/stopwords/korean
#stopwords를 직접 지정하여 제거

ex="""
최근 코로나19로 인한 확진자 및 사망자가 증가하고 있습니다. 코로나19를 이겨냅시다
"""

stop_words = "인한 증가 최근 및"
stop_words = stop_words.split(" ")
wt = word_tokenize(ex) #공백문자로 분리
print(wt)
res=[]
for w in wt:
    if w not in stop_words:
        res.append(w)
print(res)

['최근', '코로나19로', '인한', '확진자', '및', '사망자가', '증가하고', '있습니다', '.', '코로나19를', '이겨냅시다']
['코로나19로', '확진자', '사망자가', '증가하고', '있습니다', '.', '코로나19를', '이겨냅시다']


In [27]:
from nltk.tokenize import * #모든 함수 import
text = "Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects."
text = sent_tokenize(text) #문장 단위로 tokenize #3개 문장
text

['Python is an interpreted, high-level, general-purpose programming language.',
 "Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace.",
 'Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects.']

In [52]:
#모든 단어를 소문자, 불용어 제거, 길이가 2이하 제거
# print(sw)

voc={}
sentences=[]
for t in text:
    words = word_tokenize(t)
    res=[]
    for word in words:
        word = word.lower() #소문자 변환
        if word not in sw: #불용어 제거
            if len(word) > 2: #길이가 2이상인 단어
                res.append(word)
                if word not in voc: #word가 voc에 없으면 키 값을 생성,초기화
                    voc[word]=0
                voc[word]+=1 #word가 voc에 있으면 값에 1을 더해줌
    sentences.append(res)
                
# print(res)
# print(voc)
print(sentences) #[문단[문장1,문장2,문장3]]
#voc={'python':3, ...}  

[['python', 'interpreted', 'high-level', 'general-purpose', 'programming', 'language'], ['created', 'guido', 'van', 'rossum', 'first', 'released', '1991', 'python', 'design', 'philosophy', 'emphasizes', 'code', 'readability', 'notable', 'use', 'significant', 'whitespace'], ['language', 'constructs', 'object-oriented', 'approach', 'aim', 'help', 'programmers', 'write', 'clear', 'logical', 'code', 'small', 'large-scale', 'projects']]


In [41]:
vs = sorted(voc.items(), key=lambda x:x[1], reverse=True) #튜플로 묶어서 리스트 구성 => 정렬
vs

[('python', 2),
 ('language', 2),
 ('code', 2),
 ('interpreted', 1),
 ('high-level', 1),
 ('general-purpose', 1),
 ('programming', 1),
 ('created', 1),
 ('guido', 1),
 ('van', 1),
 ('rossum', 1),
 ('first', 1),
 ('released', 1),
 ('1991', 1),
 ('design', 1),
 ('philosophy', 1),
 ('emphasizes', 1),
 ('readability', 1),
 ('notable', 1),
 ('use', 1),
 ('significant', 1),
 ('whitespace', 1),
 ('constructs', 1),
 ('object-oriented', 1),
 ('approach', 1),
 ('aim', 1),
 ('help', 1),
 ('programmers', 1),
 ('write', 1),
 ('clear', 1),
 ('logical', 1),
 ('small', 1),
 ('large-scale', 1),
 ('projects', 1)]

In [43]:
wi={}
i=0
for w,f in vs:
    if f > 1: #언급된 빈도수가 최소 2이상인 경우
        i+=1
        wi[w] = i #index 부여
print(wi)

{'python': 1, 'language': 2, 'code': 3}


In [47]:
a = wi.items()
for w,i in a:
    print(w)

python
language
code


In [49]:
vocSize=2 #가장 많이 언급된 2개의 단어만 추출

#단어의 인덱스가 vocSize를 초과하는 단어 추출
wordFreq = [w for w,i in wi.items() if i>vocSize]
print(wordFreq)
for w in wordFreq:
    del wi[w]
#인덱스(index)가 3번 이상인 단어는 제거(1,2번만 추출)

['code']


In [50]:
wi

{'python': 1, 'language': 2}

In [None]:
#OOV(Out of Vocabulary: 단어집합에 없는 단어, 챗봇이 처음 보는 단어)
"""
영수:철수야 안녕? (입력 데이터, x)
철수:응 너도 안녕. (출력 데이터, y)
...
철수야 안녕? -> 모델 -> 응 너도 안녕.
"""
#개체명 인식: 이름을 이름이라고 인식하는 것

In [53]:
sentences

[['python',
  'interpreted',
  'high-level',
  'general-purpose',
  'programming',
  'language'],
 ['created',
  'guido',
  'van',
  'rossum',
  'first',
  'released',
  '1991',
  'python',
  'design',
  'philosophy',
  'emphasizes',
  'code',
  'readability',
  'notable',
  'use',
  'significant',
  'whitespace'],
 ['language',
  'constructs',
  'object-oriented',
  'approach',
  'aim',
  'help',
  'programmers',
  'write',
  'clear',
  'logical',
  'code',
  'small',
  'large-scale',
  'projects']]

In [54]:
#원핫인코딩
from konlpy.tag import Okt

In [57]:
okt = Okt()
tok = okt.morphs("나는 자연어 처리를 학습한다")
#원핫벡터: 단어 집합을 벡터로 표현하는 방식

In [58]:
w2i={}
for v in tok:
    if v not in w2i.keys():
        w2i[v] = len(w2i)
print(w2i)

{'나': 0, '는': 1, '자연어': 2, '처리': 3, '를': 4, '학습': 5, '한다': 6}


In [64]:
#'자연어' -> 원핫 -> 0010000
def ohe(w,w2i):
    ohv = [0]*len(w2i)
    index = w2i[w]
    ohv[index] = 1
    return ohv
    
print(ohe("자연어",w2i))

[0, 0, 1, 0, 0, 0, 0]


In [61]:
[0]*len(w2i)

[0, 0, 0, 0, 0, 0, 0]

In [70]:
#케라스 원핫인코딩: to_categorical()
text = "데이터 분석은 판다스 최고야 판다스 곰이야"
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
tok = Tokenizer()
tok.fit_on_texts([text])
print(tok.word_index)
#단어집합(voc)

{'판다스': 1, '데이터': 2, '분석은': 3, '최고야': 4, '곰이야': 5}


In [72]:
sample = "판다스 분석은 동물원에서 한다"
enc = tok.texts_to_sequences([sample])

In [73]:
to_categorical(enc)

array([[[0., 1., 0., 0.],
        [0., 0., 0., 1.]]], dtype=float32)

In [74]:
#단어 분리(BPE) => 기계번역
#학습과정에서 사용되지 않은 단어가 테스트과정에서
#입력되면 -> OOV문제 -> 제대로 모델이 동작X

In [None]:
# run-length 기법  aaaabbbaaaaa =>a4b3a5
# 허프만 트리(인코딩)를 이용한 압축
# a=>101, b=>10, c=>1101 ...
# BPE 알고리즘 => 단어 분리에 응용

In [None]:
# AAABDAAABAC
# BPE 압축
# 연속적인 글자 쌍(2글자)을 구성했을때, 가장 많이 등장
# 1) AA가 가장 많이 등장 => 다른 글자로 치환
# => 소문자 z로 치환
# zABDzABAC

# 2) AB가 가장 많이 등장 => 다른 글자로 치환
# => 소문자 y로 치환
# zyDzyAC

# 3) zy가 가장 많이 등장 => 다른 글자로 치환
# => 소문자 x로 치환
# xDxAC

In [75]:
###토픽 모델링 => LSA
from sklearn.datasets import fetch_20newsgroups

In [77]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=("header","footers","quotes"))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [78]:
documents = dataset.data
len(documents)  #11314건의 뉴스기사

11314

In [80]:
# type(documents)
documents[0]

'From: ab4z@Virginia.EDU ("Andi Beyer")\nSubject: Re: Israeli Terrorism\nOrganization: University of Virginia\nLines: 15'

In [84]:
dataset.target_names
dataset.DESCR



In [83]:
documents[1]

"From: timmbake@mcl.ucsb.edu (Bake Timmons)\nSubject: Re: Amusing atheists and agnostics\nLines: 66\n\n\n\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [None]:
# 각 토픽과 가장 관련성이 높은 단어를 10개씩 출력
# topic1: 
# ~
# topic20

In [87]:
import pandas as pd
type(documents)
newsDf = pd.DataFrame({"document":documents})
newsDf

Unnamed: 0,document
0,"From: ab4z@Virginia.EDU (""Andi Beyer"")\nSubjec..."
1,From: timmbake@mcl.ucsb.edu (Bake Timmons)\nSu...
2,From: bc744@cleveland.Freenet.Edu (Mark Ira Ka...
3,From: ray@ole.cdac.com (Ray Berry)\nSubject: C...
4,From: kkeller@mail.sas.upenn.edu (Keith Keller...
...,...
11309,From: adams@bellini.berkeley.edu (Adam L. Schw...
11310,From: levin@bbn.com (Joel B Levin)\nSubject: R...
11311,From: tedward@cs.cornell.edu (Edward [Ted] Fis...
11312,From: mori@volga.mfd.cs.fujitsu.co.jp (Tsuyosh...


In [89]:
#특수문자 제거(영문자를 제외)
newsDf['clean_doc'] = newsDf['document'].str.replace("[^a-zA-Z]"," ")

In [90]:
newsDf

Unnamed: 0,document,clean_doc
0,"From: ab4z@Virginia.EDU (""Andi Beyer"")\nSubjec...",From ab z Virginia EDU Andi Beyer Subject...
1,From: timmbake@mcl.ucsb.edu (Bake Timmons)\nSu...,From timmbake mcl ucsb edu Bake Timmons Sub...
2,From: bc744@cleveland.Freenet.Edu (Mark Ira Ka...,From bc cleveland Freenet Edu Mark Ira Ka...
3,From: ray@ole.cdac.com (Ray Berry)\nSubject: C...,From ray ole cdac com Ray Berry Subject Cl...
4,From: kkeller@mail.sas.upenn.edu (Keith Keller...,From kkeller mail sas upenn edu Keith Keller...
...,...,...
11309,From: adams@bellini.berkeley.edu (Adam L. Schw...,From adams bellini berkeley edu Adam L Schw...
11310,From: levin@bbn.com (Joel B Levin)\nSubject: R...,From levin bbn com Joel B Levin Subject Re...
11311,From: tedward@cs.cornell.edu (Edward [Ted] Fis...,From tedward cs cornell edu Edward Ted Fis...
11312,From: mori@volga.mfd.cs.fujitsu.co.jp (Tsuyosh...,From mori volga mfd cs fujitsu co jp Tsuyosh...


In [92]:
#3글자 이하 단어 제거, 소문자 변환
newsDf['clean_doc'] = newsDf['clean_doc'].apply(lambda x:' '.join([w for w in x.split() if len(w)>3]))

In [93]:
#소문자 변환
newsDf['clean_doc'] = newsDf['clean_doc'].apply(lambda x:x.lower())

In [94]:
newsDf['clean_doc']

0        from virginia andi beyer subject israeli terro...
1        from timmbake ucsb bake timmons subject amusin...
2        from cleveland freenet mark kaufman subject re...
3        from cdac berry subject clipper business usual...
4        from kkeller mail upenn keith keller subject p...
                               ...                        
11309    from adams bellini berkeley adam schwartz subj...
11310    from levin joel levin subject selective placeb...
11311    from tedward cornell edward fischer subject be...
11312    from mori volga fujitsu tsuyoshi mori subject ...
11313    from marc yogi austin marc stephenson subject ...
Name: clean_doc, Length: 11314, dtype: object

In [98]:
#불용어 제거
sw = stopwords.words('english')
#토큰화
tokenizedDoc = newsDf['clean_doc'].apply(lambda x:x.split())

In [99]:
tokenizedDoc = tokenizedDoc.apply(lambda x: [item for item in x if item not in sw])

In [100]:
tokenizedDoc[1]

['timmbake',
 'ucsb',
 'bake',
 'timmons',
 'subject',
 'amusing',
 'atheists',
 'agnostics',
 'lines',
 'yeah',
 'expect',
 'people',
 'read',
 'actually',
 'accept',
 'hard',
 'atheism',
 'need',
 'little',
 'leap',
 'faith',
 'jimmy',
 'logic',
 'runs',
 'steam',
 'sorry',
 'pity',
 'sorry',
 'feelings',
 'denial',
 'faith',
 'need',
 'well',
 'pretend',
 'happily',
 'ever',
 'anyway',
 'maybe',
 'start',
 'newsgroup',
 'atheist',
 'hard',
 'bummin',
 'much',
 'forget',
 'flintstone',
 'chewables',
 'bake',
 'timmons']

In [102]:
#TF-DF 매트릭스 구성
#TFIDF는 토큰화가 안되어 있는 텍스트 데이터로 구성
#토큰화 <-> 역토큰화(토큰화 취소)
newsDf['clean_doc'][1]

'from timmbake ucsb bake timmons subject amusing atheists agnostics lines yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'

In [103]:
#역토큰화
deTokenizedDoc=[]
for i in range(len(newsDf)):
    temp = ' '.join(tokenizedDoc[i])
    deTokenizedDoc.append(temp)

In [104]:
newsDf['clean_doc'] = deTokenizedDoc

In [105]:
newsDf['clean_doc'][1]

'timmbake ucsb bake timmons subject amusing atheists agnostics lines yeah expect people read actually accept hard atheism need little leap faith jimmy logic runs steam sorry pity sorry feelings denial faith need well pretend happily ever anyway maybe start newsgroup atheist hard bummin much forget flintstone chewables bake timmons'

In [106]:
#tfidf 행렬 구성
from sklearn.feature_extraction.text import TfidfVectorizer
vector = TfidfVectorizer(stop_words='english',max_features=1000) #1000개 단어
res = vector.fit_transform(newsDf['clean_doc'])
res.shape

(11314, 1000)

In [107]:
res

<11314x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 332684 stored elements in Compressed Sparse Row format>

In [None]:
#svd-full, truncated 
#특이값분해
#행렬= U*S*VT
#절단된 SVD -> 차원 축소

In [109]:
#토픽 숫자: n_components
from sklearn.decomposition import TruncatedSVD
svdModel = TruncatedSVD(n_components=20)

In [110]:
svdModel.fit(res)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=5,
             random_state=None, tol=0.0)

In [112]:
import numpy as np
np.shape(svdModel.components_) #VT
#20개의 토픽과 1000개의 단어

(20, 1000)

In [113]:
terms = vector.get_feature_names() #1000개 단어
terms

['ability',
 'able',
 'accept',
 'access',
 'according',
 'account',
 'action',
 'actually',
 'added',
 'addition',
 'address',
 'administration',
 'advance',
 'advice',
 'agencies',
 'agree',
 'algorithm',
 'allow',
 'allowed',
 'allows',
 'amendment',
 'america',
 'american',
 'americans',
 'analysis',
 'andrew',
 'angeles',
 'anonymous',
 'answer',
 'answers',
 'anti',
 'anybody',
 'apparently',
 'appear',
 'appears',
 'apple',
 'application',
 'applications',
 'apply',
 'appreciate',
 'appreciated',
 'approach',
 'appropriate',
 'april',
 'arab',
 'archive',
 'area',
 'areas',
 'argic',
 'argument',
 'arguments',
 'armenia',
 'armenian',
 'armenians',
 'arms',
 'army',
 'article',
 'articles',
 'asked',
 'asking',
 'assume',
 'atheism',
 'atheists',
 'attack',
 'attempt',
 'austin',
 'australia',
 'author',
 'authority',
 'available',
 'average',
 'avoid',
 'away',
 'banks',
 'base',
 'baseball',
 'based',
 'basic',
 'basically',
 'basis',
 'belief',
 'believe',
 'bell',
 'berkeley

In [114]:
def getTopic(c, fName, n=10):
    for i, t in enumerate(c):
        print("토픽 %d:" %(i+1), [(fName[i],t[i].round(5)) for i in t.argsort()[:-n-1:-1]])
    
getTopic(svdModel.components_,terms)

토픽 1: [('subject', 0.265), ('lines', 0.26495), ('organization', 0.26304), ('posting', 0.24983), ('nntp', 0.24466), ('host', 0.24465), ('university', 0.21389), ('distribution', 0.15693), ('reply', 0.12507), ('like', 0.12268)]
토픽 2: [('nntp', 0.3406), ('host', 0.33811), ('posting', 0.33318), ('university', 0.15258), ('organization', 0.12408), ('lines', 0.12285), ('subject', 0.12), ('distribution', 0.11668), ('reply', 0.05804), ('cwru', 0.04312)]
토픽 3: [('windows', 0.39288), ('card', 0.18356), ('file', 0.16128), ('thanks', 0.15623), ('drive', 0.15234), ('help', 0.12667), ('files', 0.12367), ('video', 0.11941), ('version', 0.11449), ('window', 0.11145)]
토픽 4: [('university', 0.42243), ('state', 0.2169), ('pitt', 0.20051), ('virginia', 0.19311), ('organization', 0.16167), ('lines', 0.1611), ('subject', 0.16071), ('gordon', 0.15877), ('banks', 0.15715), ('computer', 0.12301)]
토픽 5: [('pitt', 0.31337), ('gordon', 0.27551), ('banks', 0.26785), ('nasa', 0.16989), ('distribution', 0.16293), ('sc