## CountVectorizer

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [2]:
text_data = ['나는 배가 고프다', '내일 점심 뭐먹지', '내일 공부 해야겠다', '점심 먹고 공부 해야지']

In [4]:
count_vector = CountVectorizer()
count_vector.fit(text_data)
print(count_vector.vocabulary_)

{'나는': 2, '배가': 6, '고프다': 0, '내일': 3, '점심': 7, '뭐먹지': 5, '공부': 1, '해야겠다': 8, '먹고': 4, '해야지': 9}


In [7]:
display(text_data)
print(count_vector.transform(text_data).toarray())

tdm_count = count_vector.transform(text_data).toarray()
print(tdm_count)

['나는 배가 고프다', '내일 점심 뭐먹지', '내일 공부 해야겠다', '점심 먹고 공부 해야지']

[[1 0 1 0 0 0 1 0 0 0]
 [0 0 0 1 0 1 0 1 0 0]
 [0 1 0 1 0 0 0 0 1 0]
 [0 1 0 0 1 0 0 1 0 1]]
[[1 0 1 0 0 0 1 0 0 0]
 [0 0 0 1 0 1 0 1 0 0]
 [0 1 0 1 0 0 0 0 1 0]
 [0 1 0 0 1 0 0 1 0 1]]


## TfidfVectorizer 
<p>
$$(Term Frequecy, Inerse Document Frequency :IDF = log(\frac{N}{1+dfx}) \; \; 사이킷런에서)$$

$$W_x,_y = tf_x,_y \; \times{} \; \; \; log(\frac{N}{df_x})$$<p>
$$tf_x,_y : frequency \; \; of \; \; x \; \;of \; \;y$$
$$df_x : number \; \; of \; \; documents  \; \;contaning \; \; x$$
$$N : total  \; \; number \; \; of \; \; documents$$

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(text_data)
print(count_vector.vocabulary_)


{'나는': 2, '배가': 6, '고프다': 0, '내일': 3, '점심': 7, '뭐먹지': 5, '공부': 1, '해야겠다': 8, '먹고': 4, '해야지': 9}


In [20]:
tfidf_vector = tfidf_vectorizer.transform(text_data)
tfidf_vector.toarray()

array([[0.57735027, 0.        , 0.57735027, 0.        , 0.        ,
        0.        , 0.57735027, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.52640543, 0.        ,
        0.66767854, 0.        , 0.52640543, 0.        , 0.        ],
       [0.        , 0.52640543, 0.        , 0.52640543, 0.        ,
        0.        , 0.        , 0.        , 0.66767854, 0.        ],
       [0.        , 0.43779123, 0.        , 0.        , 0.55528266,
        0.        , 0.        , 0.43779123, 0.        , 0.55528266]])

## 자연어 토크나이징 도구
### 1. 영어토크나이징 라이브러리

In [21]:
!pip install nltk



In [22]:
import nltk

In [23]:
nltk.download('all-corpora') # 말뭉치 다운
nltk.download('punkt')

[nltk_data] Downloading collection 'all-corpora'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package bcp47 to
[nltk_data]    |     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     C:\Users\

[nltk_data]    |   Package product_reviews_2 is already up-to-date!
[nltk_data]    | Downloading package propbank to
[nltk_data]    |     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]    |   Package propbank is already up-to-date!
[nltk_data]    | Downloading package pros_cons to
[nltk_data]    |     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]    |   Package pros_cons is already up-to-date!
[nltk_data]    | Downloading package ptb to
[nltk_data]    |     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]    |   Package ptb is already up-to-date!
[nltk_data]    | Downloading package qc to
[nltk_data]    |     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]    |   Package qc is already up-to-date!
[nltk_data]    | Downloading package reuters to
[nltk_data]    |     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]    |   Package reuters is already up-to-date!
[nltk_data]    | Downloading package rte to
[nltk_data]    |     C:\Users\frank\AppData\R

True

In [24]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [25]:
sentence = "Natural language processing (NLP) is a subfield of computer science, \
information engineering, and artificial intelligence concerned \
with the interactions between computers and human (natural) languages, \
in particular how to program computers to process and analyze \
large amounts of natural language data."

In [28]:
word_token = word_tokenize(sentence)
display(word_token)

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'a',
 'subfield',
 'of',
 'computer',
 'science',
 ',',
 'information',
 'engineering',
 ',',
 'and',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 '(',
 'natural',
 ')',
 'languages',
 ',',
 'in',
 'particular',
 'how',
 'to',
 'program',
 'computers',
 'to',
 'process',
 'and',
 'analyze',
 'large',
 'amounts',
 'of',
 'natural',
 'language',
 'data',
 '.']

In [27]:
sentence.split()

['Natural',
 'language',
 'processing',
 '(NLP)',
 'is',
 'a',
 'subfield',
 'of',
 'computer',
 'science,',
 'information',
 'engineering,',
 'and',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 '(natural)',
 'languages,',
 'in',
 'particular',
 'how',
 'to',
 'program',
 'computers',
 'to',
 'process',
 'and',
 'analyze',
 'large',
 'amounts',
 'of',
 'natural',
 'language',
 'data.']

In [29]:
sentence_token = sent_tokenize(sentence)
display(sentence_token)

['Natural language processing (NLP) is a subfield of computer science, information engineering, and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.']

### 2. 한글토크나이징 라이브러리

In [1]:
! java -version # java 확인

java version "1.8.0_371"
Java(TM) SE Runtime Environment (build 1.8.0_371-b11)
Java HotSpot(TM) Client VM (build 25.371-b11, mixed mode, sharing)


In [10]:
import sys
sys.version

'3.8.16 (default, Mar  2 2023, 03:18:16) [MSC v.1916 64 bit (AMD64)]'

In [2]:
! pip install ./JPype1-1.4.0-cp38-cp38-win_amd64.whl



In [9]:
! pip install konlpy





https://www.lfd.uci.edu/~gohlke/pythonlibs/#jpype

In [37]:
import konlpy
from konlpy.tag import Okt

In [38]:
okt = Okt()

In [42]:
text = "한글 자연어 처리는 재미있다 이제부터 열심히 해야지 ㅎㅎㅎ"
ko_nouns = okt.nouns(text) # 명사
ko_morph = okt.morphs(text) # 형태소
ko_morph_stem = okt.morphs(text, stem=True) # 어간
ko_phrase = okt.phrases(text) # 의미 단위
ko_pos = okt.pos(text) # part of speech

print(ko_nouns)
print(ko_morph)
print(ko_morph_stem)
print(ko_phrase)
print(ko_pos)
# print(type(ko_nouns))

['한글', '자연어', '처리', '이제']
['한글', '자연어', '처리', '는', '재미있다', '이제', '부터', '열심히', '해야지', 'ㅎㅎㅎ']
['한글', '자연어', '처리', '는', '재미있다', '이제', '부터', '열심히', '하다', 'ㅎㅎㅎ']
['한글', '한글 자연어', '한글 자연어 처리', '이제', '자연어', '처리']
[('한글', 'Noun'), ('자연어', 'Noun'), ('처리', 'Noun'), ('는', 'Josa'), ('재미있다', 'Adjective'), ('이제', 'Noun'), ('부터', 'Josa'), ('열심히', 'Adverb'), ('해야지', 'Verb'), ('ㅎㅎㅎ', 'KoreanParticle')]


In [16]:
from konlpy.corpus import kolaw, kobill

In [19]:
# kolaw.open('constitution.txt').read()
kobill.open('1809890.txt').read() # 1809890.txt ~ 1809899.txt

'지방공무원법 일부개정법률안\n\n(정의화의원 대표발의 )\n\n 의 안\n 번 호\n\n9890\n\n발의연월일 : 2010.  11.  12.  \n\n발  의  자 : 정의화․이명수․김을동 \n\n이사철․여상규․안규백\n\n황영철․박영아․김정훈\n\n김학송 의원(10인)\n\n제안이유 및 주요내용\n\n  초등학교 저학년의 경우에도 부모의 따뜻한 사랑과 보살핌이 필요\n\n한 나이이나, 현재 공무원이 자녀를 양육하기 위하여 육아휴직을 할 \n\n수 있는 자녀의 나이는 만 6세 이하로 되어 있어 초등학교 저학년인 \n\n자녀를 돌보기 위해서는 해당 부모님은 일자리를 그만 두어야 하고 \n\n이는 곧 출산의욕을 저하시키는 문제로 이어질 수 있을 것임.\n\n  따라서 육아휴직이 가능한 자녀의 연령을 만 8세 이하로 개정하려\n\n는 것임(안 제63조제2항제4호).\n\n- 1 -\n\n\x0c법률  제        호\n\n지방공무원법 일부개정법률안\n\n지방공무원법 일부를 다음과 같이 개정한다.\n\n제63조제2항제4호 중 “만 6세 이하의 초등학교 취학 전 자녀를”을 “만 \n\n8세 이하(취학 중인 경우에는 초등학교 2학년 이하를 말한다)의 자녀를”\n\n로 한다.\n\n부      칙\n\n이 법은 공포한 날부터 시행한다.\n\n- 3 -\n\n\x0c신 ·구조문대비표\n\n현      행\n\n개   정   안\n\n제63조(휴직) ① (생  략)\n\n제63조(휴직) ① (현행과 같음)\n\n  ② 공무원이 다음 각 호의 어\n\n  ② -------------------------\n\n느 하나에 해당하는 사유로 휴\n\n----------------------------\n\n직을 원하면 임용권자는 휴직\n\n----------------------------\n\n을 명할 수 있다. 다만, 제4호\n\n-------------.---------------\n\n의 경우에는 대통령령으로 정\n\n---------------------------

## Re 문자열 정규식

In [43]:
a = ' Natural language '
# len(a)
a.count('la')
print(a.find('a'))

## join
print(','.join('kkkk'))

## uper/lower
print(a.lower())
print(a.upper())

print(a.replace('a', 'b'))

print(a.strip())
print(a.rstrip())
print(a.lstrip())

b = ['hello', 'nlp']
','.join(b)
print(a.split())

2
k,k,k,k
 natural language 
 NATURAL LANGUAGE 
 Nbturbl lbngubge 
Natural language
 Natural language
Natural language 
['Natural', 'language']


In [44]:
print('I eat %d apples' % 3)
print('I eat %s apples' % 5)
print('I eat %d apples on %s' % (5, 'friday'))

I eat 3 apples
I eat 5 apples
I eat 5 apples on friday


In [45]:
import re

In [48]:
## 메타문자 "." \n제외 모든문자 매칭

print(re.search('ab', 'aababc'))
print(re.search('a.b', 'aababc'))
index = re.search('a.b', 'aababc')
print(index.start())
print(index.end())

<re.Match object; span=(1, 3), match='ab'>
<re.Match object; span=(0, 3), match='aab'>
0
3


In [50]:
## 문자 클래스 [.]
print(re.search('a[.]b', 'aababc'))
print(re.search('a[.]b', 'a.babc'))

None
<re.Match object; span=(0, 3), match='a.b'>


In [53]:
## 메타문자 반복 *, 0 or 있거나
print(re.search('a*b', 'aababc'))
print(re.search('a*b', 'ababc'))
print(re.search('a*b', 'aaaaaababc'))
print(re.search('a*b', 'babc'))

<re.Match object; span=(0, 3), match='aab'>
<re.Match object; span=(0, 2), match='ab'>
<re.Match object; span=(0, 7), match='aaaaaab'>
<re.Match object; span=(0, 1), match='b'>


In [57]:
## 메타문자 반복 +, 1이상 반복
print(re.search('a+b', 'aabadc'))
print(re.search('a+b', 'abadc'))
print(re.search('a+b', 'aaaaaabadc'))
print(re.search('a+b', 'badc'))

<re.Match object; span=(0, 3), match='aab'>
<re.Match object; span=(0, 2), match='ab'>
<re.Match object; span=(0, 7), match='aaaaaab'>
None


In [61]:
## 반복 {m,n} --> m,n 띄어쓰기하면 안됨
print(re.search('a{2}b', 'aabadc'))
print(re.search('a{1}b', 'aabadc'))
print(re.search('a{1,3}b', 'aabadc'))
print(re.search('a{1,4}b', 'aaaabadc'))

<re.Match object; span=(0, 3), match='aab'>
<re.Match object; span=(1, 3), match='ab'>
<re.Match object; span=(0, 3), match='aab'>
<re.Match object; span=(0, 5), match='aaaab'>


In [62]:
## 반복메타문자 ? 문자 {0,1}이상
print(re.search('a?b', 'aabadc'))
print(re.search('a?b', 'abadc'))
print(re.search('a?b', 'badc'))

<re.Match object; span=(1, 3), match='ab'>
<re.Match object; span=(0, 2), match='ab'>
<re.Match object; span=(0, 1), match='b'>


In [63]:
p = re.compile('ab*')
print(type(p))

<class 're.Pattern'>


In [70]:
## match 문자 첫열만, search는 전체
p = re.compile('[a-z]+')
print(p.match('python'))
print(p.match('python').group())
print(p.match('python').start())
print(p.match('python').end())
print(p.match('pyThon'))

print(p.match('3pyThon'))
print(p.search('3pyThon'))

<re.Match object; span=(0, 6), match='python'>
python
0
6
<re.Match object; span=(0, 2), match='py'>
None
<re.Match object; span=(1, 3), match='py'>


In [71]:
p = re.compile('sk?t')
print(p.match('string goes here'))

<re.Match object; span=(0, 2), match='st'>


In [72]:
p = re.compile('[a-z]+')
print(p.findall('life is too short'))

['life', 'is', 'too', 'short']


In [73]:
result = p.finditer('life is too short')
print(result)

for i in result:
    print(i)

<callable_iterator object at 0x0000026B5768DA00>
<re.Match object; span=(0, 4), match='life'>
<re.Match object; span=(5, 7), match='is'>
<re.Match object; span=(8, 11), match='too'>
<re.Match object; span=(12, 17), match='short'>


In [76]:
p = re.compile('[a-z]+', re.I)
print(p.match('python'))
print(p.match('Python'))
print(p.match('PYTHON'))

<re.Match object; span=(0, 6), match='python'>
<re.Match object; span=(0, 6), match='Python'>
<re.Match object; span=(0, 6), match='PYTHON'>


In [81]:
p = re.compile('^python\s\w+')
# p = re.compile('python')
# p = re.compile('^python')

data = 'python is my life python'
print(p.findall(data))

['python is']


In [84]:
p = re.compile('abc|de')

print(p.search('bbabcde'))
print(p.search('bbaecde'))

<re.Match object; span=(2, 5), match='abc'>
<re.Match object; span=(5, 7), match='de'>


In [85]:
p = re.compile('life$')

print(p.search('life is too short'))
print(p.search('too short is my life'))

None
<re.Match object; span=(16, 20), match='life'>


In [86]:
p = re.compile('(ABC)+')

print(p.search('ABCABCABC OK?'))

<re.Match object; span=(0, 9), match='ABCABCABC'>


In [96]:
text = "park 010-1234-1234"
p = re.compile('^[a-zA-Z]+')
print(p.search(text))
# r = re.compile('[0-9]{2,3}-[0-9]{3,4}-[0-9]{4}')
r = re.compile('\d{2,3}-?\d{3,4}-?\d+')
print(r.search(text))

<re.Match object; span=(0, 4), match='park'>
<re.Match object; span=(5, 18), match='010-1234-1234'>


In [3]:
import keras_vggface
# print version
print(keras_vggface.__version__)

0.6


## 어제 Typing 연습

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

text_data = ['나는 배가 고프다', '내일 점심 뭐먹지', '내일 공부 해야겠다', '점심 먹고 공부 해야지']

In [9]:
tfidf_vertorizer = TfidfVectorizer()
tfidf_vertorizer.fit(text_data)
tfidf_vector = tfidf_vertorizer(text_data)
print(tfidf_vector.toarray())

TypeError: 'TfidfVectorizer' object is not callable

In [10]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
sentence = "...."
word_token = word_tokenize(sentence)


In [11]:
from konlpy.tag import Okt

In [12]:
okt = Okt()

In [13]:
text = "한글 자연어 처리는 재미있다 이제부터 열심히 해야지 ㅎㅎㅎ"

ko_nouns = okt.nouns(text)
ko_morphs = okt.morphs(text)
ko_morphe_stem = okt.morphs(text, stem=True)
kp_phrases = okt.phrases(text)
ko_pos = okt.pos(text)