In [11]:
from nltk import bigrams, word_tokenize
from nltk.util import ngrams

t = 'I am a boy'
tk = word_tokenize(t)
bg = bigrams(tk)
ng = ngrams(tk, 3)
list(bg), list(ng)

([('I', 'am'), ('am', 'a'), ('a', 'boy')],
 [('I', 'am', 'a'), ('am', 'a', 'boy')])

SS<문장>SE

In [14]:
data2 = ngrams(tk, 2, pad_left = True, pad_right = True, left_pad_symbol = 'SS', right_pad_symbol = 'SE')
list(data2)

[('SS', 'I'), ('I', 'am'), ('am', 'a'), ('a', 'boy'), ('boy', 'SE')]

In [15]:
from nltk import ConditionalFreqDist

t = 'I am a boy'
tk = word_tokenize(t)
ng2 = ngrams(tk, 2, pad_left = True, pad_right = True, left_pad_symbol = 'SS', right_pad_symbol = 'SE')
fd = ConditionalFreqDist([(i[0], i[1]) for i in ng2])

In [16]:
fd.conditions()  # 정답 접근 정보

['SS', 'I', 'am', 'a', 'boy']

In [17]:
fd['SS']

FreqDist({'I': 1})

In [18]:
import nltk
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\김철용\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


True

In [19]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\김철용\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
from nltk.corpus import movie_reviews

data_l = []
for i in movie_reviews.sents():
    bg2 = ngrams(i, 2, pad_left = True, pad_right = True, left_pad_symbol = 'SS', right_pad_symbol = 'SE')
    data_l += [t for t in bg2]
    
cfd = ConditionalFreqDist(data_l)

In [24]:
cfd['SS'].most_common(10)

[('the', 8071),
 ('.', 3173),
 ('it', 3136),
 ('i', 2471),
 ('but', 1814),
 ('and', 1735),
 ('he', 1672),
 ('in', 1659),
 ('this', 1651),
 ('there', 1298)]

In [25]:
cfd['the'].most_common(10)

[('film', 4542),
 ('movie', 2147),
 ('story', 985),
 ('most', 945),
 ('first', 902),
 ('same', 774),
 ('only', 665),
 ('end', 664),
 ('best', 642),
 ('audience', 620)]

In [26]:
cfd['movie'].most_common(10)

[('.', 802),
 (',', 659),
 ('is', 469),
 ("'", 234),
 ('that', 210),
 ('was', 123),
 ('and', 105),
 ('with', 87),
 ('to', 72),
 ('has', 69)]

In [27]:
cfd['.'].most_common(10)

[('SE', 63404), ('"', 1854), (')', 535), ("'", 70), (']', 10), ("''", 3)]

In [29]:
from nltk.probability import ConditionalProbDist, MLEProbDist

In [31]:
cpd = ConditionalProbDist(cfd, MLEProbDist)

In [33]:
cpd['the'].prob('movie')

0.0280547243528597

In [34]:
cpd['movie'].prob('.')

0.13897071564720154

In [35]:
cpd['.'].prob('the')

0.0

In [38]:
import numpy as np

def s_sc_f(x):  # 바이그램 언어 모형의 확률 연산 함수
    p = 0.0
    for i in range(len(x) - 1):
        c = x[i]  # 직전 단어
        w = x[i + 1]  # 이후 단어
        p += np.log(cpd[c].prob(w) + np.finfo(float).eps)
    return np.exp(p)

In [39]:
test_data = ['the', 'movie', '.']  # 실현 가능 문장
s_sc_f(test_data)

0.003898785120601922

In [40]:
test_data2 = ['movie', '.', 'the']  # 실현 불가능 문장
s_sc_f(test_data2)

3.085769765203191e-17

In [48]:
import random
random.seed(10)  # 시드값 조정하면 텍스트 생성 결과가 고정됨
cpd['SS'].generate()  # 임의의 텍스트 생성

'she'

In [49]:
cpd['she'].generate()

'and'

In [50]:
cpd['and'].generate()

'fine'

In [51]:
cpd['fine'].generate()

'effect'

In [52]:
cpd['effect'].generate()

';'

In [58]:
st = 'SS'
all_str = []
while True:
    random.seed(10)
    st = cpd[st].generate()
    all_str.append(st + ' ')
    if st == 'SE':
        all_str.pop()
        break
    
''.join(all_str)

"she wasn ' s first part of these guys catch a hard 2 is . "

---

In [None]:
from nltk.util import ngrams
from nltk import ConditionalFreqDist  # 문맥별 단어 빈도수 측정 클래스
from nltk.probability import ConditionalProbDist  # 조건부 확률 추정 클래스
from nltk.probability import MLEProbDist  # 최대 우도 추정값 클래스

### 1. 데이터 수집

In [60]:
from nltk.corpus import movie_reviews

data = movie_reviews.sents()

### 2. 데이터 전처리

In [None]:
data_l = []
for i in data:
    bg = ngrams(i, 2, pad_left = True, pad_right = True, left_pad_symbol = 'SS', right_pad_symbol = 'SE')
    data_l += [t for t in bg]

### 3. 모델 학습

In [None]:
cfd = ConditionalFreqDist(data_l)
cpd = ConditionalProbDist(cfd, MLEProbDist)

### 4. 검증(생략)

### 5. 동작

In [61]:
st = 'SS'
all_str = []
while True:
    random.seed(10)
    st = cpd[st].generate()
    all_str.append(st + ' ')
    if st == 'SE':
        all_str.pop()
        break
    
''.join(all_str)

"she wasn ' s first part of these guys catch a hard 2 is . "