In [9]:
import numpy as np
import csv
import nltk
import itertools
from datetime import datetime
import rnn_with_numpy as rnn

In [10]:
# Download NLTK's 'punkt' data package (a no-op if it is already up to date).
# NOTE(review): presumably required by the nltk.sent_tokenize / nltk.word_tokenize
# calls further down — confirm for the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/anderson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# Marker placed in front of every sentence before tokenization.
# NOTE(review): the value is spelled '_SETENCE_START' (missing an 'N'), unlike the
# constant's name. The cells visible here only use it through the constant, so it
# is self-consistent, but the misspelling appears wherever the token is printed —
# confirm whether it is intentional before changing it.
SENTENCE_START = '_SETENCE_START'
# Marker appended after every sentence.
SENTENCE_END = '_SENTENCE_END'
# Placeholder that stands in for any word outside the vocabulary.
UNKNOWN_TOKEN = '_UNKNOWN_TOKEN'
# Total vocabulary size, including the one slot taken by UNKNOWN_TOKEN.
VOCAB_SIZE = 8000

In [12]:
# Load the Reddit comments CSV and turn it into a flat list of sentences, each
# wrapped in the SENTENCE_START / SENTENCE_END markers.
# newline='' is what the csv module asks for, so that line breaks inside quoted
# fields are parsed as part of the field instead of being altered by the
# text layer's newline translation.
with open('/dataset/reddit_comments/reddit_comments_small.csv', 'rt', newline='') as f:
    t = datetime.now()
    reader = csv.reader(f, skipinitialspace=True)

    # Split full comments into sentences. Only the first column of each row is
    # used. csv.reader yields an empty list for a blank line, so empty rows are
    # skipped to keep row[0] from raising IndexError.
    # The inner list is built eagerly so that 'Split Done' is printed only after
    # all sentence splitting has actually happened.
    sentences = itertools.chain.from_iterable(
        [nltk.sent_tokenize(row[0].lower()) for row in reader if row])
    print('Split Done')

    # Append SENTENCE_START and SENTENCE_END
    sentences = ["%s %s %s" % (SENTENCE_START, x, SENTENCE_END)
                 for x in sentences]
    print('Finished: ', (datetime.now() - t).total_seconds())

Split Done
Finished:  0.563012


In [13]:
# Break every marker-wrapped sentence into its list of word tokens.
tokenized_sentences = list(map(nltk.word_tokenize, sentences))

In [14]:
# Count how often each token occurs across all tokenized sentences.
word_freq = nltk.FreqDist(
    token for sentence in tokenized_sentences for token in sentence)
print("%d unique words tokens." % len(word_freq))

17196 unique words tokens.


### Word Indexing

아래 2개의 자료구조를 사용해서 word -> index 또는 그 반대로 변환할 수 있습니다.

* index_to_word[index] : list, index (integer) -> word
* word_to_index[word] : dict, word -> index (integer)

In [16]:
# Keep the VOCAB_SIZE-1 most frequent tokens and put UNKNOWN_TOKEN in the last
# slot, then build the reverse (word -> index) lookup from that list.
most_common_freq = word_freq.most_common(VOCAB_SIZE-1)
index_to_word = [word for word, _count in most_common_freq] + [UNKNOWN_TOKEN]
word_to_index = {word: idx for idx, word in enumerate(index_to_word)}

print("VOCAB_SIZE: %d" % VOCAB_SIZE)
print("Unknown token Index:", word_to_index[UNKNOWN_TOKEN])
# most_common_freq[-1] is the (word, count) pair of the rarest word that was kept.
print("The least frequent word in our vocabulary is '%s' and appeared %d times."
      % most_common_freq[-1])

VOCAB_SIZE: 8000
Unknown token Index: 7999
The least frequent word in our vocabulary is 'indicating' and appeared 1 times.


### Make Unknown Tokens

링크, 이상한 단어들 등등, 모든 단어들을 모두 외우고 있을 수는 없습니다.<br>
따라서 가장 많이 나온 단어들 (word_freq.most_common(VOCAB_SIZE-1)) 만 vocabulary에 남기고, 그 외 단어들은 UNKNOWN_TOKEN으로 대체합니다.

In [8]:
# Swap every out-of-vocabulary word for UNKNOWN_TOKEN. Slice assignment rewrites
# the existing list in place rather than binding a new one.
tokenized_sentences[:] = [
    [word if word in word_to_index else UNKNOWN_TOKEN for word in sentence]
    for sentence in tokenized_sentences
]

### Train Data

In [9]:
# Create the training data: x is each sentence without its last token and y is
# the same sentence without its first token, so y[t] is the token following x[t].
# Sentences differ in length, so the nested lists are ragged. dtype=object makes
# NumPy build a 1-D array whose elements are the per-sentence Python lists;
# without it, NumPy 1.24+ raises ValueError for ragged input.
x_train = np.asarray(
    [[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences],
    dtype=object)
y_train = np.asarray(
    [[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences],
    dtype=object)

print(x_train[1])
print(y_train[1])

[0, 67, 510, 68, 35, 91, 20, 7266, 32, 12, 3006, 34]
[67, 510, 68, 35, 91, 20, 7266, 32, 12, 3006, 34, 1]


### Model

In [10]:
# Build the NumPy RNN with the vocabulary size and exercise it once on a single
# training sentence.
# NOTE(review): o, s and p are not read by any later cell visible here; what
# forward_propagation and predict return is defined in rnn_with_numpy — confirm there.
model = rnn.RNNNumpy(VOCAB_SIZE)
o, s = model.forward_propagation(x_train[9])
p = model.predict(x_train[9])

In [11]:
# Print the cross-entropy cost for one training example, then call bptt and
# calculate_gradients on that same example; their return values are discarded.
# NOTE(review): presumably a smoke test that the gradient code runs — confirm
# against rnn_with_numpy.
print('error cost:', model.cross_entropy(x_train[26], y_train[26]))
model.bptt(x_train[26], y_train[26])
model.calculate_gradients(x_train[26], y_train[26])

error cost: 8.9873874381


### Train!

In [12]:
# Train the model on the whole training set.
# NOTE(review): npoch is set to the number of training sentences. If npoch means
# "number of epochs" in rnn_with_numpy, this is probably far more passes than
# intended — confirm the parameter's meaning in that module.
model.train(x_train, y_train, npoch=len(x_train))

Start Training
Total Data:  11587


In [13]:
def convert_idx_to_sentence(index):
    """Turn an iterable of vocabulary indices into a space-separated string.

    Each element of ``index`` is looked up in the module-level ``index_to_word``
    list and the resulting words are joined with single spaces.
    """
    words = (index_to_word[position] for position in index)
    return ' '.join(words)

def convert_sentence_to_idx(sentence):
    """Tokenize ``sentence`` and map every token to its vocabulary index.

    Tokens missing from ``word_to_index`` are mapped to the index of
    UNKNOWN_TOKEN instead of raising KeyError. This matches how the training
    sentences are preprocessed, where out-of-vocabulary words are replaced by
    UNKNOWN_TOKEN before indexing.

    The text is not lower-cased here, because marker tokens such as
    SENTENCE_START contain upper-case characters; callers should pass text in
    the same casing as the training data.

    Args:
        sentence: Text to split with ``nltk.word_tokenize``.

    Returns:
        A list with one integer vocabulary index per token.
    """
    unknown_index = word_to_index[UNKNOWN_TOKEN]
    return [word_to_index.get(w, unknown_index) for w in nltk.word_tokenize(sentence)]

In [24]:
# Encode a prompt that begins with the sentence-start marker, run it through the
# model, and show the raw prediction followed by the prompt and the prediction
# decoded back into words.
test1 = convert_sentence_to_idx(
    "%s%s" % (SENTENCE_START, " here in germany i know kik as"))
result1 = model.predict(test1)
print('result1:', result1)
print(convert_idx_to_sentence(test1), convert_idx_to_sentence(result1), sep='\n')

result1: [   7 7999 7999 7999 7999 7999 7999 7999]
_SETENCE_START here in germany i know kik as
i _UNKNOWN_TOKEN _UNKNOWN_TOKEN _UNKNOWN_TOKEN _UNKNOWN_TOKEN _UNKNOWN_TOKEN _UNKNOWN_TOKEN _UNKNOWN_TOKEN
