In [1]:
import os
import sys
sys.path.insert(0, '..')


In [2]:
from preprocessing.save_cove_weights import save_cove_weights
from preprocessing.create_train_data import DataParser
from preprocessing.download_data import download_data
from preprocessing.download_fasttext_data import download_fasttext_data
from preprocessing.embedding_util import split_vocab_and_embedding
from preprocessing.s3_util import maybe_upload_data_files_to_s3
from flags import get_options_from_flags

In [3]:
import numpy as np
import operator
import os
import preprocessing.constants as constants
import preprocessing.chars as chars

In [4]:
import io

In [5]:
def _get_line_count_from_fasttext_vec(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    return n

In [None]:
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = map(float, tokens[1:])
    return data

In [6]:
input_file = os.path.join("../downloads", constants.FASTTEXT_VECTOR_FILE)
embedding_output_file = os.path.join("../data", constants.FASTTEXT_EMBEDDING_FILE)
vocab_output_file = os.path.join("../data", constants.VOCAB_FILE)
vocab_chars_output_file = os.path.join("../data", constants.VOCAB_CHARS_FILE)
if all([os.path.exists(f) for f in 
    [embedding_output_file, vocab_output_file, vocab_chars_output_file]]):
    print("Word embedding and vocab files already exist")
    #return
print("Creating NumPy word embedding file and vocab files")
num_lines = _get_line_count_from_fasttext_vec(input_file)
print("Vocab size: %d" % num_lines)


Creating NumPy word embedding file and vocab files
Vocab size: 879129


In [7]:
# Include 4 entries for bos/eos/unk/pad (they will all be left as 0 vectors).
embedding = np.zeros((num_lines + 4, constants.WORD_VEC_DIM_FASTTEXT), dtype=np.float32)
vocab_o_file = open(vocab_output_file, "w", encoding="utf-8")


In [8]:
embedding

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [9]:
embedding.shape

(879133, 300)

In [10]:
# Get IDs for the total vocab, not just the words. This includes
# the bos/eos/unk/pad.
vocab_chars = np.zeros((num_lines + 4, constants.MAX_WORD_LEN), dtype=np.uint8)


In [11]:
vocab_chars

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [12]:
vocab_chars.shape

(879133, 25)

In [13]:
input_file

'../downloads/fasttext.300d.txt'

In [14]:
def pad_or_truncate(some_list, target_len):
    return some_list[:target_len] + [0.0]*(target_len - len(some_list))

In [16]:
i_file = io.open(input_file, 'r', encoding='utf-8', newline='\n', errors='ignore')
i = 0
char_counts = {}
vocab_list = []
for line in i_file:
    if i in [0, 879129, 879130, 879131, 879132]:
        i += 1
        continue
    tokens = line.rstrip().split(' ')
    word = tokens[0]
    vocab_list.append(word)
    for c in word:
        if c in char_counts:
            char_counts[c] += 1
        else:
            char_counts[c] = 1
    vocab_o_file.write(word + "\n")
    embedding[i] = np.fromiter(pad_or_truncate(tokens[1:], constants.WORD_VEC_DIM_FASTTEXT), dtype=np.float32)
    i += 1
    if i % 10000 == 0 or i == num_lines:
        print("Processed %d of %d (%f percent done)" % (i, num_lines, 100 * float(i) / float(num_lines)), end="\r")

Processed 879129 of 879129 (100.000000 percent done)

In [17]:
sorted_chars = sorted(char_counts.items(), key=operator.itemgetter(1),
    reverse=True)


In [18]:
sorted_chars

[('이', 89622),
 ('의', 64090),
 ('에', 63507),
 ('로', 50802),
 ('a', 50279),
 ('다', 50155),
 ('는', 48631),
 ('e', 48429),
 ('하', 48167),
 ('i', 39953),
 ('가', 39840),
 ('지', 38401),
 ('스', 38374),
 ('서', 38080),
 ('리', 36296),
 ('고', 35356),
 ('r', 34803),
 ('기', 34367),
 ('사', 34347),
 ('o', 34048),
 ('n', 34046),
 ('s', 33567),
 ('도', 31274),
 ('을', 30644),
 ('t', 29864),
 ('시', 29095),
 ('자', 27591),
 ('인', 26334),
 ('라', 25436),
 ('l', 24964),
 ('아', 24237),
 ('은', 24012),
 ('대', 23935),
 ('으', 23578),
 ('나', 22768),
 ('과', 21491),
 ('정', 21381),
 ('를', 21189),
 ('c', 21161),
 ('부', 20900),
 ('수', 20271),
 ('어', 18591),
 ('학', 18097),
 ('주', 18022),
 ('u', 17721),
 ('성', 17627),
 ('전', 17597),
 ('m', 17593),
 ('동', 17396),
 ('와', 17208),
 ('한', 17006),
 ('교', 16881),
 ('원', 16760),
 ('들', 16720),
 ('d', 16445),
 ('장', 16399),
 ('상', 15817),
 ('마', 15592),
 ('트', 15334),
 ('제', 15273),
 ('구', 14727),
 ('보', 14390),
 ('해', 14293),
 ('h', 14280),
 ('되', 14155),
 ('p', 14073),
 ('오', 135

In [19]:
len(sorted_chars)

9454

In [20]:
frequent_chars = dict((x[0], i) for i, x in enumerate(
    sorted_chars[:chars.MAX_CHARS]))

In [21]:
frequent_chars

{'a': 4,
 'b': 89,
 'c': 38,
 'd': 54,
 'e': 7,
 'f': 135,
 'g': 68,
 'h': 63,
 'i': 9,
 'k': 115,
 'l': 29,
 'm': 47,
 'n': 20,
 'o': 19,
 'p': 65,
 'r': 16,
 's': 21,
 't': 24,
 'u': 44,
 'v': 151,
 'w': 176,
 'y': 124,
 '가': 10,
 '각': 234,
 '간': 159,
 '감': 216,
 '강': 154,
 '개': 138,
 '거': 128,
 '건': 207,
 '게': 79,
 '격': 240,
 '결': 226,
 '경': 87,
 '계': 111,
 '고': 15,
 '공': 76,
 '과': 35,
 '관': 91,
 '광': 206,
 '교': 51,
 '구': 60,
 '국': 67,
 '군': 107,
 '권': 181,
 '그': 145,
 '금': 198,
 '기': 17,
 '김': 142,
 '까': 237,
 '나': 34,
 '남': 155,
 '내': 162,
 '네': 178,
 '노': 113,
 '는': 6,
 '니': 69,
 '다': 5,
 '단': 123,
 '당': 143,
 '대': 32,
 '던': 171,
 '데': 131,
 '도': 22,
 '독': 252,
 '동': 48,
 '되': 64,
 '된': 225,
 '두': 232,
 '드': 80,
 '들': 53,
 '등': 148,
 '디': 168,
 '라': 28,
 '란': 249,
 '래': 239,
 '러': 184,
 '레': 97,
 '려': 169,
 '력': 213,
 '로': 3,
 '론': 241,
 '루': 146,
 '류': 227,
 '르': 70,
 '를': 37,
 '리': 14,
 '립': 244,
 '마': 57,
 '만': 83,
 '매': 229,
 '메': 195,
 '며': 108,
 '면': 85,
 '명': 129,
 '모': 11

In [22]:
chars.MAX_CHARS

256

In [23]:
len(frequent_chars)

256

In [24]:
chars.CHAR_BOW_ID

258

In [25]:
chars.CHAR_PAD_ID

260

In [26]:
chars.CHAR_UNK_ID

261

In [27]:
chars.CHAR_EOW_ID

259

In [28]:
constants.MAX_WORD_LEN

25

In [29]:
vocab_list

['</s>',
 '.',
 ',',
 ')',
 '(',
 '년',
 "'",
 '-',
 '분류',
 '월',
 '일',
 '#',
 '}',
 '있다',
 '/',
 '~',
 '이',
 '《',
 '》',
 '는',
 '수',
 '제',
 '의',
 '넘겨주기',
 '은',
 '·',
 '있는',
 '그',
 '역',
 'kst',
 '대한민국의',
 '\\',
 '에',
 '토론',
 '선수',
 '바깥',
 '고리',
 '%',
 '한',
 '및',
 '를',
 '?',
 '축구',
 '한다',
 'the',
 '대한',
 '영화',
 'a',
 '을',
 '주',
 '가',
 '명',
 '년에',
 '다른',
 '같은',
 '로',
 '되었다',
 'm',
 '등',
 '회',
 'of',
 '이후',
 '중',
 '그는',
 '미국의',
 '함께',
 '때',
 '또한',
 '에서',
 '현재',
 '때문에',
 '같이',
 '대',
 '후',
 '!',
 '사람',
 '위해',
 '"',
 '것을',
 '더',
 '배우',
 '시',
 '일본',
 '대한민국',
 '태어남',
 '→',
 '또는',
 '그리고',
 '두',
 '하는',
 'kbs',
 '와',
 '현',
 '미국',
 '그러나',
 's',
 '된다',
 '가장',
 '동문',
 '그의',
 '세',
 '있으며',
 'b',
 '따라',
 '것이',
 '‘',
 '것으로',
 '〈',
 '〉',
 '의해',
 '보기',
 '이다',
 '파일',
 '으로',
 '일본의',
 '많은',
 '전',
 '개',
 '것은',
 '올림픽',
 '다시',
 '했다',
 '+',
 '출신',
 '있었다',
 '할',
 'f',
 'tv',
 '과',
 '첫',
 '당시',
 'd',
 '–',
 '모든',
 'jpg',
 '경우',
 'mbc',
 'x',
 'c',
 '것이다',
 '살아있는',
 'and',
 '리그',
 '통해',
 '개의',
 'km',
 '일에',
 '것',
 '차'

In [30]:
len(vocab_list)

879128

In [31]:
print("Creating word character data")
for z in range(len(vocab_list)):
    word = vocab_list[z]
    vocab_chars[z, 0] = chars.CHAR_BOW_ID
    for zz in range(constants.MAX_WORD_LEN - 1):
        insert_index = zz + 1
        if zz >= len(word):
            vocab_chars[z, insert_index] = chars.CHAR_PAD_ID
        elif word[zz] not in frequent_chars:
            vocab_chars[z, insert_index] = chars.CHAR_UNK_ID
        else:
            vocab_chars[z, insert_index] = frequent_chars[word[zz]]
    vocab_chars[z, min(1 + len(word), constants.MAX_WORD_LEN - 1)] = \
        chars.CHAR_EOW_ID

Creating word character data


In [32]:
# The order of the following must match that of vocab.py
vocab_chars[num_lines, :] = chars.CHAR_BOS_ID
vocab_chars[num_lines + 1, :] = chars.CHAR_EOS_ID
vocab_chars[num_lines + 2, :] = chars.CHAR_PAD_ID
vocab_chars[num_lines + 3, :] = chars.CHAR_UNK_ID

In [33]:
vocab_chars

array([[2, 5, 5, ..., 4, 4, 4],
       [2, 5, 3, ..., 4, 4, 4],
       [2, 5, 3, ..., 4, 4, 4],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [4, 4, 4, ..., 4, 4, 4],
       [5, 5, 5, ..., 5, 5, 5]], dtype=uint8)

In [34]:
vocab_chars.shape

(879133, 25)

In [35]:
vocab_chars.size

21978325

In [36]:
np.save(vocab_chars_output_file, vocab_chars)
np.save(embedding_output_file, embedding)
vocab_o_file.close()
i_file.close()
print("")
print("Finished creating vocabulary and embedding file")


Finished creating vocabulary and embedding file


In [37]:
vocab_chars_output_file

'../data/vocab.chars.npy'

In [38]:
embedding_output_file

'../data/fasttext.embedding.npy'