# Poom's C Char-RNN Language Detector
---

In [2]:
import re
import os
import random
import numpy as np
import numpy.random as rd
import tensorflow as tf

In [3]:
def onehot(index, length):
    """
    Build a one-hot vector.

    takes
        index  : integer, the position that is set to one
        length : integer, the size of the vector
    returns
        a numpy array of length 'length' that is zero everywhere
        except for a 1.0 at position 'index'
    """
    basis = np.zeros(length, dtype=np.float64)
    basis[index] = 1.0
    return basis

def reader(directory):
    """
    Reads every text file below a directory and strips markup tags.

    Takes 
        - directory : string
            the directory path whose sub-folders (one per language)
            hold the text files
    Returns 
        - dict[string -> list[string]]
            a dictionary whose keys are the sub-folder paths as yielded
            by os.walk (e.g. "./text/bg") and whose values are the
            contents of the files in that folder, with every
            '<...>' tag removed
    """
    # compiled once for the whole run instead of once per folder
    tag_pattern = re.compile('<[^>]*>')

    def parse(base_name, files):
        print("parsing files in", base_name)
        strings = []
        for name in files:
            # errors='replace': undecodable bytes become replacement
            # characters instead of aborting the whole run
            with open(os.path.join(base_name, name), errors='replace') as f:
                strings.append(tag_pattern.sub('', f.read()))
        return strings

    def main(directory):
        scanner = os.walk(directory)
        # discard the first entry: it describes 'directory' itself, whose
        # own files do not belong to any language folder
        next(scanner)
        return {name: parse(name, files)
                for name, _, files in scanner}
    
    return main(directory)

Then we parse the files. This will take some time. Go get a coffee or something.

In [4]:
# Load the corpus, then derive the per-language joined text and character sets.
strings = reader("./text")
# all files of one language are glued together with a backtick separator
joined = {lang: "`".join(texts) for lang, texts in strings.items()}
charset = {lang: set(text) for lang, text in joined.items()}
# the sorted keys fix the order of the languages
languages = sorted(charset)

parsing files in ./text/bg
parsing files in ./text/cs
parsing files in ./text/da
parsing files in ./text/de
parsing files in ./text/el
parsing files in ./text/en
parsing files in ./text/es
parsing files in ./text/et
parsing files in ./text/fi
parsing files in ./text/fr
parsing files in ./text/hu
parsing files in ./text/it
parsing files in ./text/lt
parsing files in ./text/lv
parsing files in ./text/nl
parsing files in ./text/pl
parsing files in ./text/pt
parsing files in ./text/ro
parsing files in ./text/sk
parsing files in ./text/sl
parsing files in ./text/sv


In [5]:
def stream(languages, joined, window_size, char_to_ind):
    """
    Builds an endless generator of random training batches.

    Takes
        - languages : list[string]
            language keys; the position in this list is the class label
        - joined : dict[string -> string]
            the full text of each language
        - window_size : int
            number of characters per sample
        - char_to_ind : dict[char -> int]
            maps every character to its integer id
    Returns
        - iterate : function(batch_size) -> generator
            every item is a pair (labels, inputs) of numpy arrays,
            both of shape (batch_size, window_size)
    Raises
        - ValueError (when a batch is drawn) if the text of the sampled
          language is shorter than window_size
    """
    def next_point():
        index = random.randrange(len(languages))
        string = joined[languages[index]]
        last_start = len(string) - window_size
        if last_start < 0:
            raise ValueError(
                "text for %r has %d characters, fewer than window_size=%d"
                % (languages[index], len(string), window_size))
        # randrange excludes its upper bound, so +1 keeps the final window
        # reachable and lets a text of exactly window_size characters work
        start = random.randrange(last_start + 1)
        substring = string[start:start+window_size]
        # the same language label is repeated for every position
        labels = [index] * window_size
        inputs = np.array([char_to_ind[c] for c in substring])
        return labels, inputs

    def iterate(batch_size):
        while True:
            data = [next_point() for _ in range(batch_size)]
            labels, inputs = zip(*data)
            yield np.array(labels), np.array(inputs)

    return iterate

In [10]:
# Corpus size: total characters over all languages and total number of files.
num_chars = sum(map(len, joined.values()))
num_files = sum(len(files) for files in strings.values())
print(num_chars, "characters in", num_files, "files")

4499268584 characters in 187072 files


In [11]:
# Vocabulary: every character seen in any language, in sorted order.
unique_char = sorted(set().union(*charset.values()))
num_unique_chars = len(unique_char)
# integer id of each character, and the matching one-hot vector
char_to_ind = dict(zip(unique_char, range(num_unique_chars)))
char_to_vec = {c: onehot(char_to_ind[c], num_unique_chars)
               for c in unique_char}
print(num_unique_chars, "unique characters")

496 unique characters


We use a standard Char-RNN classifier for this problem. That is, we embed characters into $\mathbb{R}^n$, feed the sequence of embedded points into an RNN and have each output be measured against the one-hot language vector using KL divergence.

Since the data size is quite big relative to the batch size and `rnn_size` is small, it's not necessary to use dropout. But since random English text appears in the Russian files, adding dropout will help with noisy labels. We also implemented the method from https://arxiv.org/pdf/1705.03419.pdf to deal with this problem of noisy labels.

In [201]:
# Model / training hyperparameters.
rnn_size = 40        # LSTM units per layer, also the embedding width
num_layers = 2       # number of stacked LSTM layers
batch_size = 13      # windows per step
window_size = 100    # characters per window
# one output class per language: derived from the data so that it cannot
# drift away from the label range produced by the batch generator
output_size = len(languages)

# clear the default graph before the cells below build the model
tf.reset_default_graph()

In [202]:
# Graph inputs, supplied through feed_dict on every sess.run call.
# character ids, one row per window
char_ids = tf.placeholder(dtype=tf.int64, shape=[batch_size, window_size])
# class index for every position of every window
labels   = tf.placeholder(dtype=tf.int64, shape=[batch_size, window_size])
# dropout keep probability
pkeep    = tf.placeholder(dtype=tf.float64)

In [203]:
# Per-position weights: the square root of an even ramp from 0 to 1 over the
# window, so position 0 gets weight 0 and the last position gets weight 1.
weights = np.linspace(0, 1, window_size) ** 0.5

In [204]:
# character embedding table: one row of width rnn_size per character
# xavier initialization
embedding = rd.randn(num_unique_chars, rnn_size)
embedding = tf.Variable(embedding / np.sqrt(rnn_size + rnn_size))
inp = tf.nn.embedding_lookup(embedding, char_ids)

# square matrix over the output classes, initialised to the identity
noise_prob = tf.Variable(np.eye(output_size))
# output projection
# xavier initialization
decoder = rd.randn(rnn_size, output_size)
decoder = tf.Variable(decoder / np.sqrt(rnn_size + output_size))
# a Variable, so that the optimizer can actually train the bias; a bare
# tf.zeros tensor is a constant and would stay at zero forever
bias    = tf.Variable(np.zeros(output_size))


In [205]:
def lstm_cell(size, use_dropout=False):
    """
    Builds one LSTM layer.

    takes
        size        : integer, number of units of the cell
        use_dropout : bool, wrap the cell in a DropoutWrapper driven by
                      the 'pkeep' placeholder
    returns
        an RNN cell

    Dropout is off by default, so 'pkeep' has no effect on the graph
    unless use_dropout=True is passed.
    """
    cell = tf.nn.rnn_cell.LSTMCell(size)
    if use_dropout:
        # the second positional argument of DropoutWrapper is the
        # keep probability applied to the cell inputs
        cell = tf.nn.rnn_cell.DropoutWrapper(cell, pkeep)
    return cell

cells = [lstm_cell(rnn_size) for _ in range(num_layers)]
lstm = tf.nn.rnn_cell.MultiRNNCell(cells)

In [206]:
# Unroll the stacked LSTM by hand over the window, one character per step,
# and collect one output tensor per position.
state = lstm.zero_state(batch_size, tf.float64)
outputs = []
for i in range(window_size):
    # feed the embedded character at position i, carrying the state forward
    output, state = lstm(inp[:,i,:], state)
    unscaled_logit = tf.matmul(output, decoder) + bias
    # NOTE(review): after this line the tensor holds softmax probabilities,
    # despite the name 'unscaled_logit'
    unscaled_logit = tf.nn.softmax(unscaled_logit)
    # see https://arxiv.org/pdf/1705.03419.pdf
    # the class probabilities are multiplied by the 'noise_prob' matrix
    denoiser = tf.matmul(unscaled_logit, noise_prob)
    outputs.append(denoiser)
    
# stack the per-position tensors along axis 1 (the window position)
# NOTE(review): 'outputs' therefore holds probability-like values rather
# than logits - confirm that whatever loss consumes it expects that
outputs = tf.stack(outputs, axis=1)

In [207]:
# Score a single position of the window: the last one (index -1).
len_index = -1
# no trailing comma after the call: it would wrap the tensor in a 1-tuple
prediction = tf.argmax(outputs[:, len_index], axis=1)
# number of correct predictions in the batch (a count, not a ratio)
accuracy = tf.reduce_sum(tf.cast(tf.equal(prediction, labels[:, len_index]), tf.int64))

In [208]:
# short alias for the cross-entropy op
# NOTE(review): per the TensorFlow docs this op expects unnormalized logits
# and applies softmax itself - confirm that 'outputs' is not already a
# probability distribution
longname = tf.nn.sparse_softmax_cross_entropy_with_logits
loss_matrix = longname(logits=outputs, labels=labels)
# element-wise weighting by 'weights', then the mean over all entries
# (presumably 'weights' has one value per window position - verify)
total_loss = tf.reduce_mean(weights * loss_matrix)

# Adam with its default hyperparameters
optimizer = tf.train.AdamOptimizer()
minimizer = optimizer.minimize(total_loss)

In [None]:
# Open a session and initialise every tf.Variable defined so far.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [None]:
# Train for 50000 steps while tracking an exponential moving average of the
# per-batch accuracy (seeded at 0.35, decay 0.99).
cumulative = 0.35
datastream = stream(languages, joined, window_size, char_to_ind)(batch_size)
for i in range(50000):
    y, x = next(datastream)
    # one optimisation step; 'acc' is the number of correct predictions
    acc, _ = sess.run(
        [accuracy, minimizer],
        feed_dict={char_ids: x, labels: y, pkeep: 0.5},
    )
    cumulative = 0.99 * cumulative + 0.01 * acc / batch_size
    if not i % 200:
        print("iteration", i, "with decaying accuracy", cumulative)

iteration 0 with accuracy 0.3465
iteration 200 with accuracy 0.193959385325
iteration 400 with accuracy 0.219794536498
iteration 600 with accuracy 0.241795761679
iteration 800 with accuracy 0.260827810797
iteration 1000 with accuracy 0.260836844854
iteration 1200 with accuracy 0.249975477763
iteration 1400 with accuracy 0.282224080663
iteration 1600 with accuracy 0.290690208435
iteration 1800 with accuracy 0.309968273729
iteration 2000 with accuracy 0.328351172071
iteration 2200 with accuracy 0.328672528058
iteration 2400 with accuracy 0.334856344627
iteration 2600 with accuracy 0.334587778115
iteration 2800 with accuracy 0.369054941944
iteration 3000 with accuracy 0.377971751971
iteration 3200 with accuracy 0.392350487538
iteration 3400 with accuracy 0.374985276068
iteration 3600 with accuracy 0.37157121763
iteration 3800 with accuracy 0.373034959424
iteration 4000 with accuracy 0.384359196206
iteration 4200 with accuracy 0.379678644765
iteration 4400 with accuracy 0.389084105014
iter

iteration 36800 with accuracy 0.975074473643
iteration 37000 with accuracy 0.969348762437
iteration 37200 with accuracy 0.978024989382
iteration 37400 with accuracy 0.971303742603
iteration 37600 with accuracy 0.975863407537
iteration 37800 with accuracy 0.97913879482
iteration 38000 with accuracy 0.977636945738
iteration 38200 with accuracy 0.975222032072
iteration 38400 with accuracy 0.978903515074
iteration 38600 with accuracy 0.985686057062
iteration 38800 with accuracy 0.981980617502
iteration 39000 with accuracy 0.974886563419
iteration 39200 with accuracy 0.975182586038
iteration 39400 with accuracy 0.979315209573
iteration 39600 with accuracy 0.980977929781
iteration 39800 with accuracy 0.984154268067
iteration 40000 with accuracy 0.980057084959
iteration 40200 with accuracy 0.978767113446
iteration 40400 with accuracy 0.974359131256
iteration 40600 with accuracy 0.979156882611
iteration 40800 with accuracy 0.981158152787
iteration 41000 with accuracy 0.981840583148
iteration 4

In [None]:
# Measure the mean accuracy over 5000 more batches, without running the
# optimizer and with pkeep fed as 1.0.
# NOTE(review): the batches come from 'datastream', presumably the same
# generator used for training - this is not a held-out set; confirm.
# Start from zero: the moving average left in 'cumulative' by the training
# loop must not leak into the mean.
cumulative = 0.0
for i in range(5000):
    y, x = next(datastream)
    [acc] = sess.run([accuracy], feed_dict = { char_ids:x, labels:y, pkeep:1.0 })
    cumulative += acc / batch_size
    if i % 200 == 0:
        # running mean over the i + 1 batches seen so far
        print("iteration", i, "with mean accuracy", cumulative / (i + 1))