In [8]:
import numpy as np
import pandas as pd
from lxml import html
from itertools import izip, chain,islice

from passage.models import RNN
from passage.updates import Adadelta
from passage.updates import NAG, Regularizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import *

from sklearn import preprocessing

In [9]:
class CharTokenize(Tokenizer):
    def __init__(self, max_features=9997, min_df=10, lowercase=True, character=False, charn=1):
        super(CharTokenize, self).__init__(max_features, min_df, lowercase, character)
        self.charn = charn
        
    def ntuples(self, lst, n):
        iters = izip(*[chain(islice(lst,i,None)) for i in range(n)])
        return [''.join(i) for i in iters]
    
    def fit(self, texts):
        if self.lowercase:
            texts = [text.lower() for text in texts]
        if self.character:
            tokens = [self.ntuples(list(text.decode("utf-8")), self.charn) for text in texts]
#             print tokens
        else:
            tokens = [tokenize(text) for text in texts]
        self.encoder = token_encoder(tokens, max_features=self.max_features-3, min_df=self.min_df)
        self.encoder['PAD'] = 0
        self.encoder['END'] = 1
        self.encoder['UNK'] = 2
        self.decoder = dict(zip(self.encoder.values(), self.encoder.keys()))
        self.n_features = len(self.encoder)
        return self
    
    def transform(self, texts):
        if self.lowercase:
            texts = [text.lower() for text in texts]
        if self.character:
            texts = [self.ntuples(list(text.decode("utf-8")), self.charn) for text in texts]
        else:
            texts = [tokenize(text) for text in texts]
        tokens = [[self.encoder.get(token, 2) for token in text] for text in texts]
        return tokens

In [2]:
trainFile = "../../train.txt"
testFile = "../../test.txt"
goldFile = "../../test-gold.txt"

In [10]:
tr_data = pd.read_csv(trainFile, encoding='utf-8', sep=r'\t+', header=None, names=['text', 'label'])
trX = tr_data['text'].values
trY = tr_data['label'].values

  if __name__ == '__main__':


In [11]:
tr_data['text'].shape

(252000,)

In [12]:
le = preprocessing.LabelEncoder()
trY_t = le.fit_transform(trY)
le.classes_
# trY_t.dtype = np.uint8

array(['bg', 'bs', 'cz', 'es-AR', 'es-ES', 'hr', 'id', 'mk', 'my', 'pt-BR',
       'pt-PT', 'sk', 'sr', 'xx'], dtype=object)

In [None]:
tokenizer = CharTokenize(character=True, charn=4, min_df=2, max_features=1000000)
trX_t = tokenizer.fit_transform(trX)
print("Training data tokenized.")

In [None]:
tokenizer.n_features

In [None]:
te_data = pd.read_csv(testFile, encoding='utf-8', sep=r'\t+', header=None, names=['text'])
teX = te_data['text'].values
teX_t = tokenizer.transform(teX)

In [None]:
le.classes_.shape

In [12]:
layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=512, p_drop=0.4),
    Dense(size=14, activation='softmax', p_drop=0.2)
]

model = RNN(layers=layers, cost='cce', updater=Adadelta(lr=0.5))
model.fit(trX_t, trY_t, n_epochs=10)

Epoch 0 Seen 249634 samples Avg cost 0.3268 Time elapsed 479 seconds
Epoch 1 Seen 499268 samples Avg cost 0.1723 Time elapsed 960 seconds
Epoch 2 Seen 748902 samples Avg cost 0.1182 Time elapsed 1440 seconds
Epoch 3 Seen 998536 samples Avg cost 0.0731 Time elapsed 1919 seconds
Epoch 4 Seen 1248170 samples Avg cost 0.0489 Time elapsed 2399 seconds
Epoch 5 Seen 1497804 samples Avg cost 0.0324 Time elapsed 2878 seconds
Epoch 6 Seen 1747438 samples Avg cost 0.0178 Time elapsed 3357 seconds
Epoch 7 Seen 1997072 samples Avg cost 0.0112 Time elapsed 3837 seconds
Epoch 8 Seen 2246706 samples Avg cost 0.0069 Time elapsed 4316 seconds
Epoch 9 Seen 2496340 samples Avg cost 0.0032 Time elapsed 4795 seconds


[array(2.6361067295074463, dtype=float32),
 array(2.636536121368408, dtype=float32),
 array(2.6347291469573975, dtype=float32),
 array(2.638124465942383, dtype=float32),
 array(2.6707448959350586, dtype=float32),
 array(2.6431689262390137, dtype=float32),
 array(2.6407206058502197, dtype=float32),
 array(2.635986804962158, dtype=float32),
 array(2.6389098167419434, dtype=float32),
 array(2.643094062805176, dtype=float32),
 array(2.6400084495544434, dtype=float32),
 array(2.6362595558166504, dtype=float32),
 array(2.6432933807373047, dtype=float32),
 array(2.649150848388672, dtype=float32),
 array(2.6480636596679688, dtype=float32),
 array(2.6455636024475098, dtype=float32),
 array(2.634331464767456, dtype=float32),
 array(2.6427783966064453, dtype=float32),
 array(2.6387321949005127, dtype=float32),
 array(2.6461181640625, dtype=float32),
 array(2.642733097076416, dtype=float32),
 array(2.6334338188171387, dtype=float32),
 array(2.6323883533477783, dtype=float32),
 array(2.637357234954

In [13]:
pr_trX = model.predict(trX_t)

In [14]:
print(np.mean(trY_t == np.argmax(pr_trX, axis=1)))

0.999202380952


In [None]:
pr_teX = model.predict(teX_t)

In [None]:
classes = np.argmax(pr_teX, axis=1)

In [None]:
te_data['classes'] = le.inverse_transform(classes)
te_data.head()

In [None]:
gold_output = 'RNN_LSTM_1L.txt'
te_data.to_csv(gold_output, sep='\t', index=False, header=None)

In [None]:
!python evaluate.py RNN_LSTM_1L.txt test-gold.txt