In [2]:
import numpy as np
import pandas as pd
from lxml import html
from itertools import izip, chain,islice

from passage.models import RNN
from passage.updates import Adadelta
from passage.updates import NAG, Regularizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import *

from sklearn import preprocessing
from sklearn.cross_validation import train_test_split


Using gpu device 0: GRID K520


In [3]:
class CharTokenize(Tokenizer):
    """Tokenizer that can encode texts as overlapping character n-grams.

    With ``character=True`` each text is split into Unicode characters and
    every run of ``charn`` consecutive characters becomes one token; with
    ``character=False`` texts are split by ``tokenize``.

    ``fit`` builds ``self.encoder`` (token -> id), ``self.decoder``
    (id -> token) and ``self.n_features``; the entries 'PAD', 'END' and 'UNK'
    are mapped to ids 0, 1 and 2.

    NOTE(review): ``self.max_features``, ``self.min_df``, ``self.lowercase``
    and ``self.character`` are assumed to be stored by the base-class
    ``__init__`` -- confirm against ``Tokenizer``.
    """

    def __init__(self, max_features=9997, min_df=10, lowercase=True, character=False, charn=1):
        super(CharTokenize, self).__init__(max_features, min_df, lowercase, character)
        # Number of consecutive characters joined into one token.
        self.charn = charn

    def ntuples(self, lst, n):
        """Return every run of ``n`` consecutive items of ``lst`` as one string.

        ``lst`` is a sequence of strings (here: single characters). A sequence
        shorter than ``n`` yields an empty list.
        """
        # The i-th slice starts i items in; zipping them stops at the shortest
        # slice, so only complete n-item windows are produced.
        windows = zip(*[islice(lst, start, None) for start in range(n)])
        return [''.join(window) for window in windows]

    def _tokenize_texts(self, texts):
        """Convert raw texts to lists of tokens (shared by fit and transform)."""
        if not self.character:
            if self.lowercase:
                texts = [text.lower() for text in texts]
            return [tokenize(text) for text in texts]

        token_lists = []
        for text in texts:
            # Accept both UTF-8 byte strings and already-decoded text; calling
            # .decode() on decoded text fails for non-ASCII content.
            if isinstance(text, bytes):
                text = text.decode("utf-8")
            # Lowercase after decoding: lowercasing raw UTF-8 bytes leaves
            # non-ASCII letters (e.g. Cyrillic) unchanged.
            if self.lowercase:
                text = text.lower()
            token_lists.append(self.ntuples(list(text), self.charn))
        return token_lists

    def fit(self, texts):
        """Build the token vocabulary from ``texts`` and return ``self``."""
        tokens = self._tokenize_texts(texts)
        # Three entries of the feature budget go to the special tokens below.
        self.encoder = token_encoder(tokens, max_features=self.max_features-3, min_df=self.min_df)
        self.encoder['PAD'] = 0
        self.encoder['END'] = 1
        self.encoder['UNK'] = 2
        self.decoder = dict(zip(self.encoder.values(), self.encoder.keys()))
        self.n_features = len(self.encoder)
        return self

    def transform(self, texts):
        """Encode ``texts`` as lists of token ids; unseen tokens map to 2 ('UNK')."""
        token_lists = self._tokenize_texts(texts)
        return [[self.encoder.get(token, 2) for token in tokens] for tokens in token_lists]

In [4]:
# Corpus file locations, relative to the notebook's working directory.
# Only trainFile is read in the cells shown here.
trainFile, testFile, goldFile = (
    "../../train.txt",
    "../../test.txt",
    "../../test-gold.txt",
)

In [5]:
# Load the training corpus: no header row, two columns ('text', 'label')
# separated by one or more tabs (regex separator, hence engine='python').
tr_data = pd.read_csv(
    trainFile,
    sep=r'\t+',
    header=None,
    names=['text', 'label'],
    encoding='utf-8',
    engine='python',
)

In [6]:
# One group per label. A scalar key (not a one-element list) keeps the group
# names plain labels when iterating, on every pandas version.
grps = tr_data.groupby('label')

In [7]:
# Per-label 90/10 split with a fixed seed, so every label contributes the
# same proportion of rows to the training and validation frames.
train_parts = []
validation_parts = []

for name, grp in grps:
    print("%s %s" % (name, grp.shape))
    train, test = train_test_split(grp, test_size=0.1, random_state=42)
    train_parts.append(train)
    validation_parts.append(test)

# Concatenate once at the end: appending to a DataFrame inside the loop
# re-copies all accumulated rows on every iteration.
if train_parts:
    train_df = pd.concat(train_parts, ignore_index=True)
    validation_df = pd.concat(validation_parts, ignore_index=True)
else:
    # No groups at all: keep the empty two-column frames.
    train_df = pd.DataFrame(columns=['text', 'label'])
    validation_df = pd.DataFrame(columns=['text', 'label'])

bg (18000, 2)
bs (18000, 2)
cz (18000, 2)
es-AR (18000, 2)
es-ES (18000, 2)
hr (18000, 2)
id (18000, 2)
mk (18000, 2)
my (18000, 2)
pt-BR (18000, 2)
pt-PT (18000, 2)
sk (18000, 2)
sr (18000, 2)
xx (18000, 2)


In [8]:
# Sanity check: (rows, columns) of the training and validation frames.
print("%s %s" % (train_df.shape, validation_df.shape))

(226800, 2) (25200, 2)


In [9]:
# Raw training inputs and their string labels as NumPy arrays.
trX = train_df.loc[:, 'text'].values
trY = train_df.loc[:, 'label'].values

In [10]:
# Map the string labels to integer class ids; le.classes_ lists the labels
# in id order.
le = preprocessing.LabelEncoder().fit(trY)
trY_t = le.transform(trY)
print(le.classes_)

['bg' 'bs' 'cz' 'es-AR' 'es-ES' 'hr' 'id' 'mk' 'my' 'pt-BR' 'pt-PT' 'sk'
 'sr' 'xx']


In [None]:
# Character 5-gram vocabulary fitted on the training texts, which are then
# encoded as sequences of token ids.
tokenizer5 = CharTokenize(
    max_features=1000000,
    min_df=2,
    character=True,
    charn=5,
)
trX_t5 = tokenizer5.fit_transform(trX)

# Vocabulary size; used as the embedding layer's n_features.
print(tokenizer5.n_features)

In [None]:
# Network: token embedding -> gated recurrent layer -> softmax over labels.
layers = [
    Embedding(size=128, n_features=tokenizer5.n_features),
    GatedRecurrent(size=768, p_drop=0.45),
    # One output unit per encoded label (14 for this corpus), taken from the
    # label encoder instead of a hard-coded count.
    Dense(size=len(le.classes_), activation='softmax', p_drop=0.2)
]

In [19]:
# Train the 5-gram model with categorical cross-entropy ('cce') and Adadelta.
model5 = RNN(
    layers=layers,
    cost='cce',
    updater=Adadelta(lr=0.5),
)
model5.fit(trX_t5, trY_t, n_epochs=25)

Epoch 0 Seen 26944 samples Avg cost 2.6650 Time left 2365 seconds

KeyboardInterrupt: 