In [1]:
import numpy as np
import pandas as pd
from lxml import html
from itertools import izip, chain,islice

from passage.models import RNN
from passage.updates import Adadelta
from passage.updates import NAG, Regularizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import *

from sklearn import preprocessing

Using gpu device 0: GRID K520


In [2]:
class CharTokenize(Tokenizer):
    def __init__(self, max_features=9997, min_df=10, lowercase=True, character=False, charn=1):
        super(CharTokenize, self).__init__(max_features, min_df, lowercase, character)
        self.charn = charn
        
    def ntuples(self, lst, n):
        iters = izip(*[chain(islice(lst,i,None)) for i in range(n)])
        return [''.join(i) for i in iters]
    
    def fit(self, texts):
        if self.lowercase:
            texts = [text.lower() for text in texts]
        if self.character:
            tokens = [self.ntuples(list(text.decode("utf-8")), self.charn) for text in texts]
#             print tokens
        else:
            tokens = [tokenize(text) for text in texts]
        self.encoder = token_encoder(tokens, max_features=self.max_features-3, min_df=self.min_df)
        self.encoder['PAD'] = 0
        self.encoder['END'] = 1
        self.encoder['UNK'] = 2
        self.decoder = dict(zip(self.encoder.values(), self.encoder.keys()))
        self.n_features = len(self.encoder)
        return self
    
    def transform(self, texts):
        if self.lowercase:
            texts = [text.lower() for text in texts]
        if self.character:
            texts = [self.ntuples(list(text.decode("utf-8")), self.charn) for text in texts]
        else:
            texts = [tokenize(text) for text in texts]
        tokens = [[self.encoder.get(token, 2) for token in text] for text in texts]
        return tokens

In [3]:
trainFile = "../../train.txt"
testFile = "../../test.txt"
goldFile = "../../test-gold.txt"

In [4]:
tr_data = pd.read_csv(trainFile, encoding='utf-8', sep=r'\t+', header=None, names=['text', 'label'])
trX = tr_data['text'].values
trY = tr_data['label'].values

  if __name__ == '__main__':


In [5]:
tr_data['text'].shape

(252000,)

In [6]:
le = preprocessing.LabelEncoder()
trY_t = le.fit_transform(trY)
le.classes_
# trY_t.dtype = np.uint8

array(['bg', 'bs', 'cz', 'es-AR', 'es-ES', 'hr', 'id', 'mk', 'my', 'pt-BR',
       'pt-PT', 'sk', 'sr', 'xx'], dtype=object)

In [13]:
tokenizer = CharTokenize(character=True, charn=4, min_df=2, max_features=1000000)
trX_t = tokenizer.fit_transform(trX)
print("Training data tokenized.")

Training data tokenized.


In [14]:
tokenizer.n_features

372308

In [37]:
te_data = pd.read_csv(testFile, encoding='utf-8', sep=r'\t+', header=None, names=['text'])
teX = te_data['text'].values
# teX_t = tokenizer.transform(teX)

  if __name__ == '__main__':


In [18]:
le.classes_.shape

(14,)

In [7]:
# layers = [
#     Embedding(size=128, n_features=tokenizer.n_features),
#     GatedRecurrent(size=512, p_drop=0.4),
#     Dense(size=14, activation='softmax', p_drop=0.2)
# ]

# model = RNN(layers=layers, cost='cce', updater=Adadelta(lr=0.5))
# model.fit(trX_t, trY_t, n_epochs=10)

In [20]:
pr_trX = model.predict(trX_t)
print(np.mean(trY_t == np.argmax(pr_trX, axis=1)))

pr_teX = model.predict(teX_t)
classes = np.argmax(pr_teX, axis=1)

te_data['classes'] = le.inverse_transform(classes)
te_data.head()

gold_output = 'RNN_LSTM_1L.txt'
te_data.to_csv(gold_output, sep='\t', index=False, header=None)

0.993869047619


In [30]:
!python ../../evaluate.py RNN_LSTM_1L.txt ../../test-gold.txt

=== Results === 

Portugese
pt-BR: 848 / 1000 = 0.848
pt-PT: 943 / 1000 = 0.943

Bulgarian, Macedonian
bg: 1000 / 1000 = 1.0
mk: 998 / 1000 = 0.998

Spanish
es-ES: 855 / 1000 = 0.855
es-AR: 889 / 1000 = 0.889

Bosnian, Croatian, Serbian
bs: 807 / 1000 = 0.807
hr: 913 / 1000 = 0.913
sr: 921 / 1000 = 0.921

Malay, Indo
my: 981 / 1000 = 0.981
id: 984 / 1000 = 0.984

Czech, Slovak
cz: 983 / 1000 = 0.983
sk: 1000 / 1000 = 1.0

Others
xx: 992 / 1000 = 0.992

Overall: 13114 / 14000 = 0.936714285714



##3 grams

In [22]:
tokenizer3 = CharTokenize(character=True, charn=3, min_df=1, max_features=1000000)
trX_t3 = tokenizer3.fit_transform(trX)
teX_t3 = tokenizer3.transform(teX)

In [None]:
layers = [
    Embedding(size=128, n_features=tokenizer3.n_features),
    GatedRecurrent(size=512, p_drop=0.4),
    Dense(size=14, activation='softmax', p_drop=0.2)
]

model3 = RNN(layers=layers, cost='cce', updater=Adadelta(lr=0.5))
model3.fit(trX_t3, trY_t, n_epochs=10)

Epoch 0 Seen 5184 samples Avg cost 2.6376 Time left 1536 seconds

In [None]:
pr_trX3 = model3.predict(trX_t3)
print(np.mean(trY_t == np.argmax(pr_trX3, axis=1)))

pr_teX3 = model3.predict(teX_t3)
classes = np.argmax(pr_teX3, axis=1)

te_data['classes'] = le.inverse_transform(classes)
te_data.head()

gold_output = 'RNN_LSTM_C3G.txt'
te_data.to_csv(gold_output, sep='\t', index=False, header=None)

In [29]:
!python ../../evaluate.py RNN_LSTM_C3G.txt ../../test-gold.txt

=== Results === 

Portugese
pt-BR: 903 / 1000 = 0.903
pt-PT: 882 / 1000 = 0.882

Bulgarian, Macedonian
bg: 1000 / 1000 = 1.0
mk: 1000 / 1000 = 1.0

Spanish
es-ES: 830 / 1000 = 0.83
es-AR: 902 / 1000 = 0.902

Bosnian, Croatian, Serbian
bs: 837 / 1000 = 0.837
hr: 850 / 1000 = 0.85
sr: 930 / 1000 = 0.93

Malay, Indo
my: 946 / 1000 = 0.946
id: 985 / 1000 = 0.985

Czech, Slovak
cz: 997 / 1000 = 0.997
sk: 995 / 1000 = 0.995

Others
xx: 998 / 1000 = 0.998

Overall: 13055 / 14000 = 0.9325



##2gram

In [47]:
tokenizer2 = CharTokenize(character=True, charn=2, min_df=1, max_features=1000000)
trX_t2 = tokenizer2.fit_transform(trX)
teX_t2 = tokenizer2.transform(teX)

print tokenizer2.n_features

In [None]:
layers = [
    Embedding(size=128, n_features=tokenizer2.n_features),
    GatedRecurrent(size=512, p_drop=0.4),
    Dense(size=14, activation='softmax', p_drop=0.2)
]

model2 = RNN(layers=layers, cost='cce', updater=Adadelta(lr=0.5))
model2.fit(trX_t2, trY_t, n_epochs=10)

Epoch 0 Seen 47424 samples Avg cost 1.3115 Time left 1271 seconds

In [None]:
pr_trX2 = model2.predict(trX_t2)
print(np.mean(trY_t == np.argmax(pr_trX2, axis=1)))

pr_teX2 = model2.predict(teX_t2)
classes = np.argmax(pr_teX2, axis=1)

te_data['classes'] = le.inverse_transform(classes)
te_data.head()

gold_output = 'RNN_LSTM_C2G.txt'
te_data.to_csv(gold_output, sep='\t', index=False, header=None)

In [None]:
!python ../../evaluate.py RNN_LSTM_C2G.txt ../../test-gold.txt

##5 grams

In [None]:
tokenizer5 = CharTokenize(character=True, charn=5, min_df=2, max_features=1000000)
trX_t5 = tokenizer5.fit_transform(trX)
teX_t5 = tokenizer5.transform(teX)

print tokenizer5.n_features

In [None]:
layers = [
    Embedding(size=128, n_features=tokenizer5.n_features),
    GatedRecurrent(size=512, p_drop=0.4),
    Dense(size=14, activation='softmax', p_drop=0.2)
]

model5 = RNN(layers=layers, cost='cce', updater=Adadelta(lr=0.5))
model5.fit(trX_t5, trY_t, n_epochs=20)

Epoch 0 Seen 249520 samples Avg cost 0.4232 Time elapsed 1861 seconds
Epoch 1 Seen 499040 samples Avg cost 0.2166 Time elapsed 3724 seconds
Epoch 2 Seen 500896 samples Avg cost 0.1589 Time left 1848 seconds

In [None]:
pr_trX5 = model5.predict(trX_t5)
print(np.mean(trY_t == np.argmax(pr_trX5, axis=1)))

pr_teX5 = model5.predict(teX_t5)
classes = np.argmax(pr_teX5, axis=1)

te_data['classes'] = le.inverse_transform(classes)
te_data.head()

gold_output = 'RNN_LSTM_C5G.txt'
te_data.to_csv(gold_output, sep='\t', index=False, header=None)


In [None]:
!python ../../evaluate.py RNN_LSTM_C5G.txt ../../test-gold.txt

In [16]:
def getY(fl):
    data = pd.read_csv(fl, encoding='utf-8', sep=r'\t+', header=None, names=['text', 'label'])
    trY = data['label'].values
    return trY

In [12]:
def most_common(lst):
    return max(set(lst), key=lst.count)

In [17]:
# classes4 = np.argmax(pr_teX, axis=1)

# classes3 = np.argmax(pr_teX3, axis=1)

# classes2 = np.argmax(pr_teX2, axis=1)

In [22]:
c2g = getY('RNN_LSTM_C2G.txt')
c3g = getY('RNN_LSTM_C3G.txt')
c4g = getY('RNN_LSTM_1L.txt')
c5g = getY('RNN_LSTM_C5G.txt')

  from ipykernel import kernelapp as app


In [61]:
preds = zip(c5g, c5g, c3g, c4g, c2g)

In [62]:
preds = map(lambda x: most_common(x), preds)

In [63]:
te_data['classes'] = preds

gold_output = 'RNN_LSTM_ensemble.txt'
te_data.to_csv(gold_output, sep='\t', index=False, header=None)

In [64]:
!python ../../evaluate.py RNN_LSTM_ensemble.txt ../../test-gold.txt

=== Results === 

Portugese
pt-BR: 945 / 1000 = 0.945
pt-PT: 878 / 1000 = 0.878

Bulgarian, Macedonian
bg: 1000 / 1000 = 1.0
mk: 999 / 1000 = 0.999

Spanish
es-ES: 849 / 1000 = 0.849
es-AR: 926 / 1000 = 0.926

Bosnian, Croatian, Serbian
bs: 823 / 1000 = 0.823
hr: 893 / 1000 = 0.893
sr: 967 / 1000 = 0.967

Malay, Indo
my: 987 / 1000 = 0.987
id: 986 / 1000 = 0.986

Czech, Slovak
cz: 997 / 1000 = 0.997
sk: 1000 / 1000 = 1.0

Others
xx: 1000 / 1000 = 1.0

Overall: 13250 / 14000 = 0.946428571429

