In [6]:
import numpy as np
import pandas as pd
from lxml import html
from itertools import izip, chain,islice

from passage.models import RNN
from passage.updates import Adadelta
from passage.updates import NAG, Regularizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import *

from sklearn import preprocessing

Using gpu device 0: GRID K520


In [7]:
class CharTokenize(Tokenizer):
    """Tokenizer producing either word tokens or character n-grams.

    With ``character=True`` every text is split into overlapping character
    n-grams of length ``charn``; otherwise texts are split with ``tokenize``.
    Token ids 0, 1 and 2 are reserved for the special tokens ``PAD``, ``END``
    and ``UNK``; tokens unknown to the fitted vocabulary are encoded as ``UNK``.

    Parameters
    ----------
    max_features, min_df, lowercase, character
        Forwarded unchanged to the base ``Tokenizer`` constructor.
    charn : int
        Length of the character n-grams (only used when ``character=True``).
    """

    # Ids reserved for the special tokens added on top of the learned vocabulary.
    PAD_ID, END_ID, UNK_ID = 0, 1, 2

    def __init__(self, max_features=9997, min_df=10, lowercase=True, character=False, charn=1):
        super(CharTokenize, self).__init__(max_features, min_df, lowercase, character)
        self.charn = charn

    def ntuples(self, lst, n):
        """Return the overlapping n-grams of ``lst`` as joined strings.

        A sequence shorter than ``n`` yields an empty list.
        """
        # The built-in zip gives the same result as izip here and also exists on Python 3.
        shifted = [islice(lst, start, None) for start in range(n)]
        return [''.join(gram) for gram in zip(*shifted)]

    @staticmethod
    def _as_unicode(text):
        """Decode UTF-8 byte strings; return already-decoded text untouched.

        Calling ``.decode`` on text that is already unicode (e.g. read through
        ``pd.read_csv(..., encoding='utf-8')``) fails on non-ASCII characters
        under Python 2 and does not exist under Python 3.
        """
        if isinstance(text, bytes):
            return text.decode("utf-8")
        return text

    def _split(self, texts):
        """Apply the optional lowercasing and split every text into tokens."""
        if self.lowercase:
            texts = [text.lower() for text in texts]
        if self.character:
            return [self.ntuples(list(self._as_unicode(text)), self.charn) for text in texts]
        return [tokenize(text) for text in texts]

    def fit(self, texts):
        """Learn the token vocabulary from ``texts`` and return ``self``.

        Sets ``encoder`` (token -> id), ``decoder`` (id -> token) and
        ``n_features`` (vocabulary size including the three special tokens).
        """
        tokens = self._split(texts)
        # Three ids are reserved for PAD/END/UNK, hence max_features - 3.
        self.encoder = token_encoder(tokens, max_features=self.max_features-3, min_df=self.min_df)
        self.encoder['PAD'] = self.PAD_ID
        self.encoder['END'] = self.END_ID
        self.encoder['UNK'] = self.UNK_ID
        self.decoder = dict(zip(self.encoder.values(), self.encoder.keys()))
        self.n_features = len(self.encoder)
        return self

    def transform(self, texts):
        """Encode ``texts`` as lists of token ids using the fitted vocabulary."""
        unknown = self.UNK_ID
        return [[self.encoder.get(token, unknown) for token in text]
                for text in self._split(texts)]

In [8]:
# Path of the training corpus, relative to the notebook's working directory.
trainFile = '../../comb_bsc.csv'

In [9]:
# Load the training corpus: one text and one label per row, separated by one
# or more tabs. A regex separator is only supported by pandas' python parser;
# requesting it explicitly avoids the warning about abandoning the C engine.
comb_data = pd.read_csv(trainFile, encoding='utf-8', sep=r'\t+', header=None,
                        names=['text', 'label'], engine='python')
combX = comb_data['text'].values
combY = comb_data['label'].values

  if __name__ == '__main__':


In [10]:
# Map the string labels to integer class ids. The fitted encoder is kept so
# that predicted ids can be turned back into label names later on.
le = preprocessing.LabelEncoder().fit(combY)
combY_t = le.transform(combY)
le.classes_

array(['bs', 'hr', 'sr'], dtype=object)

## Word unigrams

In [7]:
# Word-level tokenizer (character defaults to False) with a vocabulary cap of
# one million entries and min_df=2.
tokenizer_w = CharTokenize(max_features=1000000, min_df=2)

In [8]:
# Learn the word vocabulary from the training texts and encode them.
# NOTE(review): fit_transform is inherited from passage's Tokenizer and is not
# visible here; presumably it is fit() followed by transform() - confirm.
combX_w = tokenizer_w.fit_transform(combX)

  if t in punctuation:


In [15]:
# Vocabulary size including the three reserved tokens (PAD/END/UNK).
tokenizer_w.n_features

726488

In [None]:
# Word-level network: embedding sized to the word vocabulary, one gated
# recurrent layer, and a 3-unit softmax output.
# NOTE(review): the name `layers` is assigned again further down for the
# 4-gram model, so this cell has to be re-run before rebuilding the word model.
layers = [
    Embedding(size=128, n_features=tokenizer_w.n_features),
    GatedRecurrent(size=512, p_drop=0.4),
    Dense(size=3, activation='softmax', p_drop=0.2)
]

In [None]:
# Build and train the word-level classifier on the encoded training texts.
# NOTE(review): 'cce' is presumably categorical cross-entropy - confirm against
# the passage documentation. No random seed is set anywhere in the notebook.
model_w = RNN(layers=layers, cost='cce', updater=Adadelta(lr=0.5))
model_w.fit(combX_w, combY_t, n_epochs=10)

Epoch 0 Seen 501053 samples Avg cost 0.1441 Time elapsed 2155 seconds
Epoch 1 Seen 1002106 samples Avg cost 0.1196 Time elapsed 4309 seconds
Epoch 2 Seen 1503159 samples Avg cost 0.0929 Time elapsed 6463 seconds
Epoch 3 Seen 2004212 samples Avg cost 0.0757 Time elapsed 8617 seconds
Epoch 4 Seen 2505265 samples Avg cost 0.0659 Time elapsed 10771 seconds
Epoch 5 Seen 3006318 samples Avg cost 0.0544 Time elapsed 12925 seconds
Epoch 6 Seen 3313198 samples Avg cost 0.0489 Time left 834 seconds

## 4-grams

In [11]:
import pickle

In [12]:
# Load the previously fitted 4-gram tokenizer. The `with` block closes the
# file even when unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code - only load tokenizer
# files you produced yourself.
with open('../../tokenizer4.pkl', 'rb') as pkl_file:
    tokenizer4 = pickle.load(pkl_file)

In [13]:
# Vocabulary size of the unpickled tokenizer; sizes the embedding of the 4-gram model.
tokenizer4.n_features

858544

In [None]:
# Encode the training texts with the already-fitted 4-gram tokenizer (no refit).
combX_4 = tokenizer4.transform(combX)

In [None]:
# Spot-check the encoding of the second training text.
combX_4[1]

In [None]:
# 4-gram network: same architecture as the word-level one, with the embedding
# sized to the 4-gram vocabulary.
# NOTE(review): this rebinds `layers`, which the word-level model cell also uses.
layers = [
    Embedding(size=128, n_features=tokenizer4.n_features),
    GatedRecurrent(size=512, p_drop=0.4),
    Dense(size=3, activation='softmax', p_drop=0.2)
]

In [None]:
# Build and train the 4-gram classifier with the same cost, updater and epoch
# count as the word-level model.
model_4 = RNN(layers=layers, cost='cce', updater=Adadelta(lr=0.5))
model_4.fit(combX_4, combY_t, n_epochs=10)

## Combined model

In [24]:
# Load the gold-standard test set (text and label separated by one or more
# tabs). A regex separator is only supported by pandas' python parser;
# requesting it explicitly avoids the warning about abandoning the C engine.
g_data = pd.read_csv('../../test-gold.txt', encoding='utf-8', sep=r'\t+', header=None,
                     names=['text', 'label'], engine='python')
gX = g_data['text'].values
gY = g_data['label'].values

  if __name__ == '__main__':


In [16]:
def getY(fl):
    """Return the ``label`` column of a headerless, tab-separated file.

    Parameters
    ----------
    fl : str
        Path of a UTF-8 file whose rows hold a text field and a label field
        separated by one or more tabs.

    Returns
    -------
    numpy.ndarray
        The label of every row, in file order.
    """
    # A regex separator is only supported by pandas' python parser; requesting
    # it explicitly avoids the warning about abandoning the C engine.
    data = pd.read_csv(fl, encoding='utf-8', sep=r'\t+', header=None,
                       names=['text', 'label'], engine='python')
    return data['label'].values

In [20]:
# Load the labels predicted by each base model and one-hot encode them as
# stacking features. Each file is read exactly once.
# NOTE(review): the 4-gram predictions come from 'RNN_LSTM_1L.txt', which does
# not follow the C<n>G naming of the other files - confirm it is the right file.
# NOTE(review): get_dummies only creates columns for labels that occur in a
# file, so the column layout depends on what each model predicted.
c2g = getY('RNN_LSTM_C2G.txt')
c2g_d = pd.get_dummies(c2g)
c3g = getY('RNN_LSTM_C3G.txt')
c3g_d = pd.get_dummies(c3g)
c4g = getY('RNN_LSTM_1L.txt')
c4g_d = pd.get_dummies(c4g)
c5g = getY('RNN_LSTM_C5G.txt')
c5g_d = pd.get_dummies(c5g)

  from ipykernel import kernelapp as app


In [21]:
# Place the one-hot predictions of the four base models side by side, one
# group of columns per model.
# NOTE(review): column labels repeat across the groups, so select columns by
# position rather than by name.
feats = pd.concat([c2g_d, c3g_d, c4g_d, c5g_d], axis=1)

In [22]:
# Plain array view of the stacked features, used as input of the combining model.
Xtrain = feats.values

In [26]:
# Separate encoder for the gold labels, which cover more classes than the
# three-language training corpus.
le2 = preprocessing.LabelEncoder().fit(gY)
Ytrain = le2.transform(gY)
le2.classes_

array(['bg', 'bs', 'cz', 'es-AR', 'es-ES', 'hr', 'id', 'mk', 'my', 'pt-BR',
       'pt-PT', 'sk', 'sr', 'xx'], dtype=object)

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_curve

In [28]:
# Combining model: logistic regression over the stacked base-model predictions.
# NOTE(review): the targets are the gold test labels (Ytrain is built from gY),
# so this model is fitted on the same data it is scored on below.
model = LogisticRegression()
model.fit(Xtrain, Ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [29]:
# Accuracy of the combining model on the very data it was fitted on, i.e. a
# training accuracy rather than a held-out estimate.
accuracy_score(y_true=Ytrain, y_pred=model.predict(Xtrain))

0.94864285714285712

In [33]:
# Predicted class ids of the combining model, and the same predictions mapped
# back to label strings.
preds = model.predict(Xtrain)
inv_preds = le2.inverse_transform(preds)

In [61]:
# Boolean mask of the rows that the combining model assigned to hr, bs or sr.
# Despite the name it marks predicted group membership, not errors.
err_idx = (inv_preds == 'hr') | (inv_preds == 'bs') | (inv_preds == 'sr')

In [62]:
# Texts of the rows selected by the mask.
gX_err = gX[err_idx]

In [65]:
# Gold labels of the rows selected by the mask.
gY_err = gY[err_idx]

In [69]:
# Encode the selected texts with the word-level tokenizer fitted on the training corpus.
gX_err_t = tokenizer_w.transform(gX_err)

In [74]:
# Re-classify the selected texts with the word-level model, taking the index
# of the highest score in each row as the predicted class id.
preds_err = np.argmax(model_w.predict(gX_err_t), axis=1)

In [81]:
# Accuracy of the word-level model on the selected rows. Its class ids are
# decoded with `le`, the encoder fitted on the training labels.
accuracy_score(
    y_true=gY_err,
    y_pred=le.inverse_transform(preds_err),
)

0.49766666666666665

In [97]:
# Accuracy of the combining model on the same selected rows, for comparison
# with the word-level model.
accuracy_score(
    y_true=gY_err,
    y_pred=le2.inverse_transform(model.predict(Xtrain)[err_idx]),
)

0.89866666666666661

In [96]:
# Number of misclassified rows implied by the accuracy above.
# NOTE(review): 0.8986666 is copied by hand from the previous output and 3000
# is presumably len(gY_err) - confirm, or compute both from the data.
(1-0.8986666)*3000

304.00020000000006

In [94]:
# Sanity check that six hand-entered counts add up to the error total.
# NOTE(review): presumably these are off-diagonal confusion-matrix cells; their
# source is not shown in the notebook.
105+69+78+9+36+7

304