In [1]:
import os
from os.path import dirname, realpath, sep, pardir
import sys
# Make the parent of the working directory importable.
sys.path.append(sep.join((realpath("./"), pardir)))

In [2]:
from importlib import import_module

In [3]:
# Select the Keras backend through the KERAS_BACKEND environment variable.
# NOTE(review): this is presumably read when keras is first imported, so the
# assignment has to stay above the imports below - confirm before reordering.
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras_aquarium as ka
# The star import is what brings `hatt_rnn` (used by later cells) into scope.
from keras_aquarium import *
from keras import backend as K

Using TensorFlow backend.


In [4]:
import numpy as np
from scipy.sparse import *

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, defaultdict
from sklearn.datasets import fetch_20newsgroups, fetch_20newsgroups_vectorized
import itertools

In [6]:
# Fetch the 'train' and 'test' subsets of the 20 Newsgroups corpus, in that order.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)

In [7]:
# Raw documents and their class labels from the training split.
texts, targets = newsgroups_train.data, newsgroups_train.target


In [8]:
class TextIndicesTransformer(object):
    """Encode raw texts as lists of integer word indices.

    Tokens are produced by the analyzer of a ``CountVectorizer`` whose token
    pattern accepts words of two or more characters as well as runs of dots.
    Tokens that occur in fewer than ``min_df`` documents are dropped. The
    remaining tokens are numbered ``1..len(vocabulary)`` in order of first
    appearance, so index 0 is never produced and the largest index always
    equals ``len(vocabulary)``.

    With ``sents_mode=True`` every document is returned as a list of
    sentences (each a list of indices): the "." token separates sentences and
    is not emitted itself, and only sentences with more than
    ``MIN_SENT_TOKENS`` known tokens are kept. With ``sents_mode=False``
    every document is returned as one flat list of indices.

    Attributes set by ``fit_transform``:
        analyze: the analyzer callable used to tokenize documents.
        vocabulary: plain dict mapping token -> index.
        df_counter: mapping index -> document frequency of that token.
        sents_sep: set of indices treated as sentence separators
            (empty when sentence mode is off or "." was pruned).
    """

    # Token that marks a sentence boundary when sents_mode is enabled.
    SENT_SEP_TOKEN = "."
    # A sentence is kept only if it has more than this many known tokens.
    MIN_SENT_TOKENS = 3

    def __init__(self, min_df=4, sents_mode=True):
        """Create the transformer.

        Args:
            min_df: minimum number of documents a token must appear in to be
                kept in the vocabulary.
            sents_mode: if True, split every document into sentences.
        """
        self.cv = CountVectorizer(stop_words=None, min_df=min_df, token_pattern=u'(?u)\\b\\w\\w+\\b'+u"|[\\.]+")
        self.min_df = min_df
        self.sents_mode = sents_mode

    def fit(self, texts):
        """Fit the vocabulary on ``texts``.

        Kept for backward compatibility: this returns the encoded texts
        (exactly what ``fit_transform`` returns), not ``self``.
        """
        return self.fit_transform(texts)

    def _encode(self, words):
        """Map one tokenized document to indices using the fitted vocabulary."""
        vocabulary = self.vocabulary
        # Unknown / pruned tokens are silently skipped.
        indices = [vocabulary[word] for word in words if word in vocabulary]
        if not self.sents_mode:
            return indices

        separators = self.sents_sep
        # groupby yields alternating runs of separator / non-separator
        # indices; only the non-separator runs are sentences.
        sentences = [
            list(run)
            for is_separator, run in itertools.groupby(indices, lambda idx: idx in separators)
            if not is_separator
        ]
        return [sent for sent in sentences if len(sent) > self.MIN_SENT_TOKENS]

    def fit_transform(self, texts):
        """Build the vocabulary from ``texts`` and return them encoded.

        Args:
            texts: iterable of raw documents (consumed once).

        Returns:
            A list with one entry per document: a list of sentences (lists of
            indices) in sentence mode, otherwise a flat list of indices.
        """
        analyze = self.cv.build_analyzer()
        min_df = self.cv.min_df

        doc_freq = defaultdict(int)
        first_seen = []
        known = set()
        if self.sents_mode:
            # Register the separator first so that it receives index 1
            # whenever it survives pruning.
            first_seen.append(self.SENT_SEP_TOKEN)
            known.add(self.SENT_SEP_TOKEN)

        tokenized = []
        for doc in texts:
            words = analyze(doc)
            tokenized.append(words)
            for word in words:
                if word not in known:
                    known.add(word)
                    first_seen.append(word)
            # Document frequency: count each token once per document.
            for word in set(words):
                doc_freq[word] += 1

        # Number only the surviving tokens so the indices stay contiguous.
        vocabulary = {}
        for word in first_seen:
            if word in doc_freq and doc_freq[word] >= min_df:
                vocabulary[word] = len(vocabulary) + 1

        df_counter = defaultdict(int)
        for word, idx in vocabulary.items():
            df_counter[idx] = doc_freq[word]

        self.analyze = analyze
        self.vocabulary = vocabulary
        self.df_counter = df_counter
        self.sents_sep = set()
        if self.sents_mode and self.SENT_SEP_TOKEN in vocabulary:
            self.sents_sep.add(vocabulary[self.SENT_SEP_TOKEN])

        return [self._encode(words) for words in tokenized]

    def transform(self, texts):
        """Encode ``texts`` with the vocabulary learned by ``fit_transform``.

        Tokens outside the vocabulary are skipped. The output has the same
        structure as the output of ``fit_transform`` (sentences in sentence
        mode, flat index lists otherwise).
        """
        analyze = self.analyze
        return [self._encode(analyze(doc)) for doc in texts]


In [9]:
# Keep tokens that occur in at least 5 documents and split documents into sentences.
ttt = TextIndicesTransformer(
    min_df=5,
    sents_mode=True,
)

In [10]:
# Build the vocabulary from the training texts and encode them as word
# indices (one list of sentences per document, since sents_mode is on).
docs = ttt.fit_transform(texts)

In [11]:
# Initial sizes; all three are recomputed from the corpus in the next cell.
max_sents, max_sent_length, n_words = 40, 70, 100

In [12]:

# Number of sentences in every document, kept as a real list so it can be
# measured, sorted and summed on both Python 2 and Python 3.
sents_length_li = [len(doc) for doc in docs]
print(len(sents_length_li))
# Sentence budget: the 500th largest sentence count (clamped for corpora
# with fewer than 500 documents), so only the longest documents are cut.
max_sents = sorted(sents_length_li)[-min(500, len(sents_length_li))]
# Average number of sentences per document.
print(1. * sum(sents_length_li) / len(sents_length_li))
# Longest sentence of every document; a document with no kept sentence counts as 0.
sents = [max(len(sent) for sent in doc) if doc else 0 for doc in docs]
# Word budget per sentence: the 1000th largest per-document maximum (clamped likewise).
max_sent_length = sorted(sents)[-min(1000, len(sents))]
# NOTE(review): vocabulary indices start at 1 - confirm whether the embedding
# built from n_words needs len(vocabulary) + 1 rows.
n_words = len(ttt.vocabulary)

11314
15.5169701255


In [13]:
# Dense inputs and labels for the (commented-out) plain model.fit path below.
# NOTE(review): presumably pads/truncates to max_sents x max_sent_length and
# one-hot encodes the targets - confirm in keras_aquarium.hatt_rnn.
X = hatt_rnn.padding_docs(
    docs,
    max_sents,
    max_sent_length,
)
Y = hatt_rnn.to_categorical(
    targets,
)

In [14]:
# K.expand_dims()


In [15]:
# Batch source handed to hatt_rnn.train_model further down.
dgr = hatt_rnn.generate_dataset(
    docs,
    targets,
    max_sents,
    max_sent_length,
    batch_size=64,
)

In [16]:
# Hierarchical attention model sized from the corpus statistics computed above;
# 20 output classes, 100-dimensional word vectors.
model = hatt_rnn.HierarchicalAttentionRNN(
    max_sents,
    max_sent_length,
    n_classes=20,
    n_words=n_words,
    word_dim=100,
)
# K.squeeze()

hit.shape: (None, 68, 200)
ai_sum.shape: (None, 1)
weights.shape: (None, 68)
ai_sum.shape: (None, 1)
weights.shape: (None, 68)
ai_sum.shape: (None, 1)
weights.shape: (None, 68)
weighted_input.shape: (None, 200)
hit.shape: (None, None, 200)
ai_sum.shape: (None, 1)
weights.shape: (None, None)
ai_sum.shape: (None, 1)
weights.shape: (None, None)
ai_sum.shape: (None, 1)
weights.shape: (None, None)
weighted_input.shape: (None, 200)
hit.shape: (None, 42, 200)
ai_sum.shape: (None, 1)
weights.shape: (None, 42)
ai_sum.shape: (None, 1)
weights.shape: (None, 42)
ai_sum.shape: (None, 1)
weights.shape: (None, 42)
weighted_input.shape: (None, 200)


In [17]:
# model.fit(X, Y, epochs=5, verbose=2)
# Train from the batch source instead: 2 epochs of 160 steps each.
hatt_rnn.train_model(
    model,
    dgr,
    steps_per_epoch=160,
    epochs=2,
    verbose=2,
)

Epoch 1/2
Epoch 2/2


<keras.engine.training.Model at 0x7fbeb3ba7b50>

In [18]:
# NOTE(review): presumably one vector of sentence-level attention weights per
# document - confirm in keras_aquarium.hatt_rnn.get_sent_weights.
sent_weights = hatt_rnn.get_sent_weights(model, docs)

In [20]:
# Weights of the first document; the parenthesised form prints the same
# thing under Python 2 and is also valid Python 3.
print(sent_weights[0])

[  2.32648432e-01   5.23698926e-01   1.79654241e-01   2.51812488e-03
   1.06493675e-03   3.72392475e-03   5.62757850e-02   9.92439527e-05
   1.63925652e-05   9.71007012e-06   7.62801392e-06   6.76895252e-06
   6.38542997e-06   6.23405913e-06   6.20840456e-06   6.25271059e-06
   6.33676791e-06   6.44111378e-06   6.55375607e-06   6.66773576e-06
   6.77947992e-06   6.88756154e-06   6.99176599e-06   7.09273991e-06
   7.19112404e-06   7.28702616e-06   7.38198514e-06   7.47777540e-06
   7.57651651e-06   7.68072823e-06   7.79403035e-06   7.92255469e-06
   8.07534980e-06   8.26596442e-06   8.51474215e-06   8.85901591e-06
   9.35994012e-06   1.01346814e-05   1.14500126e-05   1.40300190e-05
   2.00968825e-05   3.80877100e-05]
