In [1]:
from data import preparation
import data.hierarchy as hie
from sklearn.model_selection import train_test_split

In [2]:
# Convert the raw hierarchy file into a flat edge list.
# Each input line is parsed as "<parent>:<child1>,<child2>,..." and every
# (parent, child) pair is written out as one "<parent> <child>" line.
#
# The input is opened first so that a missing source file does not leave a
# freshly truncated, empty hierarchy.txt behind.
with open('data/wipo_d/wipo_d.ht') as source, \
        open('data/wipo_d/hierarchy.txt', 'w') as target:
    for line in source:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) contain no
            # ':' separator and would otherwise raise IndexError below.
            continue
        fields = line.split(':')
        parent = fields[0]
        for child in fields[1].split(','):
            target.write('%s %s\n' % (parent, child))

In [3]:
# Dataset identifier; the next cell interpolates it into the hierarchy
# text/pickle paths.
data_name = "wipo_d"

In [4]:
# Re-index the edge-list hierarchy and persist the five resulting structures
# in a single pickle next to it.
# NOTE(review): the conversion cell above writes 'data/wipo_d/hierarchy.txt'
# while this path has no 'data/' prefix -- presumably the `hie` helpers
# resolve paths relative to the data directory; confirm.
hierarchy, parent_of, all_name, name_to_index, level = hie.reindex_hierarchy(
    '%s/hierarchy.txt' % data_name
)
hie.save_hierarchy(
    "%s/hierarchy.pickle" % data_name,
    hierarchy,
    parent_of,
    all_name,
    name_to_index,
    level,
)

In [5]:
# Convert the five raw folds into pickles with re-indexed labels.
# For each fold the training file is split into train/validate parts; the
# test file is saved unchanged.
#
# All paths are derived from `data_name` so that this cell reads the same
# hierarchy pickle the previous cell wrote, instead of a hard-coded dataset.
# NOTE(review): raw folds are read from '<data_name>/folds/' but results are
# written to '<data_name>/fold/' -- confirm the differing directory names are
# intended.
for mode in ['train', 'test']:
    for i in range(1, 6):
        datas, labels = preparation.import_data(
            '%s/folds/%s_fold%d.dat.%s' % (data_name, data_name, i, mode))
        new_labels = preparation.map_index_of_label(
            '%s/hierarchy.pickle' % data_name, labels)
        if mode == 'train':
            # Hold out 10% of the training fold for validation; the fixed
            # random_state keeps the split reproducible across runs.
            train_data, validate_data, train_target, validate_target = train_test_split(
                datas, new_labels, test_size=0.1, random_state=12345)
            preparation.save_data_in_pickle(
                '%s/fold/data_%d.pickle.%s' % (data_name, i, mode),
                train_data, train_target)
            preparation.save_data_in_pickle(
                '%s/fold/data_%d.pickle.%s' % (data_name, i, "validate"),
                validate_data, validate_target)
        else:
            preparation.save_data_in_pickle(
                '%s/fold/data_%d.pickle.%s' % (data_name, i, mode),
                datas, new_labels)

# Real Flow: Doc2Vec embedding, classifier training, and evaluation

In [6]:
from data import Dataset
from embedding import Doc2Vec
from assemble_classifier import AssembleNoLabel, AssemblePredicted
%load_ext autoreload
%autoreload 2

In [7]:
# Load the train / validate / test splits of fold 1, in that order.
dataset_train, dataset_validate, dataset_test = (
    Dataset("wipo_d", 1, split_name)
    for split_name in ("train", "validate", "test")
)

In [8]:
# Build the Doc2Vec embedder and fit it on the training split, passing the
# validation split alongside. The fit call stays the last expression so the
# cell still displays its return value.
doc2vec = Doc2Vec(
    dataset_train.number_of_classes(),
    size=100,
    epoch=500,
)
doc2vec.fit(
    dataset_train.datas,
    dataset_train.labels,
    dataset_validate.datas,
    dataset_validate.labels,
)

Doc2Vec by Gensim
Epoch: 10 Similar: 0.06
Epoch: 20 Similar: 0.02
Epoch: 30 Similar: 0.08
Epoch: 40 Similar: 0.13
Epoch: 50 Similar: 0.16
Epoch: 60 Similar: 0.17
Epoch: 70 Similar: 0.18
Epoch: 80 Similar: 0.19
Epoch: 90 Similar: 0.19
Epoch: 100 Similar: 0.20
Epoch: 110 Similar: 0.20
Epoch: 120 Similar: 0.20
Epoch: 130 Similar: 0.21
Epoch: 140 Similar: 0.22
Epoch: 150 Similar: 0.23
Epoch: 160 Similar: 0.23
Epoch: 170 Similar: 0.24
Epoch: 180 Similar: 0.24
Epoch: 190 Similar: 0.25
Epoch: 200 Similar: 0.26
Epoch: 210 Similar: 0.26
Epoch: 220 Similar: 0.26
Epoch: 230 Similar: 0.27
Epoch: 240 Similar: 0.27
Epoch: 250 Similar: 0.28
Epoch: 260 Similar: 0.28
Epoch: 270 Similar: 0.28
Epoch: 280 Similar: 0.28
Epoch: 290 Similar: 0.29
Epoch: 300 Similar: 0.30
Epoch: 310 Similar: 0.29
Epoch: 320 Similar: 0.30
Epoch: 330 Similar: 0.30
Epoch: 340 Similar: 0.30
Epoch: 350 Similar: 0.30
Epoch: 360 Similar: 0.30
Epoch: 370 Similar: 0.31
Epoch: 380 Similar: 0.31
Epoch: 390 Similar: 0.31
Epoch: 400 Simil

<embedding.Doc2Vec.GensimDoc2Vec at 0x1a1bc554e0>

In [9]:
# Apply the fitted Doc2Vec model to every split, in train/validate/test order.
# NOTE(review): presumably this converts each dataset in place, in which case
# the cell is not safe to re-run on already-converted data -- confirm.
for dataset in (dataset_train, dataset_validate, dataset_test):
    dataset.change_to_Doc2Vec(doc2vec)

In [15]:
# Configure the classifier on the three embedded splits.
model = AssemblePredicted(
    "wipo_d",
    dataset_train,
    dataset_validate,
    dataset_test,
    iteration=20000,
    batch_size=3000,
    hidden_size=[300, 300, 300, 300],
    target_hidden_size=[30, 30, 30],
    use_dropout=False,
)

In [16]:
# Train the model. The recorded log below reports, per hierarchy level, the
# loss and the validation macro-F1.
# NOTE(review): the "Stopping F1 macro" lines look like early stopping --
# confirm in AssemblePredicted.
model.train()

Level: 1.000 Epoch: 666/20000 Batch: 1/1 Loss: 0.064 Training Loss: 0.064 Validate F1 macro: 0.666
Level: 1.000 Epoch: 1166/20000 Batch: 1/1 Loss: 0.017 Training Loss: 0.017 Stopping F1 macro: 0.639

Level: 2.000 Epoch: 666/20000 Batch: 1/1 Loss: 0.037 Training Loss: 0.037 Validate F1 macro: 0.504
Level: 2.000 Epoch: 1166/20000 Batch: 1/1 Loss: 0.008 Training Loss: 0.008 Stopping F1 macro: 0.484

Level: 3.000 Epoch: 666/20000 Batch: 1/1 Loss: 0.196  Training Loss: 0.196 Validate F1 macro: 0.068
Level: 3.000 Epoch: 1332/20000 Batch: 1/1 Loss: 0.026 Training Loss: 0.026 Validate F1 macro: 0.089
Level: 3.000 Epoch: 1998/20000 Batch: 1/1 Loss: 0.008 Training Loss: 0.008 Validate F1 macro: 0.091
Level: 3.000 Epoch: 2072/20000 Batch: 1/1 Loss: 0.007 Training Loss: 0.007 Stopping F1 macro: 0.084

Level: 4.000 Epoch: 666/20000 Batch: 1/1 Loss: 1.069  Training Loss: 1.069 Validate F1 macro: 0.010
Level: 4.000 Epoch: 1332/20000 Batch: 1/1 Loss: 0.198 Training Loss: 0.198 Validate F1 macro: 0.014

In [17]:
# NOTE(review): presumably tunes the decision threshold(s) before the final
# evaluation -- confirm which split it uses in AssemblePredicted.
model.tuning_threshold()

In [18]:
# Evaluate on the "test" split. The result unpacks into overall macro-F1,
# overall micro-F1, and `f1_each`, which is iterated below as
# (macro, micro) pairs, one per level.
f1_macro, f1_micro, f1_each = model.evaluate("test")

In [19]:
# Overall test-set scores, formatted to four decimals.
print("F1 macro: %.4f F1 micro: %.4f" % (f1_macro, f1_micro))

F1 macro: 0.0622 F1 micro: 0.3379


In [20]:
# Report macro/micro F1 for each hierarchy level.
# The loop variable is deliberately not called `level`: that name already
# holds a structure returned by hie.reindex_hierarchy in an earlier cell and
# would be silently overwritten with an int here.
# Note the index printed here is 0-based, whereas the training log labels
# levels starting from 1.
for level_index, (macro, micro) in enumerate(f1_each):
    print("Level: %d F1 macro: %.4f F1 micro: %.4f" % (level_index, macro, micro))

Level: 0 F1 macro: 0.5325 F1 micro: 0.6727
Level: 1 F1 macro: 0.4185 F1 micro: 0.4854
Level: 2 F1 macro: 0.1016 F1 micro: 0.2317
Level: 3 F1 macro: 0.0363 F1 micro: 0.0937
