# Real Flow 

In [1]:
from data import Dataset
from embedding import Doc2Vec, NoTag_Doc2Vec, OnlyLeafDoc2Vec
from assemble_classifier import AssemblePredicted, AssemblePredictedHidden, AssemblePredictedHiddenAdd
import shutil
import os
import numpy as np
import json

In [2]:
# Load the run configuration. The file is opened in a `with` block so the
# handle is closed as soon as parsing finishes (or fails), rather than being
# left open until the garbage collector happens to reclaim it.
with open('config.json') as config_file:
    config = json.load(config_file)

# Unpack the settings into notebook-level names used by the later cells.
data_name = config['data_name']
train_file_name = config['train_file_name']
test_file_name = config['test_file_name']
classification_type = config['classification_type']
test_split = config['test_split']
predict_test = config['predict_test']
evaluate_test = config['evaluate_test']
correction = config['correction']
# May be overridden below for "multi-class" / "multi-label" runs.
mandatory_leaf = config['mandatory_leaf']
# Either of these may be the string 'auto', in which case a later cell
# derives the sizes from dataset_train.level.
hidden = config['hidden']
target_hidden = config['target_hidden']
embedding_size = config['embedding_size']
# Hard-coded rather than read from the config. The embedding cell only
# special-cases "LOD" and "Normal"; any other value (including 'OPD')
# selects the default Doc2Vec class.
embedding_type = 'OPD'

In [3]:
# Progress banner: marks the start of the preprocessing stage in the output.
print("---------------Preprocessing-----------------")

---------------Preprocessing-----------------


In [4]:
# For the two flat classification types the configured mandatory_leaf value
# is overridden: forced on for "multi-class", forced off for "multi-label".
# Any other classification_type keeps the value read from the config.
if classification_type in ("multi-class", "multi-label"):
    mandatory_leaf = classification_type == "multi-class"

In [5]:
# Make sure the per-dataset export directory exists; result.txt and
# model_detail.json are written into it by later cells.
export_dir = 'export/%s' % data_name
if not os.path.isdir(export_dir):
    os.makedirs(export_dir)

In [6]:
# Remove the dataset's 'output' and 'store' directories if they exist, so
# this run starts without artefacts from a previous one.
# NOTE: this deletes the directory trees recursively and cannot be undone.
for stale_template in ('data/%s/output', 'data/%s/store'):
    stale_dir = stale_template % data_name
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)

In [7]:
def _make_dataset(mode, file_name):
    """Build a Dataset for `mode` from `file_name` using the shared run settings."""
    return Dataset(data_name, mode, test_split=test_split, classification_type=classification_type, data_file_name=file_name)


# The train and validate sets are both built from the training file.
dataset_train = _make_dataset("train", train_file_name)
dataset_validate = _make_dataset("validate", train_file_name)

if not (test_split or predict_test or evaluate_test):
    # No test set requested: a placeholder string stands in for the Dataset
    # that is still passed positionally to the classifier constructor.
    dataset_test = "temp"
else:
    dataset_test = _make_dataset("test", test_file_name)

In [8]:
# Progress banner: marks the start of document-embedding training in the output.
print("---------------Training document embedding-----------------")

---------------Training document embedding-----------------


In [9]:
# Map the embedding_type setting onto an embedding class. Values without an
# entry here fall back to the plain Doc2Vec implementation.
embedding_variants = {
    "LOD": OnlyLeafDoc2Vec,
    "Normal": NoTag_Doc2Vec,
}
embedding_class = embedding_variants.get(embedding_type, Doc2Vec)

# All variants are constructed with the same hyper-parameters.
doc2vec = embedding_class(data_name, dataset_train.number_of_classes(), size=embedding_size, epoch=270, batch_size=10000)

# Alternative to training: restore a previously exported model.
# doc2vec.load_model('export/%s/doc2vec.model' % data_name)

# Kept as the last expression of the cell so the value returned by fit()
# is displayed as the cell output.
doc2vec.fit(dataset_train.datas, dataset_train.labels, dataset_validate.datas, dataset_validate.labels, early_stopping=False)

Doc2Vec by Gensim
Epoch: 5
Epoch: 10
Epoch: 15
Epoch: 20
Epoch: 25
Epoch: 30
Epoch: 35
Epoch: 40
Epoch: 45
Epoch: 50
Epoch: 55
Epoch: 60
Epoch: 65
Epoch: 70
Epoch: 75
Epoch: 80
Epoch: 85
Epoch: 90
Epoch: 95
Epoch: 100
Epoch: 105
Epoch: 110
Epoch: 115
Epoch: 120
Epoch: 125
Epoch: 130
Epoch: 135
Epoch: 140
Epoch: 145
Epoch: 150
Epoch: 155
Epoch: 160
Epoch: 165
Epoch: 170
Epoch: 175
Epoch: 180
Epoch: 185
Epoch: 190
Epoch: 195
Epoch: 200
Epoch: 205
Epoch: 210
Epoch: 215
Epoch: 220
Epoch: 225
Epoch: 230
Epoch: 235
Epoch: 240
Epoch: 245
Epoch: 250
Epoch: 255
Epoch: 260
Epoch: 265
Epoch: 270


(61.71719463794517, 17.734233618410826, 0.2873467227803547)

In [10]:
# Hand the trained embedding model to every dataset that is in use. The test
# set is only included when a real test Dataset was built earlier (otherwise
# dataset_test is just a placeholder string).
datasets_to_embed = [dataset_train, dataset_validate]
if test_split or predict_test or evaluate_test:
    datasets_to_embed.append(dataset_test)
for dataset_to_embed in datasets_to_embed:
    dataset_to_embed.change_to_Doc2Vec(doc2vec)

In [11]:
# Resolve 'auto' layer sizes. Each size is derived from the difference
# between consecutive entries of dataset_train.level.
# NOTE(review): level presumably holds cumulative class offsets per hierarchy
# level, making each difference the class count of a level; confirm in Dataset.
if 'auto' in (hidden, target_hidden):
    level_bounds = dataset_train.level
    level_widths = np.array([
        level_bounds[idx + 1] - level_bounds[idx]
        for idx in range(len(level_bounds) - 1)
    ])

    if hidden == 'auto':
        # Twice the width plus 300, capped at 3000.
        hidden = np.minimum(level_widths * 2 + 300, 3000).tolist()
    if target_hidden == 'auto':
        # Same scheme with the last width dropped: twice the width plus 30,
        # capped at 100.
        target_hidden = np.minimum(level_widths[:-1] * 2 + 30, 100).tolist()



In [12]:
# Progress banner: marks the start of classifier training in the output.
print("---------------Training classifiers-----------------")

---------------Training classifiers-----------------


In [None]:
# Build the classifier over the three datasets. dataset_test may be the
# placeholder string "temp" when no test set was requested. hidden and
# target_hidden are either the configured values or the lists derived
# from the 'auto' setting.
model = AssemblePredicted(
    data_name,
    dataset_train,
    dataset_validate,
    dataset_test,
    iteration=2000,
    stopping_time=300,
    batch_size=65536,
    hidden_size=hidden,
    target_hidden_size=target_hidden,
    use_dropout=True,
    start_level=0,
)

In [None]:
# Train the classifier. The log lines emitted below report, per level, the
# epoch, loss, and macro F1 on the training and validation sets.
model.train()

Level: 1.000 Epoch: 66/2000 Batch: 1/1 Loss: 0.191 Training F1 macro: 0.979 Validate F1 macro: 0.643
Level: 1.000 Epoch: 132/2000 Batch: 1/1 Loss: 0.185 Training F1 macro: 0.981 Validate F1 macro: 0.642
Level: 1.000 Epoch: 198/2000 Batch: 1/1 Loss: 0.164 Training F1 macro: 0.985 Validate F1 macro: 0.642
Level: 1.000 Epoch: 264/2000 Batch: 1/1 Loss: 0.154 Training F1 macro: 0.987 Validate F1 macro: 0.642
Level: 1.000 Epoch: 330/2000 Batch: 1/1 Loss: 0.138 Training F1 macro: 0.989 Validate F1 macro: 0.642
Level: 1.000 Epoch: 366/2000 Batch: 1/1 Loss: 0.128 Stopping F1 macro: 0.991 Validate F1 macro: 0.656

Level: 2.000 Epoch: 66/2000 Batch: 1/1 Loss: 0.374 Training F1 macro: 0.971 Validate F1 macro: 0.491
Level: 2.000 Epoch: 132/2000 Batch: 1/1 Loss: 0.331 Training F1 macro: 0.975 Validate F1 macro: 0.492
Level: 2.000 Epoch: 198/2000 Batch: 1/1 Loss: 0.327 Training F1 macro: 0.981 Validate F1 macro: 0.500
Level: 2.000 Epoch: 264/2000 Batch: 1/1 Loss: 0.308 Training F1 macro: 0.982 Valida

In [None]:
# Tune the decision threshold on the trained model. The result is stored in
# the exported config under the 'threshold' key by the final cell.
threshold = model.tuning_threshold()

In [None]:
# Open (and truncate) the result report for this dataset. The handle stays
# open across the following cells and is only closed by the f.close() cell.
# NOTE(review): if the evaluation cell raises, this handle is never closed;
# consider opening the file in a `with` block inside the evaluation cell.
f = open('export/%s/result.txt' % data_name, 'w')

In [None]:
# Progress banner: marks the start of the evaluation stage in the output.
print("---------------Evaluation-----------------")

In [None]:
# Dataset splits to export/evaluate. 'test' is included only when one of the
# test-related settings is truthy, i.e. when a real test Dataset was built.
list_of_mode = ['train', 'validate'] + (
    ['test'] if (test_split or predict_test or evaluate_test) else []
)

In [None]:
# Export predictions and report F1 scores for every split. Each report line
# is written to the result file and echoed to the notebook output.
for mode in list_of_mode:
    is_test_mode = mode == 'test'

    # Train/validate are always exported; the test split only on request.
    if not is_test_mode or predict_test:
        model.export_result(mode, correction=correction, mandatory_leaf=mandatory_leaf)

    # Likewise, the test split is only evaluated when evaluate_test is set.
    if is_test_mode and not evaluate_test:
        continue

    f1_macro, f1_micro, f1_each = model.evaluate(mode, correction=correction, mandatory_leaf=mandatory_leaf)

    f.write("--------------------------- %s -------------------------------\n" % mode)
    print("--------------------------- %s -------------------------------" % mode)
    f.write("F1 macro: %.4f F1 micro: %.4f\n" % (f1_macro, f1_micro))
    print("F1 macro: %.4f F1 micro: %.4f" % (f1_macro, f1_micro))

    # The per-level breakdown (and its trailing blank line) is only emitted
    # for hierarchical runs.
    if classification_type != 'hierarchical':
        continue
    for level, (macro, micro) in enumerate(f1_each):
        f.write("Level: %d F1 macro: %.4f F1 micro: %.4f\n" % (level, macro, micro))
        print("Level: %d F1 macro: %.4f F1 micro: %.4f" % (level, macro, micro))
    f.write('\n')
    print('')

In [None]:
# Close the result report opened before the evaluation stage, flushing the
# written scores to export/<data_name>/result.txt.
f.close()

In [None]:
# Record the resolved layer sizes and the tuned threshold in the config, then
# save the whole config next to the other exported artefacts.
config.update([
    ('hidden', hidden),
    ('target_hidden', target_hidden),
    ('threshold', threshold),
])
detail_path = 'export/%s/model_detail.json' % data_name
with open(detail_path, 'w') as detail_file:
    json.dump(config, detail_file)