In [1]:
import fasttext
import os
import json
from utils import fixText

# Scratch directory where readDataSet writes the per-level fastText training files.
TEMP_DIR = './tmp/'

def getConfig():
    """Load the experiment configuration from ``./config.json``.

    Returns:
        The decoded JSON document. The rest of the notebook indexes it as a
        dict with the keys ``data_path``, ``data_set_name``, ``lr``,
        ``hidden_dim`` and ``epoch``.

    Raises:
        OSError: if ``./config.json`` cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default text encoding.
    with open('./config.json', 'r', encoding='utf-8') as f:
        return json.load(f)

# Read the dataset JSON file and convert it to the "__label__<tag> <text>"
# line format consumed by the fastText API, stored as txt files.
def readDataSet(path):
    """Write one fastText training file per tag level into ``TEMP_DIR``.

    The file at ``path`` must hold a JSON object whose values each carry a
    ``text`` field and string fields ``tag_level_1``, ``tag_level_2`` and
    ``tag_level_3``. For every level N the function writes
    ``TEMP_DIR + 'setN.txt'`` with one line per record of the form
    ``__label__<tag_level_N> <fixText(text)>``.

    Args:
        path: Path of the JSON dataset file.

    Raises:
        OSError: if the dataset cannot be read or the output cannot be written.
        json.JSONDecodeError: if the dataset is not valid JSON.
        KeyError: if a record lacks one of the expected fields.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data_set = json.load(f)

    levels = (1, 2, 3)
    lines_by_level = {level: [] for level in levels}
    for record in data_set.values():
        # Preprocess once per record and reuse the result for all three
        # levels (assumes fixText is deterministic -- TODO confirm in utils).
        text = fixText(record['text'])
        for level in levels:
            tag = record['tag_level_%d' % level]
            lines_by_level[level].append('__label__' + tag + ' ' + text)

    # Create the scratch directory if needed; an already existing directory
    # is fine, any other failure is reported instead of being swallowed.
    os.makedirs(TEMP_DIR, exist_ok=True)
    for level in levels:
        with open(TEMP_DIR + 'set%d.txt' % level, 'w', encoding='utf-8') as f:
            f.writelines(line + '\n' for line in lines_by_level[level])

In [2]:
# Build the per-level training files, then train one supervised fastText
# classifier per tag level with the hyper-parameters from config.json.
config = getConfig()
readDataSet(config['data_path'] + config['data_set_name'])
model_label1 = fasttext.train_supervised(
    # Level-1 training file written by readDataSet (the path previously
    # pointed at the bare directory, which fastText cannot open).
    input = TEMP_DIR + 'set1.txt',
    lr = config['lr'],
    dim = config['hidden_dim'],
    epoch = config['epoch']
)
model_label2 = fasttext.train_supervised(
    input = TEMP_DIR + 'set2.txt',
    lr = config['lr'],
    dim = config['hidden_dim'],
    epoch = config['epoch']
)
model_label3 = fasttext.train_supervised(
    input = TEMP_DIR + 'set3.txt',
    lr = config['lr'],
    dim = config['hidden_dim'],
    epoch = config['epoch']
)

# Best-effort cleanup of the scratch files: a failure to delete one file must
# not prevent the others from being removed, and only filesystem errors are
# ignored.
for set_name in ('set1.txt', 'set2.txt', 'set3.txt'):
    try:
        os.remove(TEMP_DIR + set_name)
    except OSError:
        pass
try:
    os.removedirs(TEMP_DIR)
except OSError:
    pass

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.676 seconds.
Prefix dict has been built successfully.
Read 16M words
Number of words:  131298
Number of labels: 86
Progress: 100.0% words/sec/thread:  560173 lr:  0.000000 avg.loss:  0.378137 ETA:   0h 0m 0s
Read 16M words
Number of words:  131298
Number of labels: 463
Progress: 100.0% words/sec/thread:  176298 lr:  0.000000 avg.loss:  1.296705 ETA:   0h 0m 0s
Read 16M words
Number of words:  131311
Number of labels: 1164
Progress:  67.5% words/sec

lr = 0.1, epoch = 50

In [3]:
# Top-1 accuracy of the three tag-level classifiers on the held-out test set.
with open(config['data_path'] + 'test_set.json', 'r', encoding='utf-8') as f:
    content = json.load(f)

total = 0
true_tag1 = 0
true_tag2 = 0
true_tag3 = 0
for sample in content.values():
    total += 1
    # Same preprocessing that readDataSet applies to the training text.
    text = fixText(sample['text'])
    tag1 = sample['tag_level_1']
    tag2 = sample['tag_level_2']
    tag3 = sample['tag_level_3']
    # predict() returns (labels, probabilities); [0][0] is the best label.
    # The trained models are bound to model_label1..3 in the training cell;
    # the model_tag* names used here before are never defined.
    predict1 = model_label1.predict(text)[0][0]
    predict2 = model_label2.predict(text)[0][0]
    predict3 = model_label3.predict(text)[0][0]
    if predict1.replace('__label__', '') == tag1:
        true_tag1 += 1
    if predict2.replace('__label__', '') == tag2:
        true_tag2 += 1
    if predict3.replace('__label__', '') == tag3:
        true_tag3 += 1
print('tag level 1 accurate: {}% ({}/{})'.format(true_tag1 * 100 / total, true_tag1, total))
print('tag level 2 accurate: {}% ({}/{})'.format(true_tag2 * 100 / total, true_tag2, total))
print('tag level 3 accurate: {}% ({}/{})'.format(true_tag3 * 100 / total, true_tag3, total))

tag level 1 accurate: 93.85076963214192% (35973/38330)
tag level 2 accurate: 74.5995303939473% (28594/38330)
tag level 3 accurate: 65.4056874510827% (25070/38330)


lr = 0.01 epoch = 20

In [3]:
# Score each tag-level classifier by top-1 accuracy on the test set.
with open(config['data_path'] + 'test_set.json', 'r', encoding='utf-8') as f:
    content = json.load(f)

total = 0
true_tag1 = 0
true_tag2 = 0
true_tag3 = 0
for sample in content.values():
    total += 1
    text = fixText(sample['text'])  # preprocess exactly like the training text
    tag1 = sample['tag_level_1']
    tag2 = sample['tag_level_2']
    tag3 = sample['tag_level_3']
    # Use the models produced by the training cell (model_label1..3); the
    # previous model_tag* names do not exist anywhere in this notebook.
    # predict(text)[0][0] is the highest-ranked "__label__<tag>" string.
    predict1 = model_label1.predict(text)[0][0]
    predict2 = model_label2.predict(text)[0][0]
    predict3 = model_label3.predict(text)[0][0]
    if predict1.replace('__label__', '') == tag1:
        true_tag1 += 1
    if predict2.replace('__label__', '') == tag2:
        true_tag2 += 1
    if predict3.replace('__label__', '') == tag3:
        true_tag3 += 1
print('tag level 1 accurate: {}% ({}/{})'.format(true_tag1 * 100 / total, true_tag1, total))
print('tag level 2 accurate: {}% ({}/{})'.format(true_tag2 * 100 / total, true_tag2, total))
print('tag level 3 accurate: {}% ({}/{})'.format(true_tag3 * 100 / total, true_tag3, total))

tag level 1 accurate: 94.004696060527% (36032/38330)
tag level 2 accurate: 63.04200365249152% (24164/38330)
tag level 3 accurate: 44.29689538220715% (16979/38330)


lr = 0.001 结果

In [4]:
# Measure top-1 accuracy per tag level on the held-out test set.
with open(config['data_path'] + 'test_set.json', 'r', encoding='utf-8') as f:
    content = json.load(f)

total = 0
true_tag1 = 0
true_tag2 = 0
true_tag3 = 0
for sample in content.values():
    total += 1
    text = fixText(sample['text'])  # same preprocessing as the training data
    tag1 = sample['tag_level_1']
    tag2 = sample['tag_level_2']
    tag3 = sample['tag_level_3']
    # predict(text)[0][0] is the highest-ranked "__label__<tag>" string.
    predict1 = model_label1.predict(text)[0][0]
    predict2 = model_label2.predict(text)[0][0]
    predict3 = model_label3.predict(text)[0][0]
    if predict1.replace('__label__', '') == tag1:
        true_tag1 += 1
    if predict2.replace('__label__', '') == tag2:
        true_tag2 += 1
    if predict3.replace('__label__', '') == tag3:
        true_tag3 += 1
# Every placeholder must be a plain "{}": "{ }" is parsed as a keyword field
# named " " and makes str.format raise KeyError.
print('tag level 1 accurate: {}% ({}/{})'.format(true_tag1 * 100 / total, true_tag1, total))
print('tag level 2 accurate: {}% ({}/{})'.format(true_tag2 * 100 / total, true_tag2, total))
print('tag level 3 accurate: {}% ({}/{})'.format(true_tag3 * 100 / total, true_tag3, total))

tag level 1 accurate: 93.85076963214192% (35973/38330)
tag level 2 accurate: 74.5995303939473% (28594/38330)
tag level 3 accurate: 65.4056874510827% (25070/38330)


lr = 0.01 epoch = 20

In [3]:
# Load the held-out samples (a JSON object keyed by sample id).
with open(config['data_path'] + 'test_set.json', 'r') as f:
    content = json.loads(f.read())

# Count, for every tag level, how often the top-ranked prediction matches.
total = true_tag1 = true_tag2 = true_tag3 = 0
for sample in content.values():
    total += 1
    text = fixText(sample['text'])
    # Strip the fastText label marker before comparing with the gold tag.
    best1 = model_label1.predict(text)[0][0].replace('__label__', '')
    best2 = model_label2.predict(text)[0][0].replace('__label__', '')
    best3 = model_label3.predict(text)[0][0].replace('__label__', '')
    if best1 == sample['tag_level_1']:
        true_tag1 += 1
    if best2 == sample['tag_level_2']:
        true_tag2 += 1
    if best3 == sample['tag_level_3']:
        true_tag3 += 1

print('tag level 1 accurate: {}% ({}/{})'.format(true_tag1 * 100 / total, true_tag1, total))
print('tag level 2 accurate: {}% ({}/{})'.format(true_tag2 * 100 / total, true_tag2, total))
print('tag level 3 accurate: {}% ({}/{})'.format(true_tag3 * 100 / total, true_tag3, total))

tag level 1 accurate: 94.004696060527% (36032/38330)
tag level 2 accurate: 63.04200365249152% (24164/38330)
tag level 3 accurate: 44.29689538220715% (16979/38330)


lr = 0.001 结果

In [7]:
# Read the test set: a JSON object mapping sample id -> record.
with open(config['data_path'] + 'test_set.json', 'r') as f:
    content = json.loads(f.read())

# Pair every classifier with the record field holding its gold tag.
evaluators = (
    (model_label1, 'tag_level_1'),
    (model_label2, 'tag_level_2'),
    (model_label3, 'tag_level_3'),
)
hits = [0, 0, 0]
total = 0
for sample in content.values():
    total += 1
    text = fixText(sample['text'])
    for slot, (model, tag_key) in enumerate(evaluators):
        # Top-ranked label with the fastText marker removed.
        if model.predict(text)[0][0].replace('__label__', '') == sample[tag_key]:
            hits[slot] += 1
true_tag1, true_tag2, true_tag3 = hits

print('tag level 1 accurate: {}% ({}/{})'.format(true_tag1 * 100 / total, true_tag1, total))
print('tag level 2 accurate: {}% ({}/{})'.format(true_tag2 * 100 / total, true_tag2, total))
print('tag level 3 accurate: {}% ({}/{})'.format(true_tag3 * 100 / total, true_tag3, total))

tag level 1 accurate: 93.98121575789199%(36023/38330)
tag level 2 accurate: 35.330028698147665%(13542/38330)
tag level 3 accurate: 25.927471954082964%(9938/38330)


标签数量

In [11]:
# Report how many distinct labels each tag-level model knows.
label_models = (
    ('tag 1 num: ', model_label1),
    ('tag 2 num: ', model_label2),
    ('tag 3 num: ', model_label3),
)
for caption, model in label_models:
    print(caption, len(model.labels))

87
451
1171
