In [7]:
# Manually (human-)extracted term data
!ls ../data/ASPEC-JE/gold_term_list/

dev.dic.full     dev.en_term      dev.ja_term      test.dic.full
dev.en           dev.ja           devtest.dic.full


In [3]:
# Automatically extracted term data
!ls ../work/ASPEC_JE/

dev_eng_extracted.txt        devtest_janome_extracted.txt
dev_janome_extracted.txt     test_eng_extracted.txt
devtest_eng_extracted.txt    test_janome_extracted.txt


In [54]:
# Preview the automatically extracted dev file: each line is a term, a tab, and a numeric value.
!head ../work/ASPEC_JE/dev_janome_extracted.txt

測定	1055.821954687437
可能性	664.1264722904236
性	657.6663287716651
研究	560.0285706997456
治療	534.505378831682
Ｉ	509.22293742524994
技術	460.0869483043395
Ａ	444.48622025885123
膜	441.23463146040564
方法	362.10219551944175


## 自動抽出されたterm setによる人手抽出term setのカバー率（人手抽出termのうち自動抽出term setに含まれる割合）

In [45]:
def get_term_set(term_path, data_type='gold', encoding='utf-8'):
    """Read a term file and return its distinct terms as a set.

    Args:
        term_path: Path of the text file to read.
        data_type: ``'gold'`` treats every tab-separated field on a line as a
            term. Any other value (the notebook passes ``'auto'``) keeps only
            the first tab-separated field and ignores the rest of the line.
        encoding: Text encoding used to read the file.

    Returns:
        A ``set`` of term strings. Blank lines and empty fields are skipped,
        so the empty string is never counted as a term.
    """
    term_set = set()
    # The context manager closes the handle even if reading raises.
    with open(term_path, encoding=encoding) as term_file:
        for line in term_file:
            line = line.strip()
            if not line:
                # A blank line carries no term in either format.
                continue
            if data_type == 'gold':
                term_set.update(term for term in line.split('\t') if term)
            else:  # 'auto'
                # Only the leading field is the term; a line with a missing
                # or extra trailing column no longer aborts the whole read.
                term_set.add(line.split('\t', 1)[0])
    return term_set

In [53]:
def calc_term_num(data_type, lang):
    """Print term counts for one split/language and return the shared terms.

    Loads the human-extracted term list and the automatically extracted term
    list for the given split and language, prints the size of each set and
    how many human terms also appear in the automatic set, and returns that
    intersection.

    Args:
        data_type: Split name used in both file names (the notebook calls
            this with 'dev', 'devtest' and 'test').
        lang: Language code. It selects the gold file suffix and, through the
            ``parser`` table, the extractor name in the automatic file name.

    Returns:
        The set of terms present in both the human and the automatic set.

    Raises:
        KeyError: If ``lang`` has no entry in the ``parser`` table.
    """
    parser = {'ja': 'janome', 'en': 'eng', 'zh': 'nlpir'}
    human_path = f'../data/ASPEC-JE/gold_term_list/{data_type}.{lang}_term'
    auto_path = f'../work/ASPEC_JE/{data_type}_{parser[lang]}_extracted.txt'
    human_term_set = get_term_set(human_path, 'gold')
    auto_term_set = get_term_set(auto_path, 'auto')

    covered_term_set = human_term_set & auto_term_set
    # An empty gold list would otherwise raise ZeroDivisionError; report 0%.
    if human_term_set:
        coverage = len(covered_term_set)/len(human_term_set)*100
    else:
        coverage = 0.0

    print(f'{data_type}-{lang}')
    print(f'human: {len(human_term_set)}\nauto: {len(auto_term_set)}')
    print(f'covered: {len(covered_term_set)} ({coverage: .1f}%)')
    print()
    return covered_term_set
    
# Report every split, Japanese first and then English within each split.
for _split in ('dev', 'devtest', 'test'):
    for _lang in ('ja', 'en'):
        calc_term_num(_split, _lang)
# One extra blank line after the last report.
print()

dev-ja
human: 4444
auto: 6669
covered: 3070 ( 69.1%)

dev-en
human: 4480
auto: 7716
covered: 2399 ( 53.5%)

devtest-ja
human: 5046
auto: 6780
covered: 3576 ( 70.9%)

devtest-en
human: 5032
auto: 7739
covered: 2732 ( 54.3%)

test-ja
human: 5055
auto: 6696
covered: 3583 ( 70.9%)

test-en
human: 5116
auto: 7771
covered: 2712 ( 53.0%)




In [59]:
# Export the dev/ja terms found by both the human annotation and the automatic
# extractor, ordered by the extractor's numeric value (largest first).
covered_term_set = calc_term_num('dev', 'ja')

# Map each automatically extracted term to the value in its second column.
# The value stays a string so it is written back exactly as it was read.
auto_dic = {}
with open('../work/ASPEC_JE/dev_janome_extracted.txt', encoding='utf-8') as f_r:
    for line in f_r:
        fields = line.strip().split('\t')
        if len(fields) < 2:
            # Blank or value-less line: nothing to look up later.
            continue
        auto_dic[fields[0]] = fields[1]

covered_term_list = [(term, auto_dic[term]) for term in covered_term_set]
# Sets iterate in arbitrary order, so break ties on the term itself to make
# the output file reproducible between runs.
covered_term_list.sort(key=lambda x: (-float(x[1]), x[0]))

with open('../work/ASPEC_JE/dev_covered_terms.txt', 'w', encoding='utf-8') as f_w:
    for term, score in covered_term_list:
        f_w.write(term + '\t' + score + '\n')

dev-ja
human: 4444
auto: 6669
covered: 3070 ( 69.1%)



- en/ja両側で同じ表記のものや、アルファベットだけのものも含まれているようなので、この辺りはクリーニング対象
    - langidかけたら解決しないかな？

In [1]:
# Preview the first lines of the dev Japanese term list (one term per line).
!head ../work/ASPEC_JC/dev.ja_term_list.txt

環境
システム
評価
研究
情報
可能性
データ
実験
処理
影響


In [6]:
import re
# Split and language of the term list to clean; both are substituted into the
# file names below.
# NOTE(review): only 'test'/'zh' is set here, while the preview cell above
# shows dev.ja and the coverage cell further down reads cleaned_*.ja lists
# from ../work/ASPEC_JE (not ASPEC_JC) — presumably this cell was re-run by
# hand with other values and paths; confirm.
dataset = 'test'
lang = 'zh'

# Raw term list to read, and the cleaned copy written next to it.
input_path = f'../work/ASPEC_JC/{dataset}.{lang}_term_list.txt'
output_path = f'../work/ASPEC_JC/cleaned_{dataset}.{lang}_term_list.txt'

# Compiled once instead of on every call.
# Terms made up only of ASCII/full-width alphanumerics and the listed symbols.
_SYMBOLIC_TERM_PATTERN = re.compile('[a-zA-Zａ-ｚＡ-Ｚ0-9０-９αθσ℃，・°（）＠．／＋＞＜：＝＊]+')
# Exactly one non-whitespace character (raw string: '\S' is a regex escape,
# not a valid Python string escape).
_SINGLE_CHAR_PATTERN = re.compile(r'\S')
_FILTER_PATTERNS = (_SYMBOLIC_TERM_PATTERN, _SINGLE_CHAR_PATTERN)


def is_filtered(term):
    """Return True when ``term`` should be dropped from a term list.

    A term is dropped when the whole string matches one of the filter
    patterns: it consists solely of alphanumerics/symbols from the character
    class above, or it is a single non-whitespace character.

    Args:
        term: The term string to check.

    Returns:
        ``True`` if any pattern matches the entire term, otherwise ``False``
        (including for the empty string).
    """
    return any(pattern.fullmatch(term) for pattern in _FILTER_PATTERNS)

# Copy every term that survives is_filtered() from input_path to output_path,
# one term per line. The input is opened first so that a missing input file
# does not leave behind a truncated output, and both handles are closed on
# exit from the with-block.
with open(input_path, encoding='utf-8') as f_r, \
        open(output_path, 'w', encoding='utf-8') as f_w:
    for line in f_r:
        term = line.strip()
        # Blank lines carry no term; do not copy them through as empty terms.
        if not term or is_filtered(term):
            continue
        f_w.write(term + '\n')

- 一応clean後のデータでカバー率を数えてみる

In [70]:
def calc_term_num(data_type, lang):
    """Print term counts against the cleaned term list and return the shared terms.

    Same report as the earlier ``calc_term_num`` cell (which this definition
    replaces once the cell is run), but the automatic side is read from the
    cleaned one-term-per-line list instead of the raw extractor output.

    Args:
        data_type: Split name used in both file names (the notebook calls
            this with 'dev', 'devtest' and 'test').
        lang: Language code used in both file names. The cleaned list path
            previously hard-coded ``ja``, so any other language was compared
            against the Japanese list; it now follows ``lang``.

    Returns:
        The set of terms present in both the human and the cleaned automatic
        term set.
    """
    human_path = f'../data/ASPEC-JE/gold_term_list/{data_type}.{lang}_term'
    auto_path = f'../work/ASPEC_JE/cleaned_{data_type}.{lang}_term_list.txt'
    human_term_set = get_term_set(human_path, 'gold')
    # The cleaned list has one term per line and no value column, so the
    # 'gold' reader (every field is a term) is the one that fits.
    auto_term_set = get_term_set(auto_path, 'gold')

    covered_term_set = human_term_set & auto_term_set
    # An empty gold list would otherwise raise ZeroDivisionError; report 0%.
    if human_term_set:
        coverage = len(covered_term_set)/len(human_term_set)*100
    else:
        coverage = 0.0

    print(f'{data_type}-{lang}')
    print(f'human: {len(human_term_set)}\nauto: {len(auto_term_set)}')
    print(f'covered: {len(covered_term_set)} ({coverage: .1f}%)')
    print()
    return covered_term_set

In [71]:
# Report Japanese coverage for each split in turn.
for _split in ('dev', 'devtest', 'test'):
    calc_term_num(_split, 'ja')
# One extra blank line after the last report.
print()

dev-ja
human: 4444
auto: 6053
covered: 2926 ( 65.8%)

devtest-ja
human: 5046
auto: 6232
covered: 3430 ( 68.0%)

test-ja
human: 5055
auto: 6089
covered: 3424 ( 67.7%)


