# MEに基づくモデルを用いた日本語係り受け解析

In [124]:
from typing import NamedTuple
import re

class JumanMorph(NamedTuple):
    # 見出し
    surface: str
    # 読み
    yomi: str
    # 基本形
    base: str
    # 品詞（動詞、形容詞、...）: 内本+ 1999 の「品詞(Major)」
    pos_maj: str
    # 品詞細分類（普通名詞、サ変名詞、...）
    pos_min: str
    # 活用（母音動詞、子音動詞カ行、...）: 内本+ 1999 の「活用(Major)」
    infl_type: str
    # 活用形（語幹、基本形、未然形、...）: 内本+ 1999 の「活用(Minor)」
    infl_form: str


# attribute:
#  head:     係り先の文節番号。最後の文節の head は -1。
#  dep_type: 係り関係のタイプ（D, P, A, I）
#  morphs:   形態素（JumanMorph）のリスト
class JumanChunk:
    def __init__(self, head, dep_type, morphs):
        self.head = head
        self.dep_type = dep_type
        self.morphs = morphs

    def __repr__(self):
        return f"JumanChunk(head={self.head}, dep_type=\"{self.dep_type}\", morphs={self.morphs})"
        


# input:
#   file ... file stream
# output:
#   読み込み成功: JumanChunk のリスト
#   ファイル終端: None
def read_kc_sentence(file):
    head = None
    dep_type = None
    morphs = []
    sentence = []
    for line in file:
        line = line.rstrip()

        if line == "EOS":
            sentence.append(JumanChunk(head, dep_type, morphs))
            return sentence
        elif line == "":
            continue
        elif line[0] == "#":
            continue
        elif line[0] == "*":
            if head:
                sentence.append(JumanChunk(head, dep_type, morphs))
                head = None
                dep_type = None
                morphs = []

            m = re.match(r'\*\s+(\d+)\s+(-?\d+)([DPAI])', line)
            if m:
                chunk_id = int(m.group(1))
                head = int(m.group(2))
                dep_type = m.group(3)
                if chunk_id != len(sentence):
                    raise RuntimeError(f"wrong chunk number: {line}")
            else:
                raise RuntimeError(f"data format error: {line}")
        else:
            fields = line.split()
            if len(fields) != 7:
                raise RuntimeError(f"bad morph line: {line}")
            morphs.append(JumanMorph(*fields))

    # EOS なしでファイル終了
    if sentence:
        return sentence
    else:
        return None


# 京大コーパス形式で文字列化
# input:
#   s ... JumanChunk のリストで表した文
def dump_kc_sentence(s):
    lines = []
    for i, chunk in enumerate(s):
        lines.append(f"* {i} {chunk.head}{chunk.dep_type}")
        for morph in chunk.morphs:
            lines.append(" ".join([morph.surface, morph.yomi, morph.base, morph.pos_maj, morph.pos_min, morph.infl_type, morph.infl_form]))

    lines.append("EOS")

    return "\n".join(lines)


# 京大コーパス形式のファイルの読み込み
def read_kyoto_corpus_file(file):
    sentences = []
    s = read_kc_sentence(file)
    while s:
        sentences.append(s)
        s = read_kc_sentence(file)
    return sentences


# 訓練/開発/テスト分割
# 内本+ 1999 の分割
uchimoto1999 = {
    "train": [
        "950101.KNP",
        "950103.KNP",
        "950104.KNP",
        "950105.KNP",
        "950106.KNP",
        "950107.KNP",
        "950108.KNP"
    ],
    "dev": [
        "950110.KNP"
    ],
    "test": [
        "950109.KNP"
    ]
}


# 京大コーパスの読み込み
def load_kyoto_corpus(base_dir="/home/corpus/KyotoCorpus4.0/dat/syn", split=uchimoto1999):
    corpus = {}
    for section in ["train", "dev", "test"]:
        corpus[section] = []
        for fn in split[section]:
            with open(base_dir + "/" + fn) as f:
                corpus[section].extend(read_kyoto_corpus_file(f))

    return corpus["train"], corpus["dev"], corpus["test"]


In [188]:
train, dev, test = load_kyoto_corpus()

In [241]:
len(train)

7635

In [254]:
def has_loop(s):
    for i in range(len(s)):
        if s[i].head == i:
            return True
    return False

In [255]:
train_sub = [s for s in train if not has_loop(s)]

In [256]:
len(train_sub)

7632

In [259]:
train[992] in train_sub

False

In [244]:
train[992]

[JumanChunk(head=3, dep_type="D", morphs=[JumanMorph(surface='「', yomi='「', base='*', pos_maj='特殊', pos_min='括弧始', infl_type='*', infl_form='*'), JumanMorph(surface='いや', yomi='いや', base='*', pos_maj='感動詞', pos_min='*', infl_type='*', infl_form='*'), JumanMorph(surface='、', yomi='、', base='*', pos_maj='特殊', pos_min='読点', infl_type='*', infl_form='*')]),
 JumanChunk(head=2, dep_type="D", morphs=[JumanMorph(surface='そんな', yomi='そんな', base='*', pos_maj='指示詞', pos_min='連体詞形態指示詞', infl_type='*', infl_form='*')]),
 JumanChunk(head=3, dep_type="D", morphs=[JumanMorph(surface='こと', yomi='こと', base='*', pos_maj='名詞', pos_min='形式名詞', infl_type='*', infl_form='*'), JumanMorph(surface='は', yomi='は', base='*', pos_maj='助詞', pos_min='副助詞', infl_type='*', infl_form='*')]),
 JumanChunk(head=7, dep_type="D", morphs=[JumanMorph(surface='あり', yomi='あり', base='ある', pos_maj='動詞', pos_min='*', infl_type='子音動詞ラ行', infl_form='基本連用形'), JumanMorph(surface='ませ', yomi='ませ', base='ます', pos_maj='接尾辞', pos_min='動詞性接

In [76]:
train[0][0].morphs[-1][3]

'助詞'

In [77]:
train[0][26].morphs

[JumanMorph(surface='示した', yomi='しめした', base='示す', pos_maj='動詞', pos_min='*', infl_type='子音動詞サ行', infl_form='タ形'),
 JumanMorph(surface='。', yomi='。', base='*', pos_maj='特殊', pos_min='句点', infl_type='*', infl_form='*')]

In [79]:
train[0][0].morphs[-1][0] # 1文節目各形態素

'は'

In [114]:
def find_possible_heads(sentence, i): # i番目の文節の係り先候補番号を全て見つける
    """find all index of possible modified
    
    arg:
        sentence: a sentence
        i: index of modefier

    return:
        idx(list): index list of possible modifiee
    """
    
    idx = []
    
    if i == len(sentence)-1:
        return idx

    else:
        idx.append(i+1)
        head_next = sentence[i+1].head
        while head_next != -1:
            #print(head_next)
            idx.append(head_next)
            head_next = sentence[head_next].head
        return idx

In [143]:
find_possible_heads(train_sub[992], 3)

[4, 5, 10]

In [84]:
def all_head_word_pair(sentence):
    """Find clause pair of head word in positive data
    
    arg:
        sentence: train
    return:
        clause_pair(dict): {(clause pair):count, ...}
        
    """
    
    clause_pair = {}
    
    # check all sentence in train
    for s in sentence:
        # check clause in sentence
        for i in range(len(s)-1):
            h = s[i].head
            # check morph in i_th clause
            for j in range(len(s[i].morphs)):
                if s[i].morphs[j].pos_maj != "特殊":
                    # check morph in h_th clause
                    for k in range(len(s[h].morphs)):
                        if s[h].morphs[k].pos_maj != "特殊":
                            pair = (s[i].morphs[j].surface, s[h].morphs[k].surface)
                            if pair not in clause_pair:
                                clause_pair[pair] = 1
                            else:
                                clause_pair[pair] += 1
    return clause_pair

In [85]:
clause_pair = all_head_word_pair(train)
next(iter(clause_pair))

('村山', '示した')

In [86]:
def collect_head_word(sentence):
    """Find head word pair, appearing more three times
    arg:
        sentence: train
        
    return:
        head_word_set(set): head word pair appearing more three times
    
    """
    
    all_head_word = all_head_word_pair(sentence)
    head_word_set = set()
    
    for k, v in all_head_word.items():
        if v >= 3:
            for x in k:
                head_word_set.add(x)
    return set(head_word_set)

In [87]:
collect_head_word(train)

{'２',
 '固める',
 '理解',
 '死亡',
 '総額',
 '遺体',
 '劇場',
 '運動',
 '決定',
 'ほしい',
 '国',
 '博士',
 '本人',
 '左京',
 '控えた',
 '占めて',
 '制度',
 '調書',
 'かかって',
 '構想',
 'つけ',
 '支え',
 'せる',
 '強く',
 '関根',
 '連続',
 '決まった',
 '固まった',
 '我々',
 'つながり',
 '意欲',
 '住宅',
 '民',
 '治療',
 '悪い',
 '言い',
 '得点',
 '着陸',
 'カール',
 '貝取',
 '時間',
 '座',
 '臨み',
 '現金',
 'だから',
 '著作',
 'のである',
 '生んだ',
 'サラエボ',
 '解消',
 '持田',
 '入院',
 'ながら',
 '五十五',
 '契約',
 '残る',
 '明るみ',
 '春',
 '創案',
 '聞いた',
 '１',
 '視点',
 '描いた',
 'られ',
 '良い',
 '担保',
 'どんな',
 'そう',
 '断念',
 '設置',
 '取材',
 '要因',
 'ソマリア',
 '運ば',
 '攻撃',
 '執行',
 '手',
 '新たな',
 '支配',
 '告発',
 'ちょうど',
 'メンバー',
 '視',
 'のです',
 '九一',
 '開けて',
 'レイプ',
 '策',
 '運航',
 '分かった',
 '届く',
 '各国',
 '空爆',
 '村山',
 '年',
 '直面',
 'いえば',
 '知り合った',
 '採択',
 '期間',
 'アメリカ',
 '限ら',
 '距離',
 '分かって',
 'ＣＭ',
 '勝て',
 '元日',
 '署名',
 '確定',
 '心掛け',
 '静岡',
 'ダブルス',
 '施設',
 '午後',
 '有力',
 '歩',
 '物質',
 'すぎる',
 '司令',
 '景気',
 '証言',
 '江',
 '通常',
 '気',
 '競う',
 '会長',
 '震源',
 '結束',
 '達成',
 '納得',
 'いつか',
 'す',
 '話し',
 '取り組む',
 '掛けて',
 '四',
 'ず',
 '承認

In [88]:
len(collect_head_word(train))

3392

In [89]:
def phrase_form(sentence, i):
    """extract phrase form from i_th clause
    
    arg:
        sentence: a sentence
        i: index of modifier
        
    return:
        string: pre phrase form(surface)
    """
    
    for j in range(1, len(sentence[i].morphs)+1):
        if sentence[i].morphs[-j].pos_maj != "特殊":
            return sentence[i].morphs[-j].surface
    return sentence[i].morphs[-1].surface

In [90]:
phrase_form(train[0], 6)

'し'

In [105]:
def extract_features(sentence, i, pair_set):
    """extract features as string list

    arg:
        sentence: a sentence
        i: index of modefier
        pair(set): collect_head_word()
        

    return:
        features(list):
    
    """
    
    features = []
    heads_list = find_possible_heads(sentence, i)
    modifier = phrase_form(sentence, i)
    
    for h in heads_list:
        for k in range(len(sentence[h].morphs)):
            morphs = sentence[h].morphs[k]
            if morphs.pos_maj != "特殊":
                modified = morphs.surface
                
                if modifier and modified in pair_set:
                    pair = (modifier, modified)
                        
                    if h == sentence[i].head:
                        f = (pair, 1)
                    else:
                        f = (pair, -1)
                    features.append(f)
    return features

In [106]:
def training_data(sentence):
    """create training_data

    arg:
        sentence: train
    
    return: 
        [(features, label)]

    """
    
    data_train = []
    pair = collect_head_word(sentence)
    
    # check all sentence in train
    for s in sentence:
        #check all clause in s
        for i in range(len(s)):
            features = extract_features(s, i, pair_set)
            data_train.extend(features)
    return data_train

In [284]:
data_train = training_data(train_sub)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

(('は', '年頭'), -1)

In [285]:
data_train[0]

(('は', '年頭'), -1)

In [303]:
def get_feature_id(feature, feature_ids):
    """translate feature into index(number)
    
    arg:
        feature: 
        feature_ids
    """
    
    if feature in feature_ids:
        return feature_ids[feature]
    else:
        num = len(feature_ids)
        feature_ids[feature] = num
        return num

In [275]:
def translate_data(data):
    """Create training data. (feature represented as number)
    
    arg:
        data: data_train
    
    return:
        n_data_train: training data represented as number
        feature_ids(dict)
    """
    
    feature_ids = {} # feature name: index
    n_data_train = [] # training data represented feature_index
    
    for features, label in data:
        n_features = [] # list of feature_index
        
        for feature in features:
            feature_id = get_feature_id(feature, feature_ids)
            n_features.append(feature_id)
            
        n_data_train.append((n_features, label))

    return n_data_train, feature_ids

In [266]:
train = translate_data(data_train)

In [292]:
import json
import collections as cl
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [268]:
help(LogisticRegression)

Help on class LogisticRegression in module sklearn.linear_model._logistic:

class LogisticRegression(sklearn.base.BaseEstimator, sklearn.linear_model._base.LinearClassifierMixin, sklearn.linear_model._base.SparseCoefMixin)
 |  LogisticRegression(penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
 |  
 |  Logistic Regression (aka logit, MaxEnt) classifier.
 |  
 |  In the multiclass case, the training algorithm uses the one-vs-rest (OvR)
 |  scheme if the 'multi_class' option is set to 'ovr', and uses the
 |  cross-entropy loss if the 'multi_class' option is set to 'multinomial'.
 |  (Currently the 'multinomial' option is supported only by the 'lbfgs',
 |  'sag', 'saga' and 'newton-cg' solvers.)
 |  
 |  This class implements regularized logistic regression using the
 |  'liblinear' library, 'newton-cg', 's

In [330]:
n_data_train = train[0]
feature_ids = train[1]

data_train_list = []
for i in range(len(n_data_train)):
    n_data_train_dict = {}
    n_data_train_dict['features'] = n_data_train[0][0]
    n_data_train_dict['label'] = n_data_train[0][1]
    data_train_list.append(n_data_train_dict)

data_train_list[0]

{'features': [0, 1], 'label': -1}

In [328]:
# Data(n_data_train) writing to json_file
with open('BERT-dep/n_data_train.json', 'w') as f:
    json.dump(data_train_list, f, indent=4)

In [329]:
with open('BERT-dep/n_data_train.json') as f:
    for i in range(20):
        print(f.read(i), end='')

[
    {
        "features": [
            0,
            1
        ],
        "label": -1
    },
    {
        "features": [
            0,
            1
        ],
        "label": -1
    }

In [326]:
n_data_train[0][0]

[0, 1]

In [290]:
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(n_train_data, feature_ids.values())

ValueError: not enough values to unpack (expected 4, got 2)