# MEに基づくモデルを用いた日本語係り受け解析

In [188]:
# Load the three corpus splits; load_kyoto_corpus() is not defined in this
# section of the file.
train, dev, test = load_kyoto_corpus()

In [254]:
def has_loop(s):
    """Report whether some clause of ``s`` names itself as its own head.

    arg:
        s: a sentence whose items expose a ``head`` index

    return:
        bool: True when ``s[i].head == i`` holds for at least one ``i``
    """
    return any(clause.head == pos for pos, clause in enumerate(s))

In [255]:
# Keep only the training sentences in which no clause is its own head.
train_sub = [s for s in train if not has_loop(s)]

In [114]:
def find_possible_heads(sentence, i):
    """Find the indices of every clause that clause ``i`` could modify.

    The candidates are the clause directly after ``i`` followed by the chain
    of heads reached from it (its head, that head's head, ...) until a head
    of ``-1`` is met.

    arg:
        sentence: a sentence whose items expose a ``head`` index
        i: index of the modifier

    return:
        idx(list): indices of the possible modifiees in chain order; empty
            when ``i`` is the last clause of the sentence
    """

    idx = []

    # The last clause has nothing to its right, hence no candidates.
    if i == len(sentence) - 1:
        return idx

    idx.append(i + 1)
    seen = {i + 1}
    head_next = sentence[i + 1].head
    # Stop at the -1 marker, or as soon as the chain comes back to a clause
    # already collected: a cyclic head chain would otherwise never terminate.
    while head_next != -1 and head_next not in seen:
        seen.add(head_next)
        idx.append(head_next)
        head_next = sentence[head_next].head
    return idx

In [84]:
def all_head_word_pair(sentence):
    """Count surface pairs between each clause and the clause it points to.

    For every clause except the last one of each sentence, each morph whose
    ``pos_maj`` is not "特殊" is paired with each such morph of the clause at
    index ``head``, and the occurrences of every pair are tallied.

    arg:
        sentence: train
    return:
        clause_pair(dict): {(modifier surface, head surface): count, ...}

    """

    clause_pair = {}

    for sent in sentence:
        # The final clause of a sentence is not scanned as a modifier.
        for pos in range(len(sent) - 1):
            dependent = sent[pos]
            head_idx = dependent.head
            for dep_morph in dependent.morphs:
                if dep_morph.pos_maj == "特殊":
                    continue
                for head_morph in sent[head_idx].morphs:
                    if head_morph.pos_maj == "特殊":
                        continue
                    key = (dep_morph.surface, head_morph.surface)
                    clause_pair[key] = clause_pair.get(key, 0) + 1
    return clause_pair

In [86]:
def collect_head_word(sentence):
    """Gather the words that occur in frequently counted head word pairs.

    arg:
        sentence: train

    return:
        head_word_set(set): every word belonging to a pair that
            all_head_word_pair() counted three or more times

    """

    pair_counts = all_head_word_pair(sentence)
    return {
        word
        for pair, count in pair_counts.items()
        if count >= 3
        for word in pair
    }

In [89]:
def phrase_form(sentence, i):
    """Pick the surface that stands for the i_th clause.

    The morphs of the clause are scanned from the end and the first one whose
    ``pos_maj`` is not "特殊" supplies the surface; when every morph is
    "特殊", the surface of the last morph is returned instead.

    arg:
        sentence: a sentence
        i: index of modifier

    return:
        string: surface of the selected morph
    """

    morphs = sentence[i].morphs
    for morph in reversed(morphs):
        if morph.pos_maj != "特殊":
            return morph.surface
    return morphs[-1].surface

In [105]:
def extract_features(sentence, i, pair_set):
    """Build labelled (modifier, modifiee) word pairs for the i_th clause.

    The modifier word is phrase_form(sentence, i). It is paired with every
    morph surface whose ``pos_maj`` is not "特殊" in each candidate clause
    returned by find_possible_heads(sentence, i). A pair is kept only when
    both of its words are in ``pair_set``.

    arg:
        sentence: a sentence
        i: index of modifier
        pair_set(set): words returned by collect_head_word()

    return:
        features(list): [((modifier, modified), label), ...] where label is
            1 when the candidate clause is ``sentence[i].head`` and -1
            otherwise
    """

    features = []
    heads_list = find_possible_heads(sentence, i)
    modifier = phrase_form(sentence, i)

    # Both words must be known. Note that `modifier and modified in pair_set`
    # parses as `modifier and (modified in pair_set)`, which never looks the
    # modifier up, so the two membership tests are spelled out separately.
    if modifier not in pair_set:
        return features

    gold_head = sentence[i].head
    for h in heads_list:
        label = 1 if h == gold_head else -1
        for morph in sentence[h].morphs:
            if morph.pos_maj == "特殊":
                continue
            modified = morph.surface
            if modified in pair_set:
                features.append(((modifier, modified), label))
    return features

In [106]:
def training_data(sentence):
    """Create the training examples for every clause of every sentence.

    arg:
        sentence: train; it is traversed twice (once to collect the frequent
            words, once to extract features), so it must be re-iterable

    return:
        data_train(list): the concatenation of extract_features() results,
            i.e. [((modifier, modified), label), ...]

    """

    data_train = []
    # This name must match the one passed to extract_features() below;
    # binding the set to a different name leaves `pair_set` undefined.
    pair_set = collect_head_word(sentence)

    # check all sentence in train
    for s in sentence:
        # check all clause in s
        for i in range(len(s)):
            data_train.extend(extract_features(s, i, pair_set))
    return data_train

In [None]:
# Build the labelled word-pair examples from the filtered training sentences.
data_train = training_data(train_sub)

In [285]:
# Inspect the first training example.
data_train[0]

(('は', '年頭'), -1)

In [303]:
def get_feature_id(feature, feature_ids):
    """Return the numeric id of ``feature``, registering it when unseen.

    arg:
        feature: the feature to look up (used as a dict key)
        feature_ids(dict): feature -> id table; an unseen feature is added
            in place with the current table size as its id

    return:
        the id stored for ``feature``
    """

    # The default is evaluated before the insertion, so a new feature gets
    # the number of entries that were already in the table.
    return feature_ids.setdefault(feature, len(feature_ids))

In [275]:
def translate_data(data):
    """Convert training data so that each feature is replaced by its id.

    arg:
        data: data_train, an iterable of (features, label) pairs

    return:
        n_data_train: list of (list of feature ids, label) pairs
        feature_ids(dict): feature -> id table built while converting
    """

    feature_ids = {}  # feature name: index
    n_data_train = [
        ([get_feature_id(name, feature_ids) for name in names], label)
        for names, label in data
    ]
    return n_data_train, feature_ids

In [266]:
# NOTE: this rebinds `train` (earlier the corpus split returned by
# load_kyoto_corpus()) to the (n_data_train, feature_ids) tuple.
train = translate_data(data_train)

In [292]:
import json
import collections as cl
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Unpack the tuple returned by translate_data().
n_data_train = train[0]  # examples with features replaced by ids
feature_ids = train[1]  # feature -> id table

In [None]:
def n_data_train_to_json(n_data_train, path='BERT-dep/n_data_train.json'):
    """Write the numeric training data to a JSON file.

    The file holds a list with one {"features": ..., "label": ...} object
    per example, in the order of ``n_data_train``.

    arg:
        n_data_train: sequence of (features, label) pairs
        path: output file; defaults to 'BERT-dep/n_data_train.json'
    """

    # Each record is built from its own example; indexing with a constant 0
    # here would dump the first example once per entry.
    data_train_list = [
        {'features': example[0], 'label': example[1]}
        for example in n_data_train
    ]

    with open(path, 'w') as f:
        json.dump(data_train_list, f, indent=4)

In [None]:
# Save the numeric training examples as JSON.
n_data_train_to_json(n_data_train)

In [None]:
def feature_id_to_json(feature_ids, path='BERT-dep/feature_ids.json'):
    """Write the feature -> id table to a JSON file.

    arg:
        feature_ids(dict): feature -> id table
        path: output file; defaults to 'BERT-dep/feature_ids.json'
    """

    with open(path, 'w') as f:
        json.dump(feature_ids, f, indent=4)

In [None]:
# Save the feature -> id table as JSON.
feature_id_to_json(feature_ids)