In [2]:
import re

In [None]:
from kuromojipy.kuromoji_server import KuromojiServer
# Shared Kuromoji server handle; pos_tag reads its `.kuromoji` attribute to build a tokenizer.
kuro_server = KuromojiServer()

In [108]:
# Negative state-of-being examples (noun + じゃない).
negative_sents = [
    "学生じゃない。",
    "友達じゃない。",
    "元気じゃない。",
]

# Past and past-negative state-of-being examples.
past_sents = [
    "友達だった。",
    "友達じゃなかった。",
    "元気じゃなかった。",
]

# Examples containing the は particle.
ha_sents = [
    "アリスは学生？",
    "ジョンは明日？",
    "今日は試験だ。",
]

# Examples containing the も particle (the first one has no closing period).
mo_sents = [
    "昨日も雨だった",
    "これもそう。",
    "出口もここだ。",
]

# General example sentences. The raw text marks one word per line with
# <b>...</b>; the markup is stripped before the text is split into lines.
sents = re.sub(
    "<b>(.*?)</b>",
    r"\1",
    """それを<b>一つ</b>ください。
私は学生だ。
ソフトクリームを<b>二つ</b>ください。
カレーライスは700<b>円</b>です。
これはかなり<b>金</b>がかかった。
<b>これ</b>をください。
<b>水曜日</b>はバイトがあります。
<b>あれ</b>は何ですか。
お<b>先</b>にどうぞ。
<b>八日</b>からイギリスに行きます。
<b>そば</b>にいて下さい。
<b>こっち</b>に来て下さい。
<b>右</b>のポケットにハンカチが入っています。
この本、あなたに<b>あげます</b>。
<b>ここ</b>に本があります。
この本は<b>とても</b>おもしろい。
バナナが<b>七</b>本あります。
分かった人は<b>手</b>を上げてください。
<b>おなか</b>が空きました。
レストランは<b>空いていました</b>。""",
).splitlines()

In [100]:
def pos_interpreter(pos):
    """Translate a comma-separated Japanese part-of-speech string to English.

    Args:
        pos: Tag string such as "助動詞,*,*,*"; fields are separated by ",".

    Returns:
        A list with the English label of every translatable field, in the
        original order. "*" placeholder fields are skipped. A field with no
        known translation is printed to stdout and left out of the result.
    """
    # NOTE: the "auxilary verb" spelling is kept as-is because encode_tags
    # looks labels up by these exact strings.
    translations = {
        "名詞": "noun",
        "一般": "general",
        "助詞": "particle",
        "格助詞": "case-marking particle",
        "動詞": "verb",
        "自立": "independent",
        "助動詞": "auxilary verb",
        "記号": "symbol",
        "句点": "period",
        "代名詞": "pronoun",
        "係助詞": "binding particle",
        "数": "number",
        "接尾": "suffix",
        "助数詞": "counter",
        "副助詞": "adverbial particle",
        "形容動詞語幹": "adjectival noun stem",
        "固有名詞": "proper noun",
        "人名": "persons name",
        "名": "name",
        "サ変接続": "connecting irregular conjugation",
        "副詞可能": "adverb maybe",
        "副詞": "adverb",
    }
    real_tags = []
    for field in pos.split(','):
        if field == "*":
            continue
        label = translations.get(field)
        if label is None:
            # Surface the untranslated tag so it can be added to the table.
            print(field)
            continue
        real_tags.append(label)
    return real_tags
print(pos_interpreter("助動詞,*,*,*"))

['auxilary verb']


In [38]:
def pos_tag(sentence):
    """Tokenize a sentence and describe each token.

    A new tokenizer is built from ``kuro_server.kuromoji`` on every call.

    Args:
        sentence: Text to tokenize.

    Returns:
        A list with one ``[base, tags, surface]`` entry per token, where
        ``base`` is the token's base form (or its surface form when the base
        form is None), ``tags`` is the output of ``pos_interpreter`` for the
        token's part-of-speech string, and ``surface`` is the surface form.
    """
    tokenizer = kuro_server.kuromoji.Tokenizer.builder().build()
    tagged_words = []
    for token in tokenizer.tokenize(sentence):
        surface = token.getSurfaceForm()
        base = token.getBaseForm()
        if base is None:
            # No base form reported for this token; use the text as written.
            base = surface
        tagged_words.append(
            [base, pos_interpreter(token.getPartOfSpeech()), surface]
        )
    return tagged_words


In [15]:
# Tag the second cleaned example sentence.
# NOTE(review): the saved output below shows two fields per token, but pos_tag
# returns three (base form, tags, surface form) — looks stale; re-run to confirm.
pos_tag(sents[1])

[['私', ['noun', 'pronoun', 'general']],
 ['は', ['particle', 'binding particle']],
 ['学生', ['noun', 'general']],
 ['だ', ['auxilary verb']],
 ['。', ['symbol', 'period']]]

In [99]:
def encode_tags(tags):
    """Serialize tagged words into one string that regexes can scan.

    Each word becomes ``ő<base>;<codes>;<surface>ú`` where ``<codes>`` is the
    comma-separated list of integer codes for the word's English tag labels.
    The per-word chunks are concatenated with no separator.

    Args:
        tags: Sequence of words; each word is indexed as
            ``[base_form, [tag labels...], surface_form]``.

    Returns:
        The encoded string ("" for an empty sequence).

    Raises:
        KeyError: If a tag label has no entry in the code table.
    """
    tag_encodings = {
        "noun": 1,
        "general": 2,
        "particle": 3,
        "case-marking particle": 4,
        "verb": 5,
        "independent": 6,
        "auxilary verb": 7,
        "symbol": 8,
        "period": 9,
        "pronoun": 10,
        "binding particle": 11,
        "number": 12,
        "suffix": 13,
        "counter": 14,
        "adverbial particle": 15,
        "adjectival noun stem": 16,
        "proper noun": 17,
        "persons name": 18,
        "name": 19,
        "connecting irregular conjugation": 20,
        "adverb maybe": 21,
        "adverb": 22,
    }
    chunks = []
    for word in tags:
        codes = ",".join(str(tag_encodings[label]) for label in word[1])
        # "ő" opens a word and "ú" closes it; ";" separates the three fields.
        chunks.append("ő" + word[0] + ";" + codes + ";" + word[2] + "ú")
    return "".join(chunks)
# Demo: encode the tagged form of the fourth example sentence.
encode_tags(pos_tag(sents[3]))

'őカレーライス;1,2;カレーライスúőは;3,11;はúő700;1,12;700úő円;1,13,14;円úőです;7;ですúő。;8,9;。ú'

In [104]:
# Inspect the first も example; in the output below, だった is split into two
# tokens (surface forms だっ and た).
pos_tag(mo_sents[0])

[['昨日', ['noun', 'adverb maybe'], '昨日'],
 ['も', ['particle', 'binding particle'], 'も'],
 ['雨', ['noun', 'general'], '雨'],
 ['だ', ['auxilary verb'], 'だっ'],
 ['た', ['auxilary verb'], 'た']]

In [105]:
def create_word_pattern(surface_form=None, base_form=None, tags=None):
    """Build a regex fragment that matches one encoded word.

    An encoded word has the layout ``ő<base>;<codes>;<surface>ú`` where
    ``<codes>`` is a comma-separated list of integer tag codes (the format
    produced by ``encode_tags``). Fragments can be concatenated to match
    consecutive words.

    Args:
        surface_form: Regex text for the surface field; any surface form is
            accepted when None. Inserted verbatim (not escaped).
        base_form: Regex text for the base field; any base form is accepted
            when None. Inserted verbatim (not escaped).
        tags: Iterable of tag codes the word must carry, in the order they
            appear in the encoded word. Other codes may appear before,
            between and after them. Any code list is accepted when None.

    Returns:
        The regex fragment as a string.
    """
    if surface_form is None:
        surface_form = "[^ú]+?"
    if base_form is None:
        base_form = "[^;]+?"
    if tags is None:
        tag_pattern = "[0-9,]*"
    else:
        # Each required code must be a complete number: it has to be followed
        # by "," or by the ";" that closes the code list. Without that
        # boundary a requirement of 1 would also accept 10-19, so particles
        # tagged 11 or 15 would pass as nouns.
        # The skip group stays capturing so group numbering is unchanged.
        tag_pattern = "".join(
            "([0-9]+,)*?" + str(tag) + "(?:,|(?=;))" for tag in tags
        )
    return "ő" + base_form + ";" + tag_pattern + "[0-9,]*" + ";" + surface_form + "ú"
        
        
def recognize_da(encoded):
    """Detect a noun directly followed by plain だ in an encoded sentence.

    Args:
        encoded: Sentence string in the ``encode_tags`` format.

    Returns:
        "Declaring using だ" when the pattern is found, otherwise None.
    """
    # Tag code 1 is "noun" and 7 is "auxilary verb" in encode_tags' table.
    noun = create_word_pattern(tags=[1])
    copula = create_word_pattern(base_form="だ", surface_form="だ", tags=[7])
    if re.search(noun + copula, encoded) is None:
        return None
    return "Declaring using だ"

def recognize_negative_state_of_being(encoded):
    """Detect a noun followed by じゃ and ない in an encoded sentence.

    Args:
        encoded: Sentence string in the ``encode_tags`` format.

    Returns:
        "Negative state of being" when the pattern is found, otherwise None.
    """
    # Codes from encode_tags' table: 1 noun, 3 particle,
    # 15 adverbial particle, 7 auxilary verb.
    word_patterns = [
        create_word_pattern(tags=[1]),
        create_word_pattern(base_form="じゃ", surface_form="じゃ", tags=[3,15]),
        create_word_pattern(base_form="ない", surface_form="ない", tags=[7]),
    ]
    if re.search("".join(word_patterns), encoded) is None:
        return None
    return "Negative state of being"

def recognize_past_negative(encoded):
    """Detect a noun followed by じゃ, なかっ and た in an encoded sentence.

    Args:
        encoded: Sentence string in the ``encode_tags`` format.

    Returns:
        "Past negative state of being" when the pattern is found, otherwise
        None.
    """
    # Codes from encode_tags' table: 1 noun, 3 particle,
    # 15 adverbial particle, 7 auxilary verb.
    pattern = \
            create_word_pattern(tags=[1]) + \
            create_word_pattern(base_form="じゃ", surface_form="じゃ", tags=[3,15]) + \
            create_word_pattern(base_form="ない", surface_form="なかっ", tags=[7]) + \
            create_word_pattern(base_form="た", surface_form="た", tags=[7])
    if re.search(pattern, encoded) is not None:
        # The label names the negative form explicitly so it differs from the
        # one returned by recognize_past_state_of_being.
        return "Past negative state of being"
    return None

def recognize_past_state_of_being(encoded):
    """Detect a noun followed by だっ and た in an encoded sentence.

    Args:
        encoded: Sentence string in the ``encode_tags`` format.

    Returns:
        "Past state of being" when the pattern is found, otherwise None.
    """
    # Codes from encode_tags' table: 1 noun, 7 auxilary verb.
    noun = create_word_pattern(tags=[1])
    copula_stem = create_word_pattern(base_form="だ", surface_form="だっ", tags=[7])
    past_marker = create_word_pattern(base_form="た", surface_form="た", tags=[7])
    matched = re.search(noun + copula_stem + past_marker, encoded)
    return None if matched is None else "Past state of being"
    
def recognize_topic_particle(encoded):
    """Detect the particle は in an encoded sentence.

    Args:
        encoded: Sentence string in the ``encode_tags`` format.

    Returns:
        "The は topic particle" when the pattern is found, otherwise None.
    """
    # Codes from encode_tags' table: 3 particle, 11 binding particle.
    wa = create_word_pattern(base_form="は", surface_form="は", tags=[3,11])
    if re.search(wa, encoded) is None:
        return None
    return "The は topic particle"
    
def recognize_inclusive_particle(encoded):
    """Detect the particle も in an encoded sentence.

    Args:
        encoded: Sentence string in the ``encode_tags`` format.

    Returns:
        "The も inclusive topic particle" when the pattern is found,
        otherwise None.
    """
    # Codes from encode_tags' table: 3 particle, 11 binding particle.
    mo = create_word_pattern(base_form="も", surface_form="も", tags=[3,11])
    if re.search(mo, encoded) is None:
        return None
    return "The も inclusive topic particle"

def recognize_grammar(sent):
    """Run every grammar recognizer over a sentence.

    The sentence is tagged with ``pos_tag``, encoded with ``encode_tags`` and
    then passed to each recognizer in a fixed order.

    Args:
        sent: Sentence text.

    Returns:
        A list of the labels returned by the recognizers that matched, in
        recognizer order; empty when nothing matched.
    """
    encoded = encode_tags(pos_tag(sent))
    recognizers = (
        recognize_da,
        recognize_negative_state_of_being,
        recognize_past_state_of_being,
        recognize_past_negative,
        recognize_topic_particle,
        recognize_inclusive_particle,
    )
    found = []
    for recognizer in recognizers:
        label = recognizer(encoded)
        if label is not None:
            found.append(label)
    return found

In [92]:
# Inspect how the first は example is tokenized and tagged.
pos_tag(ha_sents[0])

[['アリス', ['noun', 'proper noun', 'persons name', 'name'], 'アリス'],
 ['は', ['particle', 'binding particle'], 'は'],
 ['学生', ['noun', 'general'], '学生'],
 ['？', ['symbol', 'general'], '？']]

In [74]:
# Inspect how the first past-tense example is tokenized and tagged.
pos_tag(past_sents[0])

[['友達', ['noun', 'general'], '友達'],
 ['だ', ['auxilary verb'], 'だっ'],
 ['た', ['auxilary verb'], 'た'],
 ['。', ['symbol', 'period'], '。']]

In [101]:
# Inspect how the second は example is tokenized and tagged.
pos_tag(ha_sents[1])

[['ジョン', ['noun', 'proper noun', 'persons name', 'name'], 'ジョン'],
 ['は', ['particle', 'binding particle'], 'は'],
 ['明日', ['noun', 'adverb maybe'], '明日'],
 ['？', ['symbol', 'general'], '？']]

In [111]:
# Run the full recognizer pipeline on the second も example.
# Any bare tag lines printed before the result list come from pos_interpreter,
# which prints tags it has no translation for.
print(mo_sents[1])
print(recognize_grammar(mo_sents[1]))

これもそう。
副詞
助詞類接続
['The も inclusive topic particle']
