In [1]:
# Project and third-party imports used by the cells below.
from kuromojipy.kuromoji_server import KuromojiServer
# Single KuromojiServer instance; it is handed to ConceptMapper when the mapper is built.
# NOTE(review): presumably this starts or connects to a Kuromoji tokenizer process — confirm.
kuro_server = KuromojiServer()
from japanese_text_extractor.japanese_text_extractor import extract_japanese_text
from functools import reduce
# NOTE(review): encode_tags is not referenced by any cell visible in this notebook.
from grammar_recognizer import encode_tags
import romkan

In [3]:
import importlib
import concept_mapper
# Re-import concept_mapper so that edits made to the module are picked up
# without restarting the kernel.
importlib.reload(concept_mapper)

# Build the ConceptMapper instance that every analysis helper in this notebook uses.
# NOTE(review): the meaning of the "293" and "490" arguments is not visible from
# this notebook — confirm against ConceptMapper's constructor.
# NOTE(review): the recorded output of this cell ends in
# "AttributeError: 'NoneType' object has no attribute 'group'"; the statement
# that raised it is not shown in the saved output.
mapper = concept_mapper.ConceptMapper(kuro_server, "../id_reference.csv", "293", "490")




AttributeError: 'NoneType' object has no attribute 'group'

In [13]:
# Index the tab-separated sentence dump by its first column, keeping only rows
# whose second column is "jpn" or "eng". Each stored value is the list of
# fields that follow the id (language code first); fields are not stripped.
sents = {}
with open("sentences.csv", "r") as sentence_file:
    for raw_row in sentence_file:
        fields = raw_row.split("\t")
        if len(fields) < 2:
            # Rows without a second column carry no language code.
            continue
        if fields[1] == "jpn" or fields[1] == "eng":
            sents[fields[0]] = fields[1:]
    # The whole file has been consumed at this point.
    print("end!")

end!


In [14]:
# Pair up sentences using the three-column, tab-separated index file.
# Each entry is [first sentence text, second sentence text, third column];
# both texts are stripped, the third column is kept exactly as read.
translations = []
with open("jpn_indices.csv", "r") as index_file:
    for raw_row in index_file:
        fields = raw_row.split("\t")
        if len(fields) != 3:
            # Only rows with exactly three columns are usable.
            continue
        jp_id, eng_id, text = fields
        if jp_id not in sents or eng_id not in sents:
            # Skip links whose sentences were not loaded into `sents`.
            continue
        japanese = sents[jp_id][1].strip()
        english = sents[eng_id][1].strip()
        translations.append([japanese, english, text])

In [9]:
def count_missing(sentences=None, limit=500):
    """Report how often the mapper fails to recognise tokens in a batch of sentences.

    Prints two lines: the fraction of analysed sentences whose "missing" list
    is empty, and a list of (token, count) pairs for every missing token,
    most frequent first.

    Args:
        sentences: Sliceable sequence of sentences to analyse. Defaults to the
            notebook-level ``all_sents``.
        limit: Maximum number of sentences taken from the start of
            ``sentences``; ``None`` analyses all of them.
    """
    if sentences is None:
        sentences = all_sents
    batch = sentences[:limit]

    missing = {}
    good = 0
    for sent in batch:
        result = mapper.analyze_sent(sent)
        if result["missing"] == []:
            good += 1
            continue
        for elem in result["missing"]:
            token = elem[0]
            if token == "から":
                # Debugging aid kept from the original: show sentences where から is unmapped.
                print(sent)
            missing[token] = missing.get(token, 0) + 1

    # Divide by the number of sentences actually analysed (the batch can be
    # shorter than `limit`), and avoid dividing by zero on an empty batch.
    print(good / len(batch) if len(batch) else 0.0)
    print(sorted(missing.items(), key=lambda item: item[1], reverse=True))

In [11]:
def sent_concepts_to_id(concept):
    """Return the list of ids that a single found concept contributes to a card.

    A ``concept_mapper.WordConcept`` whose ``produce`` attribute is a list
    yields the ``id`` of every item in ``concept.read``; any other
    ``WordConcept`` yields ``[int(concept.read.id)]``; every other concept
    yields ``[int(concept.id)]``.
    """
    if type(concept) == concept_mapper.WordConcept:
        if type(concept.produce) == list:
            # NOTE(review): the check is on `produce` but the ids come from `read`,
            # and they are returned without int() unlike the other branches — confirm
            # this is intended.
            return [entry.id for entry in concept.read]
        return [int(concept.read.id)]
    return [int(concept.id)]

def concept_to_exp(exp):
    """Collect the distinct concepts found in the Japanese parts of ``exp``.

    ``extract_japanese_text`` splits ``exp`` into pieces, each piece is run
    through ``mapper.analyze_sent``, and the "found_words" and
    "found_grammars" of every analysis are pooled. Duplicates are dropped via
    a set, so the order of the returned list is not guaranteed.
    """
    analyses = [mapper.analyze_sent(fragment) for fragment in extract_japanese_text(exp)]
    collected = []
    for analysis in analyses:
        collected = collected + [*analysis["found_words"], *analysis["found_grammars"]]
    return list(set(collected))

def concepts_to_sent(results):
    """Flatten an analysis result into a list of concept ids rendered as strings.

    ``results`` must provide "found_words" and "found_grammars"; words come
    first, and each concept is expanded through ``sent_concepts_to_id``.
    """
    found = [*results["found_words"], *results["found_grammars"]]
    return [
        str(concept_id)
        for concept in found
        for concept_id in sent_concepts_to_id(concept)
    ]

In [19]:
def concepts_to_id(concepts):
    """Map each concept to an integer id, preserving input order.

    A ``concept_mapper.WordConcept`` contributes ``int(concept.read.id)``;
    anything else contributes ``int(concept.id)``.
    """
    ids = []
    for concept in concepts:
        if type(concept) == concept_mapper.WordConcept:
            ids.append(int(concept.read.id))
        else:
            ids.append(int(concept.id))
    return ids

def create_exp_csv():
    """Write ``exp.csv`` containing the text of ``exp.txt`` and its concept ids.

    The text is flattened to one line (newlines become spaces) and any ";" is
    replaced by "," so it cannot clash with the ";" field separator. The
    output is a single record: ``<text>;<space-separated concept ids>``.
    """
    # Read through a context manager so the input handle is closed promptly.
    with open("exp.txt", "r") as source:
        exp = source.read()
    exp = exp.replace("\n", " ").replace(";", ",")
    concept_ids = concepts_to_id(concept_to_exp(exp))
    with open("exp.csv", "w+") as file:
        # concepts_to_id returns ints; str.join only accepts strings.
        file.write(exp + ";" + " ".join(str(concept_id) for concept_id in concept_ids))

In [21]:
# Quick manual check: print the concept ids found in one sample sentence.
print(concepts_to_id(concept_to_exp("時間がなかったからパーティーに行きませんでした。")))

[{'name': 'The が identifier particle', 'coords': [{'start': 1, 'end': 3}]}, {'name': 'i-adjectives', 'coords': [{'start': 2, 'end': 4}]}, {'name': 'i-adjectives/conjugation', 'coords': [{'start': 2, 'end': 5}]}, {'name': 'verbs', 'coords': [{'start': 0, 'end': 1}]}, {'name': 'The target に particle', 'coords': [{'start': 6, 'end': 8}]}, {'name': 'using masu', 'coords': [{'start': 8, 'end': 10}]}]
[33558, 45486, 34390, 45539, 45568, 45512, 33844]


In [74]:
def create_sent_cards_csv():
    """Build production flashcards from ``sents.csv`` and write ``outcards.csv``.

    Each input row is ";"-separated: the first field is passed to
    ``mapper.analyze_sent`` and shown as the answer, the second field is shown
    as the prompt. Only the first 100 rows are considered, and only rows whose
    analysis has an empty "missing" list become cards. Each output line is
    ``front;back;space-separated concept ids``.
    """
    # Context manager so the input handle is closed once the rows are read.
    with open("sents.csv", "r") as source:
        rows = [[field.strip() for field in row.split(";")] for row in source]

    lines = []
    for line in rows[:100]:
        if len(line) < 2:
            # Blank or malformed row: there is no second field to build the prompt from.
            continue
        results = mapper.analyze_sent(line[0])
        if results["missing"] == []:
            lines.append([
                "How would you say in Japanese? <br /><h1>{}</h1>".format(line[1]),
                "<h1>" + line[0] + "</h1>",
                " ".join(concepts_to_sent(results)),
            ])
    with open("outcards.csv", "w+") as file:
        file.write("\n".join([";".join(x) for x in lines]))

In [25]:
def create_many_sent_cards():
    """Build recognition flashcards from ``translations`` and write ``outcards_3.csv``.

    Considers the first 10000 translation entries. For each entry whose
    analysis has an empty "missing" list, a card is emitted with the first
    element as the prompt, the second element plus a hiragana reading as the
    answer, and the space-separated concept ids as the third field. Entries
    that raise during processing are printed with the error and skipped.
    """
    lines = []
    for line in translations[:10000]:
        try:
            results = mapper.analyze_sent(line[0])
            if results["missing"] == []:
                # The last element of each pos_tag entry is converted to romaji and
                # back to hiragana. NOTE(review): presumably that element is the
                # token's kana reading — confirm against mapper.pos_tag.
                reading = "".join(
                    [romkan.to_hiragana(romkan.to_roma(token[-1])) for token in mapper.pos_tag(line[0])]
                )
                lines.append([
                    "What does this mean? <br /><h1>{}</h1>".format(line[0]),
                    "<h1>" + line[1] + "</h1><span>({})</span>".format(reading),
                    " ".join(concepts_to_sent(results)),
                ])
        except Exception as exc:
            # Best-effort: report the failing entry and its cause, then keep going.
            # Catching Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit stop a long run.
            print(line, "->", repr(exc))
    with open("outcards_3.csv", "w+") as file:
        file.write("\n".join([";".join(x) for x in lines]))

In [102]:
def show_text_in_color(text, color):
    """Return ``text`` wrapped in a ``<text>`` tag whose inline style sets ``color``."""
    template = "<text style=color:{}>{}</text>"
    return template.format(color, text)

In [121]:
def show_analyzed_sent(sent):
    """Display ``sent`` with every recognised word and grammar span colour-coded.

    The sentence is rebuilt from element 2 of each ``mapper.pos_tag`` entry.
    Each entry of "verbose_found_words" colours one token; each entry of
    "verbose_found_grammars" colours the token range given by its first
    coordinate pair. Zero-length spans are dropped. Colours cycle through a
    four-colour palette, and a legend line ("name - colour") is shown per span.
    """
    palette = ["red", "green", "blue", "purple"]
    tokens = mapper.pos_tag(sent)
    analysis = mapper.analyze_sent(sent)
    pieces = [token[2] for token in tokens]

    # Normalise both kinds of finding to (label, start, end) token ranges.
    spans = []
    for word in analysis['verbose_found_words']:
        spans.append((word[0], word[1], word[1] + 1))
    for grammar in analysis['verbose_found_grammars']:
        spans.append((grammar[0], grammar[1][0]['start'], grammar[1][0]['end']))
    spans = [span for span in spans if span[1] != span[2]]

    legend = []
    for position, (label, start, end) in enumerate(spans):
        shade = palette[position % len(palette)]
        # Same-length slice replacement, so later token indices stay valid.
        pieces[start:end] = [show_text_in_color(piece, shade) for piece in pieces[start:end]]
        legend.append("{} - {}".format(label, show_text_in_color(shade, shade)))

    markup = "<h1>" + "".join(pieces) + "</h1><br />" + "<br />".join(legend)
    display(html_print(markup))

In [83]:
from IPython.display import HTML as html_print
from IPython.display import display