In [1]:
#pip install "sacrebleu[ja]"

In [1]:
from collections import Counter
from itertools import combinations
from typing import Callable
from typing import Union

import fkassim.FastKassim as fkassim
import ginza
import spacy
from ginza import lemma_, bunsetu, sub_phrases
from nltk import Tree

In [2]:
from nltk.translate.bleu_score import sentence_bleu
#from nltk import word_tokenize
import numpy as np
import ginza
import spacy
from tqdm import tqdm
#import sacrebleu
import nltk

In [3]:
def compute_length_measures(sentences: list):
    """Summarise how long the given sentences are, measured in characters.

    TODO (open question from the author): also measure length in tokens/words/bunsetsus?

    :param sentences: list of sentence strings
    :return: avg_length, stddev (as computed by ``np.mean`` and ``np.std``)"""
    char_counts = list(map(len, sentences))
    mean_length = np.mean(char_counts)
    spread = np.std(char_counts)
    return mean_length, spread

In [4]:
# code from he et al. https://github.com/NLPCode/CDEG/blob/master/evaluations/automatic_evaluation.py
def compute_distances_and_entropy(tokenized_sentences_list, ngrams=(1, 2, 3, 4), num_tokens=0):
    """
    Measure generation diversity through n-gram statistics.

    For every order ``n`` in ``ngrams`` the n-grams of all sentences are pooled and two
    values are computed: the fraction of distinct n-grams (unique / total) and the
    entropy (natural logarithm) of the pooled n-gram frequency distribution.

    :param tokenized_sentences_list: list of sentences, each a sequence of tokens
    :param ngrams: iterable of n-gram orders to evaluate
    :param num_tokens: if > 0, only the leading sentences are used, stopping after the
        sentence with which the running token count reaches ``num_tokens``
    :return: ``(distances, entropies)``, two lists aligned with ``ngrams``. An order for
        which no n-gram exists (empty input, or every sentence shorter than ``n``)
        yields 0.0 in both lists instead of dividing by zero.
    """
    if num_tokens > 0:
        # keep whole sentences until the token budget is reached (the last one may overshoot)
        running_total = 0
        kept_sentences = []
        for tokenized_sentence in tokenized_sentences_list:
            running_total += len(tokenized_sentence)
            kept_sentences.append(tokenized_sentence)
            if running_total >= num_tokens:
                break
        tokenized_sentences_list = kept_sentences

    distances = []
    entropies = []
    for n in ngrams:
        # calculate (n-gram, frequency) pairs pooled over all sentences
        ngram_counts = Counter()
        for tokens in tokenized_sentences_list:
            tokens = list(tokens)
            # sliding window of width n: zip the sequence with its n shifted copies
            ngram_counts.update(zip(*(tokens[i:] for i in range(n))))
        unique = len(ngram_counts)  # the number of unique ngrams
        total = sum(ngram_counts.values())  # the number of ngrams
        if total == 0:
            # nothing to count for this order; both measures are reported as 0.0
            distances.append(0.0)
            entropies.append(0.0)
            continue
        distances.append(unique * 1.0 / total)
        # entropy: -sum(p * log(p)) with p = count / total
        weighted_log_sum = 0.0
        for count in ngram_counts.values():
            weighted_log_sum += count * np.log(count * 1.0 / total)
        entropies.append(-weighted_log_sum / total)
    return distances, entropies

In [5]:
def compute_self_BLEU(tokenized_sentence_list, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=None):
    """Computes BLEU score considering each sentence as hypothesis and the others as references, then averages the scores.
    input: the list of tokenized sentences (each a list of tokens),
    weights: the weights to give n-grams
    smoothing_function: optional smoothing callable forwarded to ``sentence_bleu``
        (e.g. a method of nltk's ``SmoothingFunction``). With the default ``None`` NLTK
        scores a hypothesis as 0 and emits a warning whenever it has no overlapping
        n-gram of some order.
    ---
    output: the mean bleu score; NaN when fewer than two sentences are given, because a
        hypothesis then has no reference to be compared against."""
    sentences = list(tokenized_sentence_list)
    if len(sentences) < 2:
        return float("nan")

    scores = []
    for i, hypothesis in enumerate(sentences):
        # every sentence except the current one acts as a reference
        references = sentences[:i] + sentences[i + 1:]
        score = sentence_bleu(hypothesis=hypothesis, references=references, weights=weights,
                              smoothing_function=smoothing_function)
        scores.append(score)
    return np.mean(scores)
    

In [60]:
def bunsetsu_sub(token):
    """Label the bunsetsu containing ``token`` with a substitution similar to Tolmachev et al.

    Every non-punctuation token of the bunsetsu contributes one piece: its POS tag when
    it is a bunsetsu head, or when its head is ``token`` and it is one of ``token``'s
    left dependents; its lemma otherwise. The pieces are joined with '+'.
    """
    def _piece(member):
        abstracted = ginza.is_bunsetu_head(member) or (
            ginza.head(member) == token and member in ginza.lefts(token)
        )
        return member.pos_ if abstracted else member.lemma_

    return "+".join(
        _piece(member)
        for member in ginza.bunsetu_span(token)
        if not member.is_punct  # skip punctuation
    )

def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``, in ``lst1`` order (duplicates kept)."""
    shared = []
    for item in lst1:
        if item in lst2:
            shared.append(item)
    return shared

def build_nltk_tree_from_ginza_rec(token, bunsetsu_tokens, node_repr):
    """Recursively convert the dependency subtree under ``token`` into an nltk structure.

    Only children of ``token`` that are listed in ``bunsetsu_tokens`` are followed, so
    each bunsetsu is represented once. A token without such children is returned as the
    bare ``node_repr(token)`` value; otherwise a ``Tree`` labelled ``node_repr(token)``
    is returned whose children keep the order of ``token.children``.
    """
    kept_children = intersection(token.children, bunsetsu_tokens)
    if len(kept_children) == 0:
        # leaf: no bunsetsu depends on this token
        return node_repr(token)

    subtrees = []
    for child in kept_children:
        subtrees.append(
            build_nltk_tree_from_ginza_rec(child, bunsetsu_tokens, node_repr=node_repr)
        )
    return Tree(node_repr(token), subtrees)

def build_nltk_tree_from_ginza(sentence: Union[str, spacy.tokens.doc.Doc, spacy.tokens.span.Span], nlp=None,
                               node_repr: Callable[[spacy.tokens.token.Token], str] = bunsetsu_sub):
    """Build an nltk tree over the bunsetsu of one sentence, intended as input for the fastkassim method.

    Input:
    sentence: a raw string, a spaCy Doc, or a Span that is already a single sentence.
        For a string or a Doc only the first sentence is used.
    nlp: spaCy pipeline used to parse ``sentence``; required when ``sentence`` is a string
    node_repr: the function to apply to each token to obtain its node label
        (eg use the full bunsetsu or do substitutions)

    returns: the result of ``build_nltk_tree_from_ginza_rec`` for the sentence root, i.e.
        an nltk Tree, or the bare ``node_repr`` label when no bunsetsu depends on the root
    raises: ValueError if a string is given without ``nlp``"""
    parsed = sentence
    if isinstance(sentence, str):
        if nlp is None:
            raise ValueError(f"If string object you need to pass an nlp spacy object, not {nlp}")
        parsed = nlp(sentence)
        parsed = list(parsed.sents)[0]
    elif isinstance(sentence, spacy.tokens.doc.Doc):
        # take the first sentence Span; the code below reads its ``.root``
        parsed = list(parsed.sents)[0]

    root = parsed.root
    # only the root token of each bunsetsu becomes a node of the tree
    bunsetsu_tokens = [span.root for span in ginza.bunsetu_spans(parsed)]
    return build_nltk_tree_from_ginza_rec(token=root, bunsetsu_tokens=bunsetsu_tokens, node_repr=node_repr)

def nltk_spacy_tree(sent, nlp):
    """
    Mirror the spaCy dependency parse of ``sent`` as an nltk tree.

    Every node is labelled ``<pos>_<dep>``. A token without dependents becomes a plain
    label string, any other token a ``Tree`` over its children. A tree is built for each
    sentence found in the parse and the one for the first sentence is returned.
    """
    def label_of(tok):
        return "_".join((tok.pos_, tok.dep_))

    def subtree(tok):
        if tok.n_lefts + tok.n_rights <= 0:
            return label_of(tok)
        return Tree(label_of(tok), [subtree(kid) for kid in tok.children])

    parsed = nlp(sent)
    sentence_trees = []
    for sentence in parsed.sents:
        sentence_trees.append(subtree(sentence.root))
    # The first item in the list is the tree of the first sentence
    return sentence_trees[0]


# now we need to do comparisons for lists of sentences
def compute_fastkassim_similarity(sentences: list, nlp):
    """Average FastKassim similarity over every unordered pair of the given sentences.

    Each sentence is turned into a tree with ``nltk_spacy_tree``; all 2-combinations of
    those trees are scored with ``compute_similarity_preparsed`` and the mean is returned.
    """
    scorer = fkassim.FastKassim(fkassim.FastKassim.LTK)
    parsed_trees = [nltk_spacy_tree(sentence, nlp=nlp) for sentence in sentences]
    pair_scores = [
        scorer.compute_similarity_preparsed(left, right)
        for left, right in combinations(parsed_trees, 2)
    ]
    return np.mean(pair_scores)


In [9]:
# Load the GiNZA Japanese spaCy pipeline ('ja_ginza'); the cells below use it for
# tokenization (nlp.tokenizer) and pass it to the tree-building functions.
nlp = spacy.load('ja_ginza')



In [10]:
# Sample sentences built around the verb 'taberu' (食べる, "to eat"), used as input for the
# length, self-BLEU and n-gram measures below. English glosses are given inline.
sentences = [
    "私は毎朝パンを食べる。",  # I eat bread every morning.
    "猫が魚を食べるのが好きです。",  # The cat likes to eat fish.
    "彼女はレストランでピザを食べるつもりです。",  # She plans to eat pizza at the restaurant.
    "昨日、私たちは外でラーメンを食べました。",  # Yesterday, we ate ramen outside.
    "彼は急いでサンドイッチを食べる。",  # He eats a sandwich in a hurry.
    "あの子はケーキを食べるのが得意です。",  # That child is good at eating cake.
    "私達は一緒に夕食を食べることにしました。",  # We decided to eat dinner together.
    "彼女は健康のためにサラダを食べる。",  # She eats salad for her health.
    "犬は何でも食べる。",  # Dogs eat anything.
    "夏にはよくアイスクリームを食べる。"  # I often eat ice cream in the summer.
]

#sentences = ["犯人が捕まったという話は滅多に聞かない。", "私は毎朝パンを食べる。"]

def get_tokenized_sentences(sentences: list, tokenizer=None) -> list:
    """Tokenize each sentence and return the surface forms (list of list of str).

    :param sentences: list of sentence strings
    :param tokenizer: callable mapping a string to an iterable of objects exposing a
        ``.text`` attribute; when omitted, the tokenizer of the notebook-level ``nlp``
        pipeline is used
    :return: one list of token texts per input sentence, in input order
    """
    if tokenizer is None:
        # fall back to the pipeline loaded at notebook level
        tokenizer = nlp.tokenizer
    return [[token.text for token in tokenizer(sent)] for sent in sentences]
# Tokenize the sample sentences (each string is assumed to be one already-segmented Japanese sentence).
# Alternatives that were tried: running the pipeline via nlp.pipe, optionally keeping only 'morphologizer':
#sentences_docs = nlp.pipe(sentences, disable=['tok2vec','parser','senter','attribute_ruler','ner'])
#sentences_docs = nlp.pipe(sentences)
tokenized_sentences = get_tokenized_sentences(sentences)
print(tokenized_sentences)
# Author's note: using nlp.tokenizer directly is faster.

[['私', 'は', '毎朝', 'パン', 'を', '食べる', '。'], ['猫', 'が', '魚', 'を', '食べる', 'の', 'が', '好き', 'です', '。'], ['彼女', 'は', 'レストラン', 'で', 'ピザ', 'を', '食べる', 'つもり', 'です', '。'], ['昨日', '、', '私たち', 'は', '外', 'で', 'ラーメン', 'を', '食べ', 'まし', 'た', '。'], ['彼', 'は', '急い', 'で', 'サンドイッチ', 'を', '食べる', '。'], ['あの', '子', 'は', 'ケーキ', 'を', '食べる', 'の', 'が', '得意', 'です', '。'], ['私達', 'は', '一緒', 'に', '夕食', 'を', '食べる', 'こと', 'に', 'し', 'まし', 'た', '。'], ['彼女', 'は', '健康', 'の', 'ため', 'に', 'サラダ', 'を', '食べる', '。'], ['犬', 'は', '何', 'で', 'も', '食べる', '。'], ['夏', 'に', 'は', 'よく', 'アイスクリーム', 'を', '食べる', '。']]


In [61]:
# Hand-made sanity inputs for compute_fastkassim_similarity.
# Mixed set: two sentences expected to be alike, one expected to be unlike the rest,
# plus the first sentence with an adjective added.
kassim_sentences =['田中さんはカレーを食べました。', # this should be similar to the second
    '犬は水を飲んだ。' ,# this should be similar to the first
    '見たことがないです。' ,# this should be not similar to anything
    '可愛い田中さんはカレーを食べました。' ]
# Pair that differs only in the order of subject and object, with an adjective on the subject.
kassim_sentences_no_reorder =['可愛い田中さんはカレーを食べました。', 
    'カレーを可愛い田中さんは食べました。' ]
# Same pair without the adjective.
kassim_sentences_reorder =['田中さんはカレーを食べました。', 
    'カレーを田中さんは食べました。' ]
# Each call prints the mean pairwise similarity of one set.
print('init test', compute_fastkassim_similarity(kassim_sentences, nlp))
print('no reorder',compute_fastkassim_similarity(kassim_sentences_no_reorder, nlp))
print('reorder',compute_fastkassim_similarity(kassim_sentences_reorder, nlp))

init test 0.7583333333333333
no reorder 0.7677973956662546
reorder 1.0


In [59]:
# Inspect the parameters of a FastKassim instance created with the LTK option
# (the same construction used inside compute_fastkassim_similarity).
FastKassim = fkassim.FastKassim(fkassim.FastKassim.LTK)
FastKassim.params

{'average': False, 'sigma': 1, 'lmbda': 0.4, 'use_new_delta': True}

In [21]:
# Average sentence length in characters, with its standard deviation
avg_length, std = compute_length_measures(sentences)
print('avg length', avg_length, '+-', std)
# Self-BLEU over the tokenized sentences (NLTK may warn about hypotheses
# that have no overlapping 3-/4-grams, as seen in the saved output)
self_bleu_score = compute_self_BLEU(tokenized_sentences)
print('self bleu', self_bleu_score)
# Ratio of unique n-grams and n-gram entropies for n = 1..4 (the function's defaults)
ngrams_entropies = compute_distances_and_entropy(tokenized_sentences)
print('distances (unique ngrams) - entropies:', ngrams_entropies)

avg length 16.3 +- 3.7429934544425802
self bleu 0.06023724351635462
distances (unique ngrams) - entropies: ([0.5, 0.7906976744186046, 0.9210526315789473, 0.9848484848484849], [3.444685034297525, 4.048416708262656, 4.203048333341079, 4.168650282009455])


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [22]:
# Toy check of the n-gram statistics on two hand-made token lists that share some n-grams
ngrams_test = [['sono', 'ehy', "ciao", 'mamma'],['mamma', 'ehy','ciao', '.']]
compute_distances_and_entropy(ngrams_test)

([0.625, 0.8333333333333334, 1.0, 1.0],
 [1.559581156259877,
  1.5607104090414063,
  1.3862943611198906,
  0.6931471805599453])

In [62]:
# test using diverse sentences

def test_self_bleu():
    """Print self-BLEU for a 'diverse' set, a 'similar' set and their union.

    Ten example sentences per set, all containing a form of the verb 聞く. The expectation,
    checked by eye on the printed values (nothing is asserted), is that the diverse
    sentences have a lower BLEU score and the similar ones a higher score."""
    diverse_sentences = [
        "毎朝、鳥の声を聞くのが日課です。", #(Listening to the birds' voices every morning is my daily routine.)
        "海の音を聞くと、心が落ち着きます。", #(Hearing the sound of the sea calms my mind.)
        "彼はラジオでジャズを聞くのが好きです。", #(He likes listening to jazz on the radio.)
        "私たちは山で風の音を聞きながらキャンプしました。", #(We camped in the mountains, listening to the sound of the wind.)
        "彼女は異文化について聞くことに興味があります。", #(She is interested in hearing about different cultures.)
        "その古い物語を聞いて、子供たちはワクワクしました。", #(The children were excited to hear the old tale.)
        "昨夜、遠くの雷の音を聞いて、驚きました。", #(Last night, I was surprised to hear the distant thunder.)
        "彼は外国語を聞いてもすぐに理解できます。", #(He can understand foreign languages just by listening.)
        "私は彼がピアノを弾くのを聞きに行きます。", #(I'm going to listen to him play the piano.)
        "その詩を聞くたびに、新しい発見があります。", #(Every time I hear that poem, I discover something new.)
    ]
    similar_sentences = [
        "彼に明日の予定を聞きました。",  # I asked him about tomorrow's schedule.
        "彼女から話を聞きました。",     # I heard a story from her.
        "友達が彼の意見を聞いた。",     # My friend asked for his opinion.
        "先生に質問を聞いてみました。",  # I tried asking the teacher a question.
        "彼らはそのニュースを聞いて驚いた。",  # They were surprised to hear the news.
        "私は彼のアドバイスを聞くつもりです。",  # I intend to listen to his advice.
        "兄がその曲を聞いて感動した。",      # My brother was moved when he listened to that song.
        "彼女は子供たちから話を聞くのが好きです。",  # She likes to hear stories from the children.
        "彼は先生に授業の詳細を聞きました。",      # He asked the teacher for details of the lesson.
        "彼らはニュースを聞いて心配している。"     # They are worried after hearing the news.
    ]
    #print(len(similar_sentences), len(diverse_sentences))
    # score each set on its own, then the two sets concatenated
    result_diverse = compute_self_BLEU(get_tokenized_sentences(diverse_sentences))
    result_similar = compute_self_BLEU(get_tokenized_sentences(similar_sentences))
    result_all = compute_self_BLEU(get_tokenized_sentences(similar_sentences+diverse_sentences))
    print('SELFBLEU results (lower is better diversity) \ndiverse:', result_diverse, 'similar:', result_similar, 'all:', result_all)

test_self_bleu()

SELFBLEU results (lower is better diversity) 
diverse: 0.1984155389876064 similar: 0.21653923593362454 all: 0.25870440709309006


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [63]:
# fastkassim test
def test_fastkassim():
    """Print the mean pairwise FastKassim similarity for a syntactically diverse set,
    a syntactically similar set and their union (ten sentences each, built around
    食べる). Nothing is asserted; the printed values are meant to be compared by eye."""
    # TODO (author): need to make them similar with chatgpt or something
    # diverse syntactical structures with the word "taberu"
    diverse_sentences =[
    "食べることは楽しいです。",  # Eating is fun.
    "このりんごを食べてもいいですか？",  # May I eat this apple?
    "彼女にケーキを食べさせました。",  # I let her eat cake.
    "食べた後で、彼は眠りました。",  # After eating, he slept.
    "食べながら、テレビを見ます。",  # I watch TV while eating.
    "食べたくないのですが。",  # I don't want to eat.
    "食べ物を見ると、お腹が空きます。",  # Seeing food makes me hungry.
    "昨日食べたピザは美味しかったです。",  # The pizza I ate yesterday was delicious.
    "彼は食べる前に手を洗います。",  # He washes his hands before eating.
    "食べるのを忘れてしまった。" ] # I forgot to eat.

    # similar syntactical structure with the word "taberu" (to eat)
    similar_sentences =  [
    "私は毎朝パンを食べます。",  # I eat bread every morning.
    "彼はよく寿司を食べます。",  # He often eats sushi.
    "彼女は昨日ケーキを食べました。",  # She ate cake yesterday.
    "私たちは一緒にリンゴを食べます。",  # We eat apples together.
    "彼は毎晩スープを食べます。",  # He eats soup every night.
    "犬は毎日ドッグフードを食べます。",  # The dog eats dog food every day.
    "彼女は週末にピザを食べます。",  # She eats pizza on weekends.
    "私は時々サラダを食べます。",  # I sometimes eat salad.
    "子供たちは昼食にハンバーガーを食べます。",  # The children eat hamburgers for lunch.
    "彼はたまにチョコレートを食べます。"] # He occasionally eats chocolate.

    # score each set on its own, then the two sets concatenated
    result_similar = compute_fastkassim_similarity(similar_sentences, nlp)
    result_diverse = compute_fastkassim_similarity(diverse_sentences, nlp)
    result_all = compute_fastkassim_similarity(diverse_sentences+similar_sentences, nlp)
    print('fastkassim results on syntactic similarity (lower is better diversity) \ndiverse:', result_diverse, 'similar:', result_similar, 'all:', result_all)

test_fastkassim()

fastkassim results on syntactic similarity (lower is better diversity) 
diverse: 0.48830575843830915 similar: 0.7362962962962963 all: 0.5525249520292133


In [2]:
# syntactic parse trees from tolmachev https://megagon.ai/ginza-version-4-0-improving-syntactic-structure-analysis-through-japanese-bunsetsu-phrase-extraction-api-integration/
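# NOTE: the commented-out skeleton below was generated by ChatGPT and is not usable as-is (most methods are unimplemented `pass` stubs).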


# import spacy
# import networkx as nx

# class SyntacticSimilarityModel:
#     def __init__(self, nlp = None):
#         """Initialize with nlp object (usually ja_ginza_electra) or (ja_core_news_lg)"""
#         if nlp:
#             self.nlp = nlp
#         else: self.nlp = spacy.load("ja_ginza_electra")

#     def parse_sentence(self, sentence):
#         # Parse the sentence to extract its dependency tree and POS tags
#         doc = self.nlp(sentence)
#         return doc

#     def simplify_tree(self, doc):
#         # Replace lexical items with POS tags, retain function words
#         # Implementation depends on specific requirements
#         pass

#     def generate_subtrees(self, doc, target_word, max_size=3):
#         # Generate subtrees from the parse tree
#         # This function will likely be the most complex to implement
#         pass

#     def expand_feature_space(self, subtrees):
#         # Handle compound nouns and multi-unit lexical items
#         pass

#     def vectorize_subtrees(self, subtrees):
#         # Convert subtrees to vector representation
#         pass

#     def compute_similarity(self, vector1, vector2):
#         # Compute syntactic similarity using a graphlet-based approach
#         # Example: Cosine similarity, Jaccard index, etc.
#         pass

#     def get_syntactic_similarity(self, sentence1, sentence2, target_word):
#         doc1 = self.parse_sentence(sentence1)
#         doc2 = self.parse_sentence(sentence2)

#         simplified_tree1 = self.simplify_tree(doc1)
#         simplified_tree2 = self.simplify_tree(doc2)

#         subtrees1 = self.generate_subtrees(simplified_tree1, target_word)
#         subtrees2 = self.generate_subtrees(simplified_tree2, target_word)

#         expanded_subtrees1 = self.expand_feature_space(subtrees1)
#         expanded_subtrees2 = self.expand_feature_space(subtrees2)

#         vector1 = self.vectorize_subtrees(expanded_subtrees1)
#         vector2 = self.vectorize_subtrees(expanded_subtrees2)

#         return self.compute_similarity(vector1, vector2)

# # Example usage
# model = SyntacticSimilarityModel(language='en')
# similarity = model.get_syntactic_similarity("He is a fast runner", "She is a slow runner", "runner")
# print(f"Syntactic similarity: {similarity}")


In [40]:
# Candidate example sentences: each assignment overwrites the previous one,
# so only the last sentence is actually parsed.
s = '田中さんはカレーを食べました。'
s = '私は毎日青い自転車を京都大学まで漕ぎます。' # from the example of tolmachev
s = '可愛い田中さんはカレーを食べました。' 
doc = nlp(s)
doc = list(doc.sents)[0] # keep the first sentence (a Span) to get the root

In [41]:
# Bunsetsu extraction. Note: pos_ is the English (UD) name, tag_ is the Japanese one.
bunsetsu_list = ginza.bunsetu_spans(doc)
print(bunsetsu_list)
# Idea ("erasure"): replace bunsetsu head words by their POS.
# Open question: must the bunsetsu be split first?
bunsetsu_head_list = ginza.bunsetu_head_tokens(doc)
for t in bunsetsu_head_list:
    # should change the tokens in the head with their Pos.
    # here we have still distinction between noun and proper noun.
    #span_pos = [t.pos_ for t in span]
    # using t.pos_ instead of t.text or span.text

    print(t.pos_, t.text)
    #print(span.label_, span.text, span_pos)

# NOTE(review): despite its name, this list is produced by bunsetu_head_tokens again,
# so it repeats the head tokens (see the printed output) — confirm which GiNZA call was intended.
bunsetsu_func_list = ginza.bunsetu_head_tokens(doc)
print(bunsetsu_func_list)


# Token texts of the sentence with every bunsetsu head replaced by its POS tag
[t.text if t not in bunsetsu_head_list else t.pos_ for t in doc]

[可愛い, 田中さんは, カレーを, 食べました。]
ADJ 可愛い
NOUN さん
NOUN カレー
VERB 食べ
[可愛い, さん, カレー, 食べ]


['ADJ', '田中', 'NOUN', 'は', 'NOUN', 'を', 'VERB', 'まし', 'た', '。']

In [34]:
def bunsetsu_sub(token):
    """Performs substitution similar to Tolmachev et al.: a bunsetsu head token is
    represented by its POS tag, any other token by its lemma.

    NOTE(review): a function with the same name is defined in an earlier cell of this
    notebook; when the cells run top to bottom this definition replaces it at module
    level. Confirm which version is meant to be kept."""
    head_tokens = list(ginza.bunsetu_head_tokens(token.doc))
    return token.pos_ if token in head_tokens else token.lemma_

In [42]:
# Show the tokens of the selected sentence
list(doc)

[可愛い, 田中, さん, は, カレー, を, 食べ, まし, た, 。]

In [57]:
# Pick the third token of the sentence and probe the GiNZA helpers on it.
# Only the value of the last expression is displayed as the cell output.
token = doc[2]
ginza.is_bunsetu_head(token)
list(ginza.rights(token))

[は]

In [58]:
# Bunsetsu span of the token picked above (stored but not displayed), then its head
# as reported by ginza.head, which is the value shown as the cell output.
span = ginza.bunsetu_span(token)
ginza.head(token)

食べ

In [30]:
# Print the root of the sentence, then every (relation, sub-phrase) pair that
# ginza.sub_phrases yields for the root when given ginza.bunsetu.
print('tree root node: ', doc.root, ginza.bunsetu(doc.root), ginza.phrase(doc.root))
for rel, sb in ginza.sub_phrases(doc.root, ginza.bunsetu):
    # NOTE(review): in the saved output ginza.bunsetu(sb) and ginza.phrase(sb) print as
    # function objects rather than text — presumably because sb is already the rendered
    # bunsetsu and not a token; confirm before relying on these two values.
    print('node: ', sb, ginza.bunsetu(sb), ginza.phrase(sb))
    print(rel, sb)

tree root node:  漕ぎ 漕ぎ+ます+。 漕ぎ
node:  私+は <function traverse.<locals>.<lambda> at 0x7f6543ef9310> <function traverse.<locals>.<lambda> at 0x7f6543ef93a0>
nsubj 私+は
node:  毎日 <function traverse.<locals>.<lambda> at 0x7f6543ef9310> <function traverse.<locals>.<lambda> at 0x7f6543ef93a0>
advmod 毎日
node:  自転車+を <function traverse.<locals>.<lambda> at 0x7f6543ef9310> <function traverse.<locals>.<lambda> at 0x7f6543ef93a0>
obj 自転車+を
node:  京都大学+まで <function traverse.<locals>.<lambda> at 0x7f6543ef9310> <function traverse.<locals>.<lambda> at 0x7f6543ef93a0>
obl 京都大学+まで


In [27]:
# NOTE(review): this import sits in a late cell; consider moving it to the import cell at the top.
from spacy import displacy

# Visualize the dependency parse of the selected sentence
displacy.render(doc, style="dep")