In [1]:
import utils
from utils import *
import pandas as pd
import numpy as np
import random
import time

In [2]:
# Setup for building the evaluation frame(s): the per-system sentence files
# (one per test word) are read further down and merged into one big frame with
# extra bookkeeping columns.
utils.fix_reproducibility()  # presumably seeds the RNGs -- defined in utils, not visible here

# NOTE(review): `load_dataset` is not imported by name in this notebook; it
# presumably arrives through `from utils import *` -- confirm.
corpus = load_dataset("bennexx/jp_sentences")['train'].to_pandas()

target_filename = '/data/enrico_benedetti/nihongoexample/data/targets/target_words.csv'
# The 1-based position of a directory in this list is used as `system_id`
# when the outputs are read (1 = retrieval, 2 = llm_jp, 3 = chatgpt).
system_dirs = [
    '/data/enrico_benedetti/nihongoexample/evaluation/outputs/retrieval/',
    '/data/enrico_benedetti/nihongoexample/evaluation/outputs/generation/llm_jp/',
    '/data/enrico_benedetti/nihongoexample/evaluation/outputs/generation/chatgpt/',
]
output_dir = "/data/enrico_benedetti/nihongoexample/evaluation/outputs/all"

df_target = pd.read_csv(target_filename)

In [3]:
# Gather, for every (target level, system, target word) query, up to k output
# sentences and tag each row with where it came from.
target_levels = ['N1', 'N3', 'N5']
dfs = []
k = 5  # number of sentences kept per query

for target_level in target_levels:
    # system_id is the 1-based position of the directory inside system_dirs
    for system_id, system_dir in enumerate(system_dirs, start=1):
        for i, data in df_target.iterrows():
            target_word = data['target_word']
            context_sentence = data['context_sentence']
            # one file per query: <system_dir><target_word>_<target_level>_.csv
            sentence_file = f"{system_dir}{target_word}_{target_level}_.csv"

            try:
                # the retrieval files carry different columns than the generation ones
                df = pd.read_csv(sentence_file)
            except FileNotFoundError as e:
                # no output file for this query: report it and carry on
                print(e)
                continue

            df = df.assign(
                system_id=system_id,
                context_sentence=context_sentence,
                target_word=target_word,
                target_level=target_level,
                # val/test membership is taken from the targets table
                split='test' if data['is_test_reduced'] else 'val',
            )
            # Every system stores the context sentence in the first row, so it
            # is skipped; the label slice 1..k is inclusive on both ends.
            dfs.append(df.loc[1:k])

combined_df = pd.concat(dfs, ignore_index=True)

In [7]:
# Write out only the rows whose split is 'test' (judging by the file name,
# this is the subset used for human evaluation).
# NOTE(review): the target file has no `.csv` extension and `output_dir` is
# not used here -- confirm both are intentional.
human_eval_only = combined_df['split'].eq('test')
combined_df.loc[human_eval_only].to_csv(
    '/data/enrico_benedetti/nihongoexample/data/targets/combined_human_eval',
    index=False,
)

In [5]:
# Row count of each per-query frame collected in `dfs`.
lens = list(map(len, dfs))

In [6]:
# How many queries yielded 1, 2, ... sentences.
df_info = pd.DataFrame(dict(len=lens))
df_info.value_counts()

len
5      553
1        4
4        4
2        3
3        3
Name: count, dtype: int64

In [7]:
# check that the generated outputs are not in the dataset

In [8]:
# Row masks over combined_df that the following cells reuse.
# dup: the output sentence also appears verbatim in the corpus.
dup = combined_df['sentence'].isin(corpus['sentence'])

# One mask per system; ids follow the order of system_dirs.
system_ids = combined_df['system_id']
sys1 = system_ids.eq(1)
sys2 = system_ids.eq(2)
sys3 = system_ids.eq(3)

# Compact column selection for readable displays.
pretty_subset = [
    'split',
    'sentence',
    'target_word',
    'target_level',
    'context_sentence',
    'system_id',
]

In [9]:
# Count, per system and split, the outputs that are also corpus sentences.
print('Outputs which are also present in the corpus:')
corpus_overlap_counts = combined_df.loc[dup].value_counts(subset=['system_id', 'split'])
print(corpus_overlap_counts)

Outputs which are also present in the corpus:
system_id  split
1          val      795
           test     150
2          val        1
3          val        1
Name: count, dtype: int64


In [10]:
# Outputs from the non-retrieval systems (system_id != 1) that nevertheless
# occur verbatim in the corpus.
generated_dup = dup & ~sys1
print('Generation outputs which are also present in the corpus:')
display(combined_df.loc[generated_dup, pretty_subset])

# Look up which source each of those corpus sentences belongs to.
corpus_source = (
    load_dataset("bennexx/jp_sentences", "sources", split='train')
    .to_pandas()
    .rename(columns={'Unnamed: 0': 'start'})
)
# Write each source name at the corpus row labelled by its 'start' value, then
# forward-fill so the rows up to the next 'start' inherit the same source.
for source_id, source_name in zip(corpus_source['start'].tolist(), corpus_source['source']):
    corpus.loc[source_id, 'source'] = source_name
corpus['source'] = corpus['source'].ffill()

print("Sources for those sentences")
corpus_dup = corpus['sentence'].isin(combined_df.loc[generated_dup, 'sentence'])
display(corpus.loc[corpus_dup])

Generation outputs which are also present in the corpus:


Unnamed: 0,split,sentence,target_word,target_level,context_sentence,system_id
1366,val,彼は立っていた。,立つ,N3,しかし自分ながら、なぜそんなに腹が立つのだか分からない。,2
2581,val,情報が欲しいです。,情報,N5,水の色が真赤になる情報があったであります。,3


Sources for those sentences


Unnamed: 0,sentence,source
107529,情報が欲しいです。,jpwac
305073,彼は立っていた。,tatoeba


In [11]:
# How many sentences were produced in total, relative to the k requested for
# every (target word, target level) query.
print("Per system coverage of getting enough sentences on all words")
system_coverage = combined_df.value_counts(subset=['system_id']) / (len(df_target) * k * len(target_levels))
print(system_coverage)

print("Per system and level coverage of getting enough sentences on all words")
system_level_coverage = combined_df.value_counts(subset=['system_id', 'target_level']) / (len(df_target) * k)
print(system_level_coverage)


print("Which words produced less than k sentences")
word_level_coverage = combined_df.value_counts(subset=['split','system_id', 'target_level', 'target_word'])
# Compare against k (the per-query quota used when building combined_df)
# rather than a literal, so the check follows k if it is ever changed.
# NOTE: value_counts only sees combinations that have at least one row, so a
# query that produced no sentence at all is not listed here.
bad_coverage = word_level_coverage[word_level_coverage < k]

print(bad_coverage)
print("How many queries are without enough sentences:")
print(bad_coverage.count())

# Author's finding when investigating "hajime": the kanji was changed in the
# output, so the target word was not recognized.

Per system coverage of getting enough sentences on all words
system_id
1            1.000000
2            0.987302
3            0.975661
Name: count, dtype: float64
Per system and level coverage of getting enough sentences on all words
system_id  target_level
1          N1              1.000000
           N3              1.000000
           N5              1.000000
2          N3              0.990476
           N5              0.987302
           N1              0.984127
3          N5              0.984127
           N3              0.977778
           N1              0.965079
Name: count, dtype: float64
Which words produced less than k sentences
split  system_id  target_level  target_word
val    3          N5            立つ             4
       2          N1            前              4
       3          N5            可能             4
                  N3            子供             4
                                飛ぶ             3
                                立つ             3
       

In [12]:
# Break the under-filled queries (bad_coverage) down by system and by level.
print("Unsolved queries by system")
unsolved_by_system = bad_coverage.groupby(by='system_id').aggregate(len)
display(unsolved_by_system)

print("Unsolved queries by system and target level")
unsolved_by_system_level = bad_coverage.groupby(by=['system_id', 'target_level']).aggregate(len)
display(unsolved_by_system_level)  # spread pretty much uniformly over the levels

# Same numbers as a share of all queries sent to a system.
total_queries_per_system = len(df_target) * len(target_levels)
print(f"Each system is queried {total_queries_per_system} times")
unsolved_queries = unsolved_by_system / total_queries_per_system
# Systems without any under-filled query have no group, so they are absent here.
display(1 - unsolved_queries)

Unsolved queries by system


system_id
2     4
3    10
Name: count, dtype: int64

Unsolved queries by system and target level


system_id  target_level
2          N1              2
           N3              1
           N5              1
3          N1              3
           N3              4
           N5              3
Name: count, dtype: int64

Each system is queried 189 times


system_id
2    0.978836
3    0.947090
Name: count, dtype: float64

In [13]:
# Group by 'system_id' and count duplicates within each group
dup_count_by_system = combined_df[combined_df.duplicated(subset=['sentence'], keep=False)]
# Sort the counts in descending order
sorted_dup_count = dup_count_by_system.value_counts(subset=['split','system_id']).sort_values(ascending=False)

# Display the results

print('duplicated sentences by system')
sorted_dup_count = dup_count_by_system.value_counts(subset=['system_id'], sort=False)
print(sorted_dup_count.to_string())

print('duplicated sentences by system (percentage)')
sorted_dup_count = dup_count_by_system.value_counts(subset=['system_id'], sort=False) / (len(df_target) * k * len(target_levels))
print(sorted_dup_count.to_string())

print('duplicated sentences by system and split')
sorted_dup_count = dup_count_by_system.value_counts(subset=['split','system_id']).sort_values(ascending=False)
print(sorted_dup_count.to_string())

### we have that GPT3.5 is the system that reuses the most sentences for different levels
# 84 out of 

duplicated sentences by system
system_id
1            22
2             4
3            84
duplicated sentences by system (percentage)
system_id
1            0.023280
2            0.004233
3            0.088889
duplicated sentences by system and split
split  system_id
val    3            72
       1            22
test   3            12
val    2             4


In [23]:
# First row of the duplicated-outputs table (sorted by sentence).
# The table is rebuilt inline because `df_dup_out` is only assigned in a later
# cell; referencing it here raises NameError on a fresh Restart & Run All.
(
    combined_df[combined_df.duplicated(subset=['sentence'], keep=False)][pretty_subset]
    .sort_values(by='sentence')
    .iloc[0]
)

split                                                             val
sentence            ■ドーナツフライオイルにはメーカーと共同開発した植物性のオイルを使用することで、油分の吸収を...
target_word                                                      サッパリ
target_level                                                       N3
context_sentence                    また聞こえてくる君尾の笑い声、はなやかで明るくてサッパリしている。
system_id                                                           1
Name: 1225, dtype: object

In [14]:
# List every output row whose sentence text appears more than once, ordered
# by sentence so that repeated sentences sit next to each other.
print('Outputs that are duplicates:')
is_repeated = combined_df.duplicated(subset=['sentence'], keep=False)
df_dup_out = combined_df.loc[is_repeated, pretty_subset].sort_values(by='sentence')
print(df_dup_out.to_string())

# Alternative view (disabled): counts per sentence / word / level / system
#print(df_dup_out.value_counts(subset=['sentence','target_word','target_level', 'system_id'], sort=True).to_string())

Outputs that are duplicates:
     split                                                                 sentence target_word target_level                        context_sentence  system_id
1225   val  ■ドーナツフライオイルにはメーカーと共同開発した植物性のオイルを使用することで、油分の吸収を抑え、ベタつかずサッパリと食べやすく仕上げました。        サッパリ           N3       また聞こえてくる君尾の笑い声、はなやかで明るくてサッパリしている。          1
2160   val  ■ドーナツフライオイルにはメーカーと共同開発した植物性のオイルを使用することで、油分の吸収を抑え、ベタつかずサッパリと食べやすく仕上げました。        サッパリ           N5       また聞こえてくる君尾の笑い声、はなやかで明るくてサッパリしている。          1
1611   val                                             あの会社との関係を築くために、多くの時間を費やしました。          関係           N3   あなたの御関係なすっておいでになる男の事を、ある偶然の機会で承知しました。          3
2546   val                                             あの会社との関係を築くために、多くの時間を費やしました。          関係           N5   あなたの御関係なすっておいでになる男の事を、ある偶然の機会で承知しました。          3
1845   val                                                   このお茶はサッパリしていて、夏にぴったりだ。        サッパリ           N3       また聞こえてくる君尾の笑い声、はなやかで明るくてサッパリしている。          3
910    val 

In [15]:
# Number of output rows each system contributed to combined_df.
combined_df.loc[:, 'system_id'].value_counts()

system_id
1    945
2    933
3    922
Name: count, dtype: int64

In [16]:
# All output rows whose sentence also appears in the corpus, with every column.
combined_df.loc[dup]

Unnamed: 0,sentence,index,index_hits,level,level_score,sense_score,quality_score,sentence_docs,parse_tree,tokenized,...,lexical_div_score,div_score,total_score,system_id,context_sentence,target_word,target_level,block_id,random_ordering,split
0,外交経験が無い素人2人組の外交は半年が空費され、相手から一方的に条件を呑まされる寸前になり失...,1684583.0,,N1,1.0,0.725103,0.862552,外交経験が無い素人2人組の外交は半年が空費され、相手から一方的に条件を呑まされる寸前になり失...,(ADJ_ROOT (NOUN_nsubj NOUN_compound ADP_case)),"['外交', '経験', 'が', '無い', '素人', '2', '人組', 'の', ...",...,0.939792,0.969896,0.916224,1,また、東西お互いに相手を非難するプロパガンダ放送を流し合っていた。,相手,N1,0.0,E,test
1,ドイツの戦略爆撃機とイギリス、アメリカの戦略爆撃機の合計の多い国家が少ない国家（ドイツ又はイ...,5505912.0,,N1,1.0,0.728948,0.864474,ドイツの戦略爆撃機とイギリス、アメリカの戦略爆撃機の合計の多い国家が少ない国家（ドイツ又はイ...,(NOUN_ROOT\n (PROPN_nmod\n (PROPN_nmod\n ...,"['ドイツ', 'の', '戦略', '爆撃機', 'と', 'イギリス', '、', 'ア...",...,0.890657,0.945328,0.904901,1,また、東西お互いに相手を非難するプロパガンダ放送を流し合っていた。,相手,N1,0.0,E,test
2,これはその言葉を発した側が、その発言を持って相手を貶めようとしているためである。,1508529.0,,N1,1.0,0.773248,0.886624,これはその言葉を発した側が、その発言を持って相手を貶めようとしているためである。,(NOUN_ROOT\n (VERB_acl\n (PRON_nsubj ADP_c...,"['これ', 'は', 'その', '言葉', 'を', '発し', 'た', '側', '...",...,0.884993,0.915592,0.901108,1,また、東西お互いに相手を非難するプロパガンダ放送を流し合っていた。,相手,N1,0.0,E,test
3,カトコフの主張は、一般的に穏健なものではあったが、ひとたびや筆を執るや否や、痛烈に相手を批判...,3917741.0,,N1,1.0,0.778942,0.889471,カトコフの主張は、一般的に穏健なものではあったが、ひとたびや筆を執るや否や、痛烈に相手を批判...,(VERB_ROOT\n (NOUN_advcl\n (NOUN_nsubj (NO...,"['カトコフ', 'の', '主張', 'は', '、', '一般的', 'に', '穏健'...",...,0.871871,0.889312,0.889392,1,また、東西お互いに相手を非難するプロパガンダ放送を流し合っていた。,相手,N1,0.0,E,test
4,また、両派ともに相手の絶滅を主張し、小型の出刃包丁やハンマーなどを使用した襲撃を続けたため逮...,1694412.0,,N1,1.0,0.781091,0.890545,また、両派ともに相手の絶滅を主張し、小型の出刃包丁やハンマーなどを使用した襲撃を続けたため逮...,(VERB_ROOT\n (VERB_obl\n CCONJ_cc\n PUN...,"['また', '、', '両派', 'とも', 'に', '相手', 'の', '絶滅', ...",...,0.866260,0.860982,0.875764,1,また、東西お互いに相手を非難するプロパガンダ放送を流し合っていた。,相手,N1,0.0,E,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2175,立つと足の裏の上です。,116062.0,,N5,1.0,0.722260,0.861130,立つと足の裏の上です。,(NOUN_ROOT\n (VERB_acl SCONJ_mark)\n (NOUN_n...,"['立つ', 'と', '足', 'の', '裏', 'の', '上', 'です', '。']",...,0.964286,0.859808,0.860469,1,ハードルに手や足をかけても問題ない。,足,N5,,,val
2176,車で長時間移動する時は、時々止まって足を伸ばさなければいけません。,329759.0,,N5,1.0,0.760393,0.880197,車で長時間移動する時は、時々止まって足を伸ばさなければいけません。,(VERB_ROOT\n (NOUN_obl\n (VERB_acl (NOUN_o...,"['車', 'で', '長時間', '移動', 'する', '時', 'は', '、', '...",...,0.947443,0.831914,0.856055,1,ハードルに手や足をかけても問題ない。,足,N5,,,val
2177,座って足を組んでみて。,366685.0,,N5,1.0,0.761630,0.880815,座って足を組んでみて。,(VERB_ROOT\n (VERB_advcl (VERB_advcl SCONJ_ma...,"['座っ', 'て', '足', 'を', '組ん', 'で', 'み', 'て', '。']",...,0.914468,0.803476,0.842146,1,ハードルに手や足をかけても問題ない。,足,N5,,,val
2178,寝てるときにさ、足がガクンってなる時ない？,311716.0,,N5,1.0,0.691118,0.845559,寝てるときにさ、足がガクンってなる時ない？,(ADJ_ROOT\n (NOUN_obl\n (VERB_acl\n (...,"['寝', 'てる', 'とき', 'に', 'さ', '、', '足', 'が', 'ガク...",...,0.910281,0.794028,0.819794,1,ハードルに手や足をかけても問題ない。,足,N5,,,val


In [17]:
# Sanity check: exactly 10 target rows are flagged with `is_test_reduced`.
only_test = df_target['is_test_reduced']
test_targets = df_target[only_test]
assert len(test_targets) == 10

In [18]:
#test = combined_df.groupby(by=['target_level','target_word','context_sentence', 'system_id'], group_keys=False).apply(lambda x: x)

In [19]:
#combined_df.groupby(by=['target_level','target_word','context_sentence', 'system_id']).get_group(('N1', target_word, context_sentence, 1))