In [1]:
import utils
from utils import *
import pandas as pd
import numpy as np
import random
import gspread
import time

In [2]:
# Build evaluation-friendly DataFrame(s): the per-word sentence files written by each
# system (they include the test words) are read and merged into one big frame with
# extra bookkeeping columns.
utils.fix_reproducibility()  # presumably seeds the RNGs used below -- TODO confirm in utils

target_filename = '/data/enrico_benedetti/nihongoexample/data/targets/target_words.csv'
# One output directory per system, in a fixed order: retrieval, llm_jp generation,
# chatgpt generation.
system_dirs = [
    '/data/enrico_benedetti/nihongoexample/evaluation/outputs/retrieval/',
    '/data/enrico_benedetti/nihongoexample/evaluation/outputs/generation/llm_jp/',
    '/data/enrico_benedetti/nihongoexample/evaluation/outputs/generation/chatgpt/',
]
output_dir = "/data/enrico_benedetti/nihongoexample/evaluation/outputs/all"

# Target words table; read with pandas' default CSV settings.
df_target = pd.read_csv(target_filename)

In [3]:
# For every (target level, system, target word) combination, read the matching sentence
# file, tag its rows with bookkeeping columns, write the tagged file back in place, and
# keep the head of each file. All kept heads are concatenated into `combined_df`.
target_levels = ['N1', 'N2', 'N3', 'N4', 'N5']
dfs = []
missing_files = []  # expected sentence files that do not exist on disk
k=5
for target_level in target_levels:

    # system_id is the 1-based position of the directory in system_dirs
    for system_id, system_dir in enumerate(system_dirs, start=1):

        for _, data in df_target.iterrows():

            target_word = data['target_word']
            context_sentence = data['context_sentence']
            # the file is named like <system_dir><target_word>_<target_level>_.csv
            sentence_file = f"{system_dir}{target_word}_{target_level}_.csv"
            try:
                # note: the retrieval files have different columns than the generation ones
                df = pd.read_csv(sentence_file)
            except FileNotFoundError:
                # Not every combination has a file; remember which ones were skipped
                # instead of dropping them silently.
                missing_files.append(sentence_file)
                continue

            df['system_id'] = system_id
            df['context_sentence'] = context_sentence
            df['target_word'] = target_word
            df['target_level'] = target_level
            # the annotated frame overwrites the file it was read from
            df.to_csv(sentence_file, index=False)
            # All systems store the context sentence in the first row, so keep that row
            # plus up to k sentences (k + 1 rows; same rows as the label slice .loc[:k]).
            dfs.append(df.iloc[:k + 1])

print(f"{len(dfs)} sentence files read, {len(missing_files)} expected files not found")
combined_df = pd.concat(dfs, ignore_index=True)

In [6]:
# number of rows kept from each sentence file
lens = list(map(len, dfs))

In [7]:
# How many files contributed each number of rows.
df_info = pd.DataFrame(lens, columns=['len'])
df_info.value_counts()

len
6      679
2        4
5        4
3        3
4        3
Name: count, dtype: int64

In [8]:
# Spot-check one of the collected per-file frames (hard-coded position in `dfs`).
dfs[688]

Unnamed: 0,sentence,target_word,context_sentence,target_level,system_id
0,自分の将来の目的に向かってバリバリ勉強していた。,バリバリ,自分の将来の目的に向かってバリバリ勉強していた。,N5,3
1,バリバリ頑張って掃除をしています。,バリバリ,自分の将来の目的に向かってバリバリ勉強していた。,N5,3
2,彼はバリバリ運動していて、とても体が鍛えられています。,バリバリ,自分の将来の目的に向かってバリバリ勉強していた。,N5,3
3,私はバリバリ日本語を勉強しています。,バリバリ,自分の将来の目的に向かってバリバリ勉強していた。,N5,3
4,彼女はバリバリ仕事をこなす能力があります。,バリバリ,自分の将来の目的に向かってバリバリ勉強していた。,N5,3
5,あの人はバリバリ料理が得意なようです。,バリバリ,自分の将来の目的に向かってバリバリ勉強していた。,N5,3


In [4]:
# Load the "train" split of the sentence corpus as a pandas DataFrame, so that the
# generated outputs can be checked for not being in the dataset.
corpus = load_dataset("bennexx/jp_sentences")['train'].to_pandas()

In [5]:
# Boolean mask over combined_df: True where the sentence also occurs verbatim in the corpus.
dup = combined_df.loc[:, 'sentence'].isin(
    corpus.loc[:, 'sentence']
)

In [6]:
# Number of collected rows contributed by each system.
combined_df['system_id'].value_counts()

system_id
1    1890
2    1122
3    1111
Name: count, dtype: int64

In [7]:
# Check for rows without a system_id. NaN never compares equal to anything (not even to
# itself), so `== np.nan` is always False and can never find them; isna() does.
sys_mask = combined_df['system_id'].isna()
sys_mask.value_counts()

system_id
False    4123
Name: count, dtype: int64

In [8]:
# Rows of combined_df selected by the `dup` boolean mask.
combined_df[dup]

Unnamed: 0,sentence,index,index_hits,level,level_score,sense_score,quality_score,sentence_docs,parse_tree,tokenized,syntax_div_score,lexical_div_score,div_score,total_score,system_id,context_sentence,target_word,target_level,block_id,random_ordering
1,外交経験が無い素人2人組の外交は半年が空費され、相手から一方的に条件を呑まされる寸前になり失...,1684583.0,,N1,1.0,0.725103,0.862552,外交経験が無い素人2人組の外交は半年が空費され、相手から一方的に条件を呑まされる寸前になり失...,(ADJ_ROOT (NOUN_nsubj NOUN_compound ADP_case)),"['外交', '経験', 'が', '無い', '素人', '2', '人組', 'の', ...",1.000000,0.939792,0.969896,0.916224,1,また、東西お互いに相手を非難するプロパガンダ放送を流し合っていた。,相手,N1,0.0,E
2,ドイツの戦略爆撃機とイギリス、アメリカの戦略爆撃機の合計の多い国家が少ない国家（ドイツ又はイ...,5505912.0,,N1,1.0,0.728948,0.864474,ドイツの戦略爆撃機とイギリス、アメリカの戦略爆撃機の合計の多い国家が少ない国家（ドイツ又はイ...,(NOUN_ROOT\n (PROPN_nmod\n (PROPN_nmod\n ...,"['ドイツ', 'の', '戦略', '爆撃機', 'と', 'イギリス', '、', 'ア...",1.000000,0.890657,0.945328,0.904901,1,また、東西お互いに相手を非難するプロパガンダ放送を流し合っていた。,相手,N1,0.0,E
3,これはその言葉を発した側が、その発言を持って相手を貶めようとしているためである。,1508529.0,,N1,1.0,0.773248,0.886624,これはその言葉を発した側が、その発言を持って相手を貶めようとしているためである。,(NOUN_ROOT\n (VERB_acl\n (PRON_nsubj ADP_c...,"['これ', 'は', 'その', '言葉', 'を', '発し', 'た', '側', '...",0.946191,0.884993,0.915592,0.901108,1,また、東西お互いに相手を非難するプロパガンダ放送を流し合っていた。,相手,N1,0.0,E
4,カトコフの主張は、一般的に穏健なものではあったが、ひとたびや筆を執るや否や、痛烈に相手を批判...,3917741.0,,N1,1.0,0.778942,0.889471,カトコフの主張は、一般的に穏健なものではあったが、ひとたびや筆を執るや否や、痛烈に相手を批判...,(VERB_ROOT\n (NOUN_advcl\n (NOUN_nsubj (NO...,"['カトコフ', 'の', '主張', 'は', '、', '一般的', 'に', '穏健'...",0.906753,0.871871,0.889312,0.889392,1,また、東西お互いに相手を非難するプロパガンダ放送を流し合っていた。,相手,N1,0.0,E
5,また、両派ともに相手の絶滅を主張し、小型の出刃包丁やハンマーなどを使用した襲撃を続けたため逮...,1694412.0,,N1,1.0,0.781091,0.890545,また、両派ともに相手の絶滅を主張し、小型の出刃包丁やハンマーなどを使用した襲撃を続けたため逮...,(VERB_ROOT\n (VERB_obl\n CCONJ_cc\n PUN...,"['また', '、', '両派', 'とも', 'に', '相手', 'の', '絶滅', ...",0.855703,0.866260,0.860982,0.875764,1,また、東西お互いに相手を非難するプロパガンダ放送を流し合っていた。,相手,N1,0.0,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3372,立つと足の裏の上です。,116062.0,,N5,1.0,0.722260,0.861130,立つと足の裏の上です。,(NOUN_ROOT\n (VERB_acl SCONJ_mark)\n (NOUN_n...,"['立つ', 'と', '足', 'の', '裏', 'の', '上', 'です', '。']",0.755330,0.964286,0.859808,0.860469,1,ハードルに手や足をかけても問題ない。,足,N5,,
3373,車で長時間移動する時は、時々止まって足を伸ばさなければいけません。,329759.0,,N5,1.0,0.760393,0.880197,車で長時間移動する時は、時々止まって足を伸ばさなければいけません。,(VERB_ROOT\n (NOUN_obl\n (VERB_acl (NOUN_o...,"['車', 'で', '長時間', '移動', 'する', '時', 'は', '、', '...",0.716384,0.947443,0.831914,0.856055,1,ハードルに手や足をかけても問題ない。,足,N5,,
3374,座って足を組んでみて。,366685.0,,N5,1.0,0.761630,0.880815,座って足を組んでみて。,(VERB_ROOT\n (VERB_advcl (VERB_advcl SCONJ_ma...,"['座っ', 'て', '足', 'を', '組ん', 'で', 'み', 'て', '。']",0.692484,0.914468,0.803476,0.842146,1,ハードルに手や足をかけても問題ない。,足,N5,,
3375,寝てるときにさ、足がガクンってなる時ない？,311716.0,,N5,1.0,0.691118,0.845559,寝てるときにさ、足がガクンってなる時ない？,(ADJ_ROOT\n (NOUN_obl\n (VERB_acl\n (...,"['寝', 'てる', 'とき', 'に', 'さ', '、', '足', 'が', 'ガク...",0.677775,0.910281,0.794028,0.819794,1,ハードルに手や足をかけても問題ない。,足,N5,,


In [9]:
# Mask taken from the 'is_test_reduced' column; it selects the target words used for
# the evaluation sheets.
only_test = df_target['is_test_reduced']
# sanity check: exactly 10 target words are expected in the selection
assert len(df_target.loc[only_test]) == 10

In [11]:
# get the spreadsheets
# service_account() is called without arguments, so gspread falls back to its default
# credentials lookup.
gc = gspread.service_account()
# Open the evaluation spreadsheet by title, then pick the worksheet named "10_blocks_template".
sh = gc.open("Evaluation-sheet-jp-v3.0")
template_sheet = sh.worksheet("10_blocks_template")


In [102]:
# block_id_cells = template_sheet.findall(re.compile(r'^Block ID$'))
# block_id_cells
# cell = block_id_cells[0]
# cell.row
# use it as pivot. no trust

In [12]:
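# get the target words that we want to test
# Legacy version for sheet layout 2.0 (one worksheet per level, several blocks per sheet); kept commented out for reference.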
# utils.fix_reproducibility()
# k = 5
# df_target_test = df_target[only_test]
# start_row = 1
# block_interval = 20 # every 20 lines there is a new block

# dfs = []
# # get the target_levels
# target_levels_test = ['N1', 'N3', 'N5']
# level_sheets = [sh.worksheet(l) for l in target_levels_test]
# #level_sheets = [template_sheet]

# for level_id, target_level in enumerate(target_levels_test):
    
#     sheet = level_sheets[level_id]

#     for i, (_, data) in enumerate(df_target_test.iterrows()):


#         # get the data : very important to do it now
#         target_word = data['target_word']
#         context_sentence = data['context_sentence']
#         # assign a block id and stuff
#         # start_col = block_id_cells[i].col - 3
#         start_col = 'A'
#         block_id = i + len(df_target_test) * level_id

#         # write i under the block id, as well as target stuff
#         row_info = start_row + i * block_interval + 1
#         sheet.update(values=[[context_sentence, target_level, target_word, block_id]],
#                                range_name=f"A{row_info}:D{row_info}")
        

#         # start the randomization
#         ordering = ['A', 'E', 'I']
#         random.shuffle(ordering)

#         for system_id, system_dir in enumerate(system_dirs, start=1):

#             # read the file which is like sys_dir + tw_tl_.csv
#             sentence_file = f"{system_dir}{target_word}_{target_level}_.csv"
#             df = pd.read_csv(sentence_file) # damn retrieval is different format
#             # remove context sentence from sentence column... will need to change the scoring function... or at least the output.
#             # actually no, all systems have it in first position, just the other column types that is different
#             df['system_id'] = system_id
#             df['context_sentence'] = context_sentence
#             df['target_word'] = target_word
#             df['target_level'] = target_level
#             df['block_id'] = block_id
#             # assign the random column among A, E, I - 1, 2, 3
#             random_ordering_letter = ordering[system_id-1]
#             df['random_ordering'] = random_ordering_letter
#             df.to_csv(sentence_file, index=False)
#             dfs.append(df)

#             # write on template sheet in random order
#             sentences = df['sentence'][1:k+1].to_list() # take 5 sentences
#             # get range
#             row_sentences = start_row + 3 + 1 + i * block_interval
#             #print(random_ordering_letter, row_sentences, random_ordering_letter, row_sentences+k)
#             sheet.update(values=[sentences],
#                 range_name=f"{random_ordering_letter}{row_sentences}",
#                 major_dimension="COLUMNS")
#     # after writing for one sheet, take a break of 1 min
#     time.sleep(63)        
    


In [18]:
## this is now for writing 30 sheets each with one block
# For every (test level, test target word) pair: duplicate the block template into a new
# worksheet named after the block id, write the block header, then write the sentences of
# each system into one of the columns A / E / I, assigned in random order.
# The annotated per-system frames are written back to their files and collected in `dfs`.
utils.fix_reproducibility()
k = 5
df_target_test = df_target[only_test]
start_row = 1
block_interval = 20 # every 20 lines there is a new block (not used in this cell)

dfs = []
# get the target_levels
target_levels_test = ['N1', 'N3', 'N5']

# Throttling: each block issues 5 spreadsheet requests (1 duplicate + 4 updates), so the
# loop pauses for about a minute after every `blocks_per_pause` blocks.
# NOTE(review): presumably this keeps the run under the Sheets API per-minute write quota -- confirm.
blocks_per_pause = 10
pause_seconds = 63
n_blocks_total = len(target_levels_test) * len(df_target_test)

# each will be a copy of this, plus the pasted information
template_sheet = sh.worksheet("block_template")

for level_id, target_level in enumerate(target_levels_test):

    for i, (_, data) in enumerate(df_target_test.iterrows()):

        # get the data : very important to do it now
        target_word = data['target_word']
        context_sentence = data['context_sentence']
        # block ids are 1-based and run consecutively across levels
        block_id = i + 1 + len(df_target_test) * level_id

        # get sheet here based on block id
        sheet = template_sheet.duplicate(new_sheet_name=f"{block_id}", insert_sheet_index=3+block_id)
        # write i under the block id, as well as target stuff
        row_info = start_row + 1
        sheet.update(values=[[context_sentence, target_level, target_word, block_id]],
                               range_name=f"A{row_info}:D{row_info}")

        # start the randomization: which system ends up in which column
        ordering = ['A', 'E', 'I']
        random.shuffle(ordering)

        for system_id, system_dir in enumerate(system_dirs, start=1):

            # read the file which is like sys_dir + tw_tl_.csv
            sentence_file = f"{system_dir}{target_word}_{target_level}_.csv"
            # note: the retrieval files have different columns than the generation ones,
            # but all systems keep the context sentence in the first row
            df = pd.read_csv(sentence_file)
            df['system_id'] = system_id
            df['context_sentence'] = context_sentence
            df['target_word'] = target_word
            df['target_level'] = target_level
            df['block_id'] = block_id
            # assign the random column among A, E, I - 1, 2, 3
            random_ordering_letter = ordering[system_id-1]
            df['random_ordering'] = random_ordering_letter
            # the annotated frame overwrites the file it was read from
            df.to_csv(sentence_file, index=False)
            dfs.append(df)

            # write on template sheet in random order
            # skip row 0 (the context sentence) and take the next k sentences
            sentences = df['sentence'][1:k+1].to_list()
            # get range
            row_sentences = start_row + 3 + 1
            sheet.update(values=[sentences],
                range_name=f"{random_ordering_letter}{row_sentences}",
                major_dimension="COLUMNS")

        # After every `blocks_per_pause` blocks take a break of about 1 min. block_id is
        # 1-based, so the pause points are 10, 20, ...; no pause after the very last block.
        if block_id % blocks_per_pause == 0 and block_id < n_blocks_total:
            time.sleep(pause_seconds)
    


  sheet.update(values=[[context_sentence, target_level, target_word, block_id]],
  sheet.update(values=[sentences],
  sheet.update(values=[sentences],
  sheet.update(values=[sentences],
  sheet.update(values=[[context_sentence, target_level, target_word, block_id]],
  sheet.update(values=[sentences],
  sheet.update(values=[sentences],
  sheet.update(values=[sentences],
  sheet.update(values=[[context_sentence, target_level, target_word, block_id]],
  sheet.update(values=[sentences],
  sheet.update(values=[sentences],
  sheet.update(values=[sentences],
  sheet.update(values=[[context_sentence, target_level, target_word, block_id]],
  sheet.update(values=[sentences],
  sheet.update(values=[sentences],
  sheet.update(values=[sentences],
  sheet.update(values=[[context_sentence, target_level, target_word, block_id]],
  sheet.update(values=[sentences],
  sheet.update(values=[sentences],
  sheet.update(values=[sentences],
  sheet.update(values=[[context_sentence, target_level, target_word, b

In [47]:
# Regroup combined_df by (level, word, context sentence, system) with an identity apply.
test = (
    combined_df
    .groupby(by=['target_level', 'target_word', 'context_sentence', 'system_id'])
    .apply(lambda group: group)
)

In [21]:
# Show the rows of one group: level 'N1', system_id 3, for a given word / context sentence.
# NOTE(review): target_word and context_sentence are not defined in this cell; they hold
# whatever values the most recently executed loop left behind, so the group displayed
# depends on execution order -- consider pinning them explicitly.
combined_df.groupby(by=['target_level','target_word','context_sentence', 'system_id']).get_group(('N1', target_word, context_sentence, 3))

Unnamed: 0,sentence,index,index_hits,level,level_score,sense_score,quality_score,sentence_docs,parse_tree,tokenized,syntax_div_score,lexical_div_score,div_score,total_score,system_id,context_sentence,target_word,target_level,block_id,random_ordering
1082,作家と読者は、もういちど全然あたらしく地割りの協定をやり直す必要がある。,,,,,,,,,,,,,,3,作家と読者は、もういちど全然あたらしく地割りの協定をやり直す必要がある。,全然,N1,9.0,I
1083,彼女は全然教えたくなかったが、彼に自分の秘密を打ち明けた。,,,,,,,,,,,,,,3,作家と読者は、もういちど全然あたらしく地割りの協定をやり直す必要がある。,全然,N1,9.0,I
1084,昨日の試合は全然予想外の結果だった。,,,,,,,,,,,,,,3,作家と読者は、もういちど全然あたらしく地割りの協定をやり直す必要がある。,全然,N1,9.0,I
1085,その映画は全然感動しなかった。,,,,,,,,,,,,,,3,作家と読者は、もういちど全然あたらしく地割りの協定をやり直す必要がある。,全然,N1,9.0,I
1086,新しいレストランは全然味気ない料理しか提供していなかった。,,,,,,,,,,,,,,3,作家と読者は、もういちど全然あたらしく地割りの協定をやり直す必要がある。,全然,N1,9.0,I
1087,彼は全然遅刻しないと言っていたが、いつも遅れてくる。,,,,,,,,,,,,,,3,作家と読者は、もういちど全然あたらしく地割りの協定をやり直す必要がある。,全然,N1,9.0,I
