In [3]:
import utils
from utils import *
import generation
import pandas as pd
import numpy as np 
import spacy
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI
import time

In [4]:
# --- Runtime setup ----------------------------------------------------------
# Japanese spaCy pipeline (GiNZA).
nlp = spacy.load("ja_ginza")
# Load variables from the nearest .env file into the process environment
# before the OpenAI client is created.
_ = load_dotenv(find_dotenv())
client = OpenAI()

# Alternative client bound to the Aizawalab organization:
# client = OpenAI(
#   organization='org-IMH6kOvtOVHYERaByCYIIjCK',
# )

# --- Paths ------------------------------------------------------------------
# NOTE(review): these are absolute, machine-specific paths, while the
# guidelines/example files below are read through relative paths.
target_filename = '/data/enrico_benedetti/nihongoexample/data/targets/target_words.csv'
# One output directory per system; the order defines systems 1, 2 and 3.
system_dirs = ['/data/enrico_benedetti/nihongoexample/evaluation/outputs/retrieval/', '/data/enrico_benedetti/nihongoexample/evaluation/outputs/generation/llm_jp/', '/data/enrico_benedetti/nihongoexample/evaluation/outputs/generation/chatgpt/']
output_dir = "/data/enrico_benedetti/nihongoexample/evaluation/outputs/all"

# --- Target words -----------------------------------------------------------
df_target = pd.read_csv(target_filename)
# Column used as a row mask selecting the reduced test set.
only_test = df_target['is_test_reduced']
df_target_test = df_target[only_test]
assert len(df_target_test) == 10, (
    f"expected 10 reduced-test target words, found {len(df_target_test)}"
)
# Number of sentences kept per system output file.
k = 5

# --- Prompt building blocks -------------------------------------------------
# Both files are read as UTF-8 explicitly: the guidelines contain Japanese
# text, so relying on the platform default encoding is not portable.
with open('../../evaluation/guidelines.txt', 'r', encoding='utf-8') as file:
    guidelines = file.read()

with open("../../evaluation/example_output.txt", 'r', encoding='utf-8') as file:
    example_output = file.read()

gpt_model = "gpt-3.5-turbo"
# gpt_model = "gpt-4"



In [5]:
# Target levels covered by the evaluation.
target_levels_test = ['N1', 'N3', 'N5']

# Collect k sentences for every (level, target word, system) combination.
# The nesting order level -> word -> system determines the row order of
# combined_df and is kept exactly as before.
dfs = []
for target_level in target_levels_test:
    for target_word in df_target_test['target_word']:
        for system_dir in system_dirs:
            # Each system stores one file per word and level:
            # <system_dir><target_word>_<target_level>_.csv
            sentence_file = f"{system_dir}{target_word}_{target_level}_.csv"
            df = pd.read_csv(sentence_file)
            # Skip row 0 and keep the next k rows (the written sentences).
            # NOTE(review): a file with fewer than k + 1 rows contributes
            # fewer rows without any error -- confirm this cannot happen.
            dfs.append(df.iloc[1:k+1])

combined_df = pd.concat(dfs, ignore_index=True)


In [90]:
# Quick inspection of the block helpers on the first annotation block.
# NOTE(review): the meaning of the (1, 3) arguments is defined by
# get_sentences_block; sys_out is not used by the cells below.
sys_out = get_sentences_block(combined_df, 1, 3)

# Print the 'sentence' column of the 's1' entry of block 1. The original loop
# over range(1, 31) stopped after its first iteration, so only block 1 was
# ever looked up.
block_id = 1
block_info = get_full_block_info(combined_df, block_id)
print(block_info['s1']['sentence'])

5    東京と大阪はライバル同士であるため、それぞれの地域では互いに相手を非難するプロパガンダ放送を...
6             これまでずっと、両国間では互いに相手を非難するプロパガンダ放送を流し合ってきた。
7                     彼と彼女の関係がうまくいかないときは、私たちは常に相手を責める。
8                   だから、彼らは互いに相手を非難するプロパガンダ放送を流し合っていた。
9                             私たちは互いに相手を尊重し合わなければならない。
Name: sentence, dtype: object


In [8]:
def get_gpt_prompt(block_info=None, combined_df=None, block_id=None):
    """Build the complete rating prompt for one annotation block.

    The prompt is the guidelines text, followed by the block to evaluate
    (target word, target level, context sentence and the sentences of
    systems 1 to 3), followed by the expected output structure.

    Args:
        block_info: Description of the block. It is read through the keys
            'target_word', 'target_level', 'context_sentence' and
            's1'/'s2'/'s3'; each 's<i>' entry must expose a 'sentence'
            column supporting ``.to_list()``.
        combined_df: Frame handed to ``get_full_block_info``; only needed
            when the block is looked up by ``block_id``.
        block_id: Block identifier handed to ``get_full_block_info``. When
            given together with ``combined_df`` it takes precedence over
            ``block_info``.

    Returns:
        The prompt as a single string.

    Raises:
        ValueError: If neither ``block_info`` nor the pair
            ``combined_df`` + ``block_id`` is supplied.
    """
    if block_id and combined_df is not None:
        block_info = get_full_block_info(combined_df, block_id)
    elif block_info is None:
        # Explicit exception instead of `assert`, which is removed under -O.
        raise ValueError(
            "get_gpt_prompt needs either block_info or both combined_df and block_id"
        )
    # Otherwise fall through with the caller-supplied block_info, including
    # the case where block_id was given without a combined_df to look it up in.

    header = (
        f"Target word: {block_info['target_word']}; "
        f"target level: {block_info['target_level']}; "
        f"context sentence: {block_info['context_sentence']}"
    )

    # One section per system, in the fixed order 1, 2, 3.
    sections = []
    for idx in (1, 2, 3):
        sentences = block_info[f"s{idx}"]['sentence'].to_list()
        sections.append(f"\nSystem {idx} sentences:\n{format_sentences_gpt(sentences)}")
    block_text = header + "".join(sections)

    return (
        guidelines
        + '\nBlock to evaluate:\n'
        + block_text
        + "\nFollowing is the output structure:\n"
        + example_output
    )

# Preview the prompt of one sample block: its length in characters first,
# then the full text.
block_id = 21
gpt_prompt = get_gpt_prompt(block_id=block_id, combined_df=combined_df)
print(len(gpt_prompt), gpt_prompt, sep="\n")

3044
This evaluation aims to rate and compare three systems in providing good example sentences for learners of Japanese at different proficiency levels.
An annotation block consists of proposed sentences by 3 systems for a target word, a context sentence and a target difficulty level.
The lists of sentences are supposed to help language learners to see diverse examples of a target word in context.

Difficulty: Rate the difficulty of each sentence according to the JLPT (Japanese Language Proficiency Test) scale, where N1 is the most difficult and N5 is the easiest.
Indicate which level a sentence belongs to (one of N1, N2, N3, N4, N5). It is possible that for the target level, the system proposes a sentence that is of a different level (higher or lower).
Below is a summary of the proficiency levels.
Level Description Summary Example
N1 Complex and abstract Japanese across various contexts. 「現代社会において、グローバル化の進展が国際的なコミュニケーションの重要性を高めており、異文化間の相互理解が求められている。」
N2 Everyday Japanese in varied si

In [9]:
# Trial request for the single prompt built above; three choices are requested.
completion = client.chat.completions.create(
    model=gpt_model,
    temperature=0.0,
    seed=42,
    n=3,
    messages=[{"role": "user", "content": gpt_prompt}],
)

# Concatenate every returned choice (no separator, as before). A choice whose
# content is None is treated as empty text instead of breaking the join.
response = "".join(choice.message.content or "" for choice in completion.choices)

# A finish_reason of 'length' means generation stopped at the token limit.
# Every choice is checked, not only the first one.
if any(choice.finish_reason == 'length' for choice in completion.choices):
    print("Warning: Output may be incomplete due to token limit.")
    warning = "Warning: Output may be incomplete due to token limit.\n"
else:
    warning = ""

In [11]:
# Show the concatenated text of the choices returned by the trial request.
print(response)
# Disabled: append the warning, the prompt and the response to a per-block file.
# with open(f"../../evaluation/annotation/gpt/{gpt_model}_{block_id}.txt", 'a') as file:
#     file.writelines([warning, gpt_prompt, "\n===========RESPONSE===========\n", response])

System 1 sentences:
1. Difficulty: N5; Sense: similar; Reject: No
2. Difficulty: N5; Sense: not similar; Reject: No
3. Difficulty: N5; Sense: not similar; Reject: No
4. Difficulty: N5; Sense: not similar; Reject: No
5. Difficulty: N5; Sense: not similar; Reject: No
Syntactic similarity: Low

System 2 sentences:
1. Difficulty: N4; Sense: similar; Reject: No
2. Difficulty: N4; Sense: similar; Reject: No
3. Difficulty: N4; Sense: similar; Reject: No
4. Difficulty: N4; Sense: similar; Reject: No
5. Difficulty: N4; Sense: similar; Reject: No
Syntactic similarity: Medium

System 3 sentences:
1. Difficulty: N3; Sense: not similar; Reject: No
2. Difficulty: N3; Sense: similar; Reject: No
3. Difficulty: N4; Sense: similar; Reject: No
4. Difficulty: N4; Sense: similar; Reject: No
5. Difficulty: N4; Sense: similar; Reject: No
Syntactic similarity: Medium

System ranking: System 2 > System 3 > System 1

Comment: System 2 provides the most diverse and useful example sentences for the target word at

In [16]:
utils.fix_reproducibility()

# Client bound to the Aizawalab organization for the full run.
# NOTE(review): the organization ID is hard-coded here; consider supplying it
# through the environment instead of committing it to the notebook.
client = OpenAI(
    organization='org-IMH6kOvtOVHYERaByCYIIjCK',
)
gpt_model = "gpt-4-0125-preview"

# Rate every block: each prompt is the guidelines followed by the block in a
# fixed format (see get_gpt_prompt).
block_ids = range(1, 31)
# block_ids = [1]
n = 1  # number of choices requested per block
for block_id in block_ids:
    gpt_prompt = get_gpt_prompt(combined_df=combined_df, block_id=block_id)
    print(f"Requesting for {block_id}")
    completion = client.chat.completions.create(
        model=gpt_model,
        temperature=0.0,
        seed=42,
        n=n,
        max_tokens=1000,
        messages=[{"role": "user", "content": gpt_prompt}],
    )

    # Concatenate all returned choices; a None content counts as empty text.
    response = "".join(choice.message.content or "" for choice in completion.choices)

    # Flag the saved file when any choice was cut off by the token limit.
    if any(choice.finish_reason == 'length' for choice in completion.choices):
        print("Warning: Output may be incomplete due to token limit.")
        warning = "Warning: Output may be incomplete due to token limit.\n"
    else:
        warning = ""

    # Append to the per-block file (re-running the cell adds another record).
    # UTF-8 is set explicitly because prompt and response contain Japanese text.
    with open(f"../../evaluation/annotation/gpt/{gpt_model}_{block_id}.txt", 'a', encoding='utf-8') as file:
        file.writelines([warning, gpt_prompt, "\n===========RESPONSE===========\n", response, '\n+++++END_RESPONSE+++++\n'])
# The saved responses are inserted into sheets manually afterwards.

Requesting for 1
Requesting for 2
Requesting for 3
Requesting for 4
Requesting for 5
Requesting for 6
Requesting for 7
Requesting for 8
Requesting for 9
Requesting for 10
Requesting for 11
Requesting for 12
Requesting for 13
Requesting for 14
Requesting for 15
Requesting for 16
Requesting for 17
Requesting for 18
Requesting for 19
Requesting for 20
Requesting for 21
Requesting for 22
Requesting for 23
Requesting for 24
Requesting for 25
Requesting for 26
Requesting for 27
Requesting for 28
Requesting for 29
Requesting for 30


In [None]:
# Number of characters in the guidelines text read from guidelines.txt.
len(guidelines)