## Imports

In [3]:
import json
from openai import OpenAI
from tqdm import tqdm
from collections import Counter
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction, sentence_bleu
import pandas as pd

## Load dataset

In [4]:
import json
import os

def is_inside_frame(text, frame):
    text_center_x = text["x"] + text["w"] / 2
    text_center_y = text["y"] + text["h"] / 2
    return (
        frame["x"] <= text_center_x <= frame["x"] + frame["w"] and
        frame["y"] <= text_center_y <= frame["y"] + frame["h"]
    )

# Load the JSON data
with open(r".\open-mantra-dataset\annotation.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Extract the testing sets
dataset_bubble = []
dataset_frame = []
dataset_page = []

for book in data:
    book_title = book["book_title"]
    for page in book["pages"]:
        page_index = page["page_index"]
        page_path = os.path.join("open-mantra-dataset", page["image_paths"]["ja"])
        page_text_ja = []
        page_text_en = []

        for text in page["text"]:
            if "text_ja" in text and "text_en" in text:
                dataset_bubble.append({
                    "text_ja": f"{text['text_ja']}",
                    "text_en": f"{text['text_en']}"
                })

        for frame_index, frame in enumerate(page["frame"]):
            frame_path = os.path.join("open-mantra-dataset_frames", "images", book_title, "ja", "frames", f"{page_index:03d}_frame_{frame_index}.jpg")
            frame_text_ja = []
            frame_text_en = []

            for text in page["text"]:
                if is_inside_frame(text, frame):
                    frame_text_ja.append(f"{text['text_ja']}")
                    frame_text_en.append(f"{text['text_en']}")

            if frame_text_ja and frame_text_en:
                dataset_frame.append({
                    "text_ja": "\n".join(frame_text_ja),
                    "text_en": "\n".join(frame_text_en),
                    "frame_path": frame_path
                })
                page_text_ja.append("\n".join(frame_text_ja))
                page_text_en.append("\n".join(frame_text_en))
            else:
                dataset_frame.append({
                    "text_ja": "",
                    "text_en": "",
                    "frame_path": frame_path
                })

        if any(page_text_ja) and any(page_text_en):
            dataset_page.append({
                "text_ja": "\n".join(page_text_ja),
                "text_en": "\n".join(page_text_en),
                "page_path": page_path
            })
        else:
            dataset_page.append({
                "text_ja": "",
                "text_en": "",
                "page_path": page_path
            })

### Visualize dataset by bubbles

In [5]:
def visualize_bubble_dataset(dataset, limit=5):
    print(f"Bubble Dataset: {len(dataset)} lines")
    for i, item in enumerate(dataset[:limit], 1):
        print(f"* Bubble {i}:")
        print(f"   * Japanese: {item['text_ja']}")
        print(f"   * English: {item['text_en']}")
        print()

visualize_bubble_dataset(dataset_bubble)

Bubble Dataset: 1592 lines
* Bubble 1:
   * Japanese: 綴じ眼のシオラ
   * English: bound eye siora

* Bubble 2:
   * Japanese: 朽鷹みつき
   * English: Mitsuki Kuchitaka

* Bubble 3:
   * Japanese: だからっ
   * English: I'm telling you!!

* Bubble 4:
   * Japanese: 知らないって言ってるだろっ
   * English: I don't know what you're talking about!

* Bubble 5:
   * Japanese: そんな借金なんて!
   * English: i don't owe you!



### Visualize dataset by frames

In [6]:
def visualize_frame_dataset(dataset, limit=5):
    print(f"Frame Dataset: {len(dataset)} frames")
    for i, item in enumerate(dataset[:limit], 1):
        print(f"* Frame {i}:")
        print(f"   * Image path: {item['frame_path']}")
        if item['text_ja'] and item['text_en']:
            bubbles = zip(item['text_ja'].split('\n'), item['text_en'].split('\n'))
            for j, (text_ja, text_en) in enumerate(bubbles, 1):
                print(f"   * Bubble {j}:")
                print(f"      * Japanese: {text_ja}")
                print(f"      * English: {text_en}")
        else:
            print("   * Empty frame")
        print()

visualize_frame_dataset(dataset_frame, limit=5)



Frame Dataset: 1069 frames
* Frame 1:
   * Image path: open-mantra-dataset_frames\images\tojime_no_siora\ja\frames\001_frame_0.jpg
   * Bubble 1:
      * Japanese: 綴じ眼のシオラ
      * English: bound eye siora
   * Bubble 2:
      * Japanese: 朽鷹みつき
      * English: Mitsuki Kuchitaka

* Frame 2:
   * Image path: open-mantra-dataset_frames\images\tojime_no_siora\ja\frames\002_frame_0.jpg
   * Empty frame

* Frame 3:
   * Image path: open-mantra-dataset_frames\images\tojime_no_siora\ja\frames\002_frame_1.jpg
   * Bubble 1:
      * Japanese: だからっ
      * English: I'm telling you!!

* Frame 4:
   * Image path: open-mantra-dataset_frames\images\tojime_no_siora\ja\frames\002_frame_2.jpg
   * Bubble 1:
      * Japanese: 知らないって言ってるだろっ
      * English: I don't know what you're talking about!
   * Bubble 2:
      * Japanese: そんな借金なんて!
      * English: i don't owe you!

* Frame 5:
   * Image path: open-mantra-dataset_frames\images\tojime_no_siora\ja\frames\002_frame_3.jpg
   * Bubble 1:
      * Japanes

### Visualize dataset by pages

In [7]:
def visualize_page_dataset(dataset, limit=3):
    print(f"Page Dataset: {len(dataset)} pages")
    for i, item in enumerate(dataset[:limit], 1):
        print(f"* Page {i}:")
        print(f"   * Image path: {item['page_path']}")
        if item['text_ja'] and item['text_en']:
            bubbles = zip(item['text_ja'].split('\n'), item['text_en'].split('\n'))
            for k, (bubble_ja, bubble_en) in enumerate(bubbles, 1):
                print(f"      * Bubble {k}:")
                print(f"         * Japanese: {bubble_ja}")
                print(f"         * English: {bubble_en}")
        else:
            print("   * Empty page")
        print()
visualize_page_dataset(dataset_page, limit=2)

Page Dataset: 214 pages
* Page 1:
   * Image path: open-mantra-dataset\images/tojime_no_siora/ja/000.jpg
      * Bubble 1:
         * Japanese: 綴じ眼のシオラ
         * English: bound eye siora
      * Bubble 2:
         * Japanese: 朽鷹みつき
         * English: Mitsuki Kuchitaka

* Page 2:
   * Image path: open-mantra-dataset\images/tojime_no_siora/ja/001.jpg
      * Bubble 1:
         * Japanese: だからっ
         * English: I'm telling you!!
      * Bubble 2:
         * Japanese: 知らないって言ってるだろっ
         * English: I don't know what you're talking about!
      * Bubble 3:
         * Japanese: そんな借金なんて!
         * English: i don't owe you!
      * Bubble 4:
         * Japanese: そうは言ってもなぁ
         * English: well, I'm sorry...
      * Bubble 5:
         * Japanese: レーネ...
         * English: lene...



## Define Functions

In [13]:
def ask_question(question, client, max_tokens=200, temperature=1, top_p=0.9):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                'role': 'user',
                'content': question,
            }
        ],
        model='llama3',
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=False
    )
    answer = chat_completion.choices[0].message.content
    return answer

def calculate_bleu(predictions, references):
    
    reference_corpus = [[ref.split()] for ref in references]
    prediction_corpus = [pred.split() for pred in predictions]
    #print("Reference", reference_corpus)
    #print("Prediction", prediction_corpus)
    # Create a smoothing function
    smoothie = SmoothingFunction().method4
    bleu4 = corpus_bleu(reference_corpus, prediction_corpus, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
    return bleu4

### Model evaluation functions

#### Bubbles

In [14]:

def evaluate_model_unimodal_bubbles(client, test_dataset, prompt):
    data = []
    references = []
    for example in tqdm(test_dataset, desc="Evaluating"):
        sentence = example['text_ja']
        ground_truth = example['text_en']
        predicted_translation = translate_llm_bubbles(sentence, prompt, client)
        data.append({
            'sentence': sentence,
            'ground_truth': ground_truth,
            'predicted_translation': predicted_translation
        })
        references.append(ground_truth)
    
    df = pd.DataFrame(data)
    bleu = calculate_bleu(df['predicted_translation'].tolist(), references)
    return df, bleu

def translate_llm_bubbles(sentence, prompt, client):
    question = prompt.format(text=sentence)
    answer = ask_question(question, client)

    # Split the translations into sentences
    sentences = answer.split("\n")
    
    # Initialize an empty list to store the translated sentences
    translated_sentences = []
    
    # Iterate over each sentence
    for sentence in sentences:
        # Check if the sentence starts with a number followed by a dot and a space
        if sentence.strip() and sentence.split(".", 1)[0].isdigit():
            # Remove the number and dot from the beginning of the sentence
            sentence = sentence.split(".", 1)[1].strip()
        
            # Append the processed sentence to the list of translated sentences
            translated_sentences.append(sentence)
    
    # Join the translated sentences into a single string
    translated_text = "\n".join(translated_sentences)
    
    #print(translated_text)
    return translated_text

### Frames

In [15]:
def evaluate_model_unimodal_frames(client, test_dataset, prompt):
    data = []
    references = []
    predictions = []
    
    for example in tqdm(test_dataset, desc="Evaluating"):
        frame_path = example['frame_path']
        bubbles_ja = example['text_ja'].split('\n')
        bubbles_en = example['text_en'].split('\n')
        
        if not bubbles_ja or all(not bubble.strip() for bubble in bubbles_ja):
            #print("*Empty frame*")
            continue
        
        bubbles_input = '\n'.join([f"{i+1}. {bubble}" for i, bubble in enumerate(bubbles_ja)])
        
        predicted_translation = translate_llm_frames(bubbles_input, prompt, client)
        predicted_lines = predicted_translation.split('\n')
        
        for line_ja, line_en, line_predicted in zip(bubbles_ja, bubbles_en, predicted_lines):
            data.append({
                'frame_path': frame_path,
                'bubble_ja': line_ja,
                'bubble_en': line_en,
                'predicted_translation': line_predicted
            })
        
        references.append(' '.join(bubbles_en))
        predictions.append(' '.join(predicted_lines))
    
    df = pd.DataFrame(data)
    bleu = calculate_bleu(predictions, references)
    return df, bleu

def translate_llm_frames(bubbles_input, prompt, client):
    #print("--------Text input: ---------\n", bubbles_input)
    question = prompt.format(text=bubbles_input)
    #print("--------Prompt: ---------\n", question)
    answer = ask_question(question, client)
    #print("--------Raw answer: ---------\n", answer)

    # Split the translations into sentences
    sentences = answer.split("\n")
    
    # Initialize an empty list to store the translated sentences
    translated_sentences = []
    
    # Iterate over each sentence
    for sentence in sentences:
        # Check if the sentence starts with a number followed by a dot and a space
        if sentence.strip() and sentence.split(". ", 1)[0].isdigit():
            # Remove the number and dot from the beginning of the sentence
            sentence = sentence.split(". ", 1)[1].strip()
        
            # Append the processed sentence to the list of translated sentences
            translated_sentences.append(sentence)
    
    # Join the translated sentences into a single string
    translated_text = "\n".join(translated_sentences)
    #print("--------Final answer: ---------")
    #print(translated_text)
    return translated_text

In [70]:
from itertools import zip_longest
from multimodal_api import process_image

def evaluate_model_multimodal_frames(client, test_dataset, prompt, description_prompt):
    data = []
    references = []
    predictions = []
    context = ""
    current_page = None
    frame_descriptions = []
    frame_texts = []

    for example in tqdm(test_dataset, desc="Evaluating"):
        frame_path = example['frame_path']
        page_number = int(os.path.basename(frame_path).split('_')[0])
        frame_number = int(os.path.basename(frame_path).split('_')[-1].split('.')[0])
        bubbles_ja = example['text_ja'].split('\n')
        bubbles_en = example['text_en'].split('\n')

        if current_page is None or current_page != page_number:
            if current_page is not None:
                # Process the accumulated frame descriptions and texts for the previous page
                translation_input = ""
                text_counter = 1
                for i, (description, text) in enumerate(zip(frame_descriptions, frame_texts)):
                    frame_info = f"Frame {i+1}:\n"
                    frame_info += f"Description: {description}\n"
                    frame_info += "Text: "
                    if text:
                        lines = text.split('\n')
                        for line in lines:
                            if line.strip():  # Skip empty lines
                                frame_info += f"{text_counter}. {line.split('. ', 1)[-1]}\n"
                                text_counter += 1
                    else:
                        frame_info += "None\n"
                    frame_info += "\n"
                    translation_input += frame_info

                predicted_translation = translate_llm_frames(translation_input, prompt, client)
                predicted_lines = predicted_translation.split('\n')

                for line_ja, line_en, line_predicted in zip_longest(frame_texts, bubbles_en, predicted_lines, fillvalue=""):
                    data.append({
                        'frame_path': frame_path,
                        'bubble_ja': line_ja,
                        'bubble_en': line_en,
                        'predicted_translation': line_predicted
                    })
                references.append(' '.join(bubbles_en))
                predictions.append(' '.join(predicted_lines))

            current_page = page_number
            context = ""
            frame_descriptions = []
            frame_texts = []

        # Generate frame description
        question = description_prompt.format(frame_number=frame_number, context=context)
        parameters = {
            "params_form": "Sampling",
            "num_beams": 5,
            "repetition_penalty": 1.3,
            "top_p": 0.9,
            "top_k": 50,
            "temperature": 0.8
        }
        frame_description = process_image(frame_path, question, parameters)
        frame_descriptions.append(frame_description)

        # Update context with the current frame description
        context += f"{frame_number}. {frame_description}\n"

        if not bubbles_ja or all(not bubble.strip() for bubble in bubbles_ja):
            bubbles_input = ""
        else:
            bubbles_input = '\n'.join([f"{i+1}. {bubble}" for i, bubble in enumerate(bubbles_ja)])
        frame_texts.append(bubbles_input)

    # Process the last page
    if frame_descriptions and frame_texts:
        translation_input = ""
        text_counter = 1
        for i, (description, text) in enumerate(zip(frame_descriptions, frame_texts)):
            frame_info = f"Frame {i+1}:\n"
            frame_info += f"Description: {description}\n"
            frame_info += "Text: "
            if text:
                lines = text.split('\n')
                for line in lines:
                    if line.strip():  # Skip empty lines
                        frame_info += f"{text_counter}. {line.split('. ', 1)[-1]}\n"
                        text_counter += 1
            else:
                frame_info += "None\n"
            frame_info += "\n"
            translation_input += frame_info

        predicted_translation = translate_llm_frames(translation_input, prompt, client)
        predicted_lines = predicted_translation.split('\n')

        for line_ja, line_en, line_predicted in zip_longest(frame_texts, bubbles_en, predicted_lines, fillvalue=""):
            data.append({
                'frame_path': frame_path,
                'bubble_ja': line_ja,
                'bubble_en': line_en,
                'predicted_translation': line_predicted
            })
        references.append(' '.join(bubbles_en))
        predictions.append(' '.join(predicted_lines))

    df = pd.DataFrame(data)
    bleu = calculate_bleu(predictions, references)
    return df, bleu

def translate_llm_frames(translation_input, prompt, client):
    print("--------Text input: ---------\n", translation_input)
    question = prompt.format(input=translation_input)
    #print("--------Prompt: ---------\n", question)
    answer = ask_question(question, client)
    print("--------Raw answer: ---------\n", answer)

    # Split the translations into sentences
    sentences = answer.split("\n")
    
    # Initialize an empty list to store the translated sentences
    translated_sentences = []
    
    # Iterate over each sentence
    for sentence in sentences:
        # Check if the sentence starts with a number followed by a dot and a space
        if sentence.strip() and sentence.split(". ", 1)[0].isdigit():
            # Remove the number and dot from the beginning of the sentence
            sentence = sentence.split(". ", 1)[1].strip()
        
            # Append the processed sentence to the list of translated sentences
            translated_sentences.append(sentence)
    
    # Join the translated sentences into a single string
    translated_text = "\n".join(translated_sentences)
    print("--------Final answer: ---------")
    print(translated_text)
    return translated_text

### Pages

In [16]:
def evaluate_model_unimodal_pages(client, test_dataset, prompt):
    data = []
    references = []
    predictions = []
    
    for example in tqdm(test_dataset, desc="Evaluating"):
        frame_path = example['page_path']
        bubbles_ja = example['text_ja'].split('\n')
        bubbles_en = example['text_en'].split('\n')
        
        if not bubbles_ja or all(not bubble.strip() for bubble in bubbles_ja):
            #print("*Empty page]*")
            continue
        
        bubbles_input = '\n'.join([f"{i+1}. {bubble}" for i, bubble in enumerate(bubbles_ja)])
        
        predicted_translation = translate_llm_pages(bubbles_input, prompt, client)
        predicted_lines = predicted_translation.split('\n')
        
        for line_ja, line_en, line_predicted in zip(bubbles_ja, bubbles_en, predicted_lines):
            data.append({
                'page_path': frame_path,
                'bubble_ja': line_ja,
                'bubble_en': line_en,
                'predicted_translation': line_predicted
            })
        
        references.append(' '.join(bubbles_en))
        predictions.append(' '.join(predicted_lines))
    
    df = pd.DataFrame(data)
    bleu = calculate_bleu(predictions, references)
    return df, bleu

def translate_llm_pages(bubbles_input, prompt, client):
    #print("--------Text input: ---------\n", bubbles_input)
    question = prompt.format(text=bubbles_input)
    #print("--------Prompt: ---------\n", question)
    answer = ask_question(question, client)
    #print("--------Raw answer: ---------\n", answer)

    # Split the translations into sentences
    sentences = answer.split("\n")
    
    # Initialize an empty list to store the translated sentences
    translated_sentences = []
    
    # Iterate over each sentence
    for sentence in sentences:
        # Check if the sentence starts with a number followed by a dot and a space
        if sentence.strip() and sentence.split(". ", 1)[0].isdigit():
            # Remove the number and dot from the beginning of the sentence
            sentence = sentence.split(". ", 1)[1].strip()
        
            # Append the processed sentence to the list of translated sentences
            translated_sentences.append(sentence)
    
    # Join the translated sentences into a single string
    translated_text = "\n".join(translated_sentences)
    #print("--------Final answer: ---------")
    #print(translated_text)
    return translated_text

In [17]:
# Define your desired weights (example: higher weight for bi-grams)
weights = (0.25, 0.25, 0, 0)  # Weights for uni-gram, bi-gram, tri-gram, and 4-gram

# Reference and predicted texts (same as before)
reference = [["the", "picture", "is", "clicked", "by", "me"]]
predictions = ["the", "picture", "the", "picture", "by", "me"]

# Calculate BLEU score with weights
score = sentence_bleu(reference, predictions, weights=weights)
print(score)

0.7186082239261684


## Prompts

### Only text

#### Zero shot

In [247]:
# Bubble dataset
prompt_text_zeroshot_bubbles = """You are a manga translator from Japanese to English. 
Some rules to remember:
- Maintaining the contents' accuracy is important, but since texts are from manga, we want to prioritize naturalness and ease of communication.
- Instead of translating word by word, try to translate the whole sentence or phrase at once.
- Number of translated sentences should be the same as the number of input sentences.
- Return translations without additional explanations, comments, notes or interaction. 
- You'll be penalized if you add any additional information besides the format provided or if you don't follow the format provided.

Here's the format.

Input:
1. Sentence in Japanese

Translation:
1. Translation in English

Input:
1. {text}

Translation:

"""

# Frame dataset
prompt_text_zeroshot_frames = """You are a manga translator from Japanese to English. 
Some rules to remember:
- Maintaining the contents' accuracy is important, but since texts are from manga, we want to prioritize naturalness and ease of communication.
- Instead of translating word by word, try to translate the whole sentence or phrase at once.
- Number of translated sentences should be the same as the number of input sentences.
- Return translations without additional explanations, comments, notes or interactions.
- You'll be penalized if you add any additional information besides the format provided or if you don't follow the format provided.

Here's the format. Each sentence corresponds to one text bubble in a manga frame. Have in mind that this is a sequence of dialogs in manga. 

Input:
1. Sentence 1 in Japanese
2. Sentence 2 in Japanese
3. Sentence 3 in Japanese

Translation:
1. Translation 1 in English
2. Translation 2 in English
3. Translation 3 in English

Input:
{text}

Translation:

"""

# Page dataset
prompt_text_zeroshot_pages = """You are a manga translator from Japanese to English. 
Some rules to remember:
- Maintaining the contents' accuracy is important, but since texts are from manga, we want to prioritize naturalness and ease of communication.
- Instead of translating word by word, try to translate the whole sentence or phrase at once.
- Number of translated sentences should be the same as the number of input sentences.
- Return translations without additional explanations, comments, notes or interactions. Simply use the format for translations.

Here's the format. Each sentence corresponds to one text bubble in a manga page. Have in mind that this is a sequence of dialogs in manga. 

Input:
1. Sentence 1 in Japanese
2. Sentence 2 in Japanese
3. Sentence 3 in Japanese

Translation:
1. Translation 1 in English
2. Translation 2 in English
3. Translation 3 in English

Input:
{text}

Translation:
"""

#### One shot

In [248]:
# Bubble dataset
prompt_text_oneshot_bubbles = """You are a manga translator from Japanese to English. 
Some rules to remember:
- Maintaining the contents' accuracy is important, but since texts are from manga, we want to prioritize naturalness and ease of communication.
- Instead of translating word by word, try to translate the whole sentence or phrase at once.
- Number of translated sentences should be the same as the number of input sentences.
- Return translations without additional explanations, comments, notes or interaction. 
- You'll be penalized if you add any additional information besides the format provided or if you don't follow the format provided.

Here's the format.

Input:
1. Sentence in Japanese

Translation:
1. Translation in English


Input:
1. いいって言ったじゃないですか

Translation:
1. I told you I'm okay!


Input:
1. {text}

Translation:

"""

# Frame dataset
prompt_text_oneshot_frames = """You are a manga translator from Japanese to English. 
Some rules to remember:
- Maintaining the contents' accuracy is important, but since texts are from manga, we want to prioritize naturalness and ease of communication.
- Instead of translating word by word, try to translate the whole sentence or phrase at once.
- Number of translated sentences should be the same as the number of input sentences.
- Return translations without additional explanations, comments, notes or interactions.
- You'll be penalized if you add any additional information besides the format provided or if you don't follow the format provided.

Here's the format. Each sentence corresponds to one text bubble in a manga frame. Have in mind that this is a sequence of dialogs in manga. 

Input:
1. Sentence 1 in Japanese
2. Sentence 2 in Japanese
3. Sentence 3 in Japanese

Translation:
1. Translation 1 in English
2. Translation 2 in English
3. Translation 3 in English


Input:
1. たっ助かった
2. シオラといったか
3. ウチに雇ってほしいそうじゃないか

Translation:
1. oh, thank god.
2. siora, was it?
3. i heard you wanted us to hire you?


Input:
{text}

Translation:

"""

# Page dataset
prompt_text_oneshot_pages = """You are a manga translator from Japanese to English. 
Some rules to remember:
- Maintaining the contents' accuracy is important, but since texts are from manga, we want to prioritize naturalness and ease of communication.
- Instead of translating word by word, try to translate the whole sentence or phrase at once.
- Number of translated sentences should be the same as the number of input sentences.
- Return translations without additional explanations, comments, notes or interactions. Simply use the format for translations.

Here's the format. Each sentence corresponds to one text bubble in a manga page. Have in mind that this is a sequence of dialogs in manga. 

Input:
1. Sentence 1 in Japanese
2. Sentence 2 in Japanese
3. Sentence 3 in Japanese

Translation:
1. Translation 1 in English
2. Translation 2 in English
3. Translation 3 in English


Input:
1. じゃあなんで
2. 父さんは死んだんだ!!
3. まっ待て
4. 撃たないでくれ
5. 死ね!
6. 死ねぇーーー!!

Translation:
1. then why?!
2. why did my father have to die!!
3. wait, wait.
4. don't shoot me.
5. die!
6. die!!!!


Input:
{text}

Translation:
"""

### Multimodal

#### Zero shot

In [72]:
# Frame dataset
prompt_multimodal_text_zeroshot_frames = """You are a manga translator from Japanese to English, considering the context from the manga frames.
Some rules to remember:
- Maintaining the contents' accuracy is important, but since texts are from manga, we want to prioritize naturalness and ease of communication.
- Instead of translating word by word, try to translate the whole sentence or phrase at once, taking into account the context provided by the frame descriptions.
- Number of translated sentences should be the same as the number of input sentences.
- Return translations without additional explanations, comments, notes, or interactions.
- You'll be penalized if you add any additional information besides the format provided or if you don't follow the format provided.
Here's the format. Each sentence corresponds to one text bubble in a manga frame. Have in mind that this is a sequence of dialogs in manga, and the frame descriptions provide additional context for the translation.
{input}
Translation:
"""

prompt_multimodal_vision_zeroshot_frames = """You are an AI assistant that describes manga frames, considering the context from previous frames.
Some rules to remember:
- Analyze the current frame carefully and describe the content, characters and actions.
- Consider the context from the previous frames to maintain continuity and coherence in your descriptions.
- Write the description as if it were the text in the manga bubbles, capturing the essence of the and actions.
- Do not add any additional explanations or interactions beyond the frame description.
- You'll be penalized if you add any extra information besides the format provided or if you don't follow the format.

You're looking at frame #{frame_number}
Frame descriptions:
{context}
{frame_number}. 

Frame description:
"""

In [46]:
def process_image_dummy(frame_path, question, parameters):
    #print(question)
    return "This is a description for" + frame_path

#### One shot

In [None]:
# Bubble dataset
prompt_text_oneshot_bubbles = """You are a manga translator from Japanese to English. 
Some rules to remember:
- Maintaining the contents' accuracy is important, but since texts are from manga, we want to prioritize naturalness and ease of communication.
- Instead of translating word by word, try to translate the whole sentence or phrase at once.
- Number of translated sentences should be the same as the number of input sentences.
- Return translations without additional explanations, comments, notes or interaction. 
- You'll be penalized if you add any additional information besides the format provided or if you don't follow the format provided.

Here's the format.

Input:
1. Sentence in Japanese

Translation:
1. Translation in English


Input:
1. いいって言ったじゃないですか

Translation:
1. I told you I'm okay!


Input:
1. {text}

Translation:

"""

# Frame dataset
prompt_text_oneshot_frames = """You are a manga translator from Japanese to English. 
Some rules to remember:
- Maintaining the contents' accuracy is important, but since texts are from manga, we want to prioritize naturalness and ease of communication.
- Instead of translating word by word, try to translate the whole sentence or phrase at once.
- Number of translated sentences should be the same as the number of input sentences.
- Return translations without additional explanations, comments, notes or interactions.
- You'll be penalized if you add any additional information besides the format provided or if you don't follow the format provided.

Here's the format. Each sentence corresponds to one text bubble in a manga frame. Have in mind that this is a sequence of dialogs in manga. 

Input:
1. Sentence 1 in Japanese
2. Sentence 2 in Japanese
3. Sentence 3 in Japanese

Translation:
1. Translation 1 in English
2. Translation 2 in English
3. Translation 3 in English


Input:
1. たっ助かった
2. シオラといったか
3. ウチに雇ってほしいそうじゃないか

Translation:
1. oh, thank god.
2. siora, was it?
3. i heard you wanted us to hire you?


Input:
{text}

Translation:

"""

# Page dataset
prompt_text_oneshot_pages = """You are a manga translator from Japanese to English. 
Some rules to remember:
- Maintaining the contents' accuracy is important, but since texts are from manga, we want to prioritize naturalness and ease of communication.
- Instead of translating word by word, try to translate the whole sentence or phrase at once.
- Number of translated sentences should be the same as the number of input sentences.
- Return translations without additional explanations, comments, notes or interactions. Simply use the format for translations.

Here's the format. Each sentence corresponds to one text bubble in a manga page. Have in mind that this is a sequence of dialogs in manga. 

Input:
1. Sentence 1 in Japanese
2. Sentence 2 in Japanese
3. Sentence 3 in Japanese

Translation:
1. Translation 1 in English
2. Translation 2 in English
3. Translation 3 in English


Input:
1. じゃあなんで
2. 父さんは死んだんだ!!
3. まっ待て
4. 撃たないでくれ
5. 死ね!
6. 死ねぇーーー!!

Translation:
1. then why?!
2. why did my father have to die!!
3. wait, wait.
4. don't shoot me.
5. die!
6. die!!!!


Input:
{text}

Translation:
"""

## Run 

### Zero-shot (Only text)

#### Bubbles dataset

In [183]:
dataset = dataset_bubble
#dataset = dataset_bubble[:5]

# ollama running on local
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')
df, bleu4 = evaluate_model_unimodal_bubbles(client, dataset, prompt_text_zeroshot_bubbles)
print(f"Zero shot:\nBLEU4 Score: {bleu4*100:.4f}")

Evaluating: 100%|██████████| 1592/1592 [05:50<00:00,  4.54it/s]

BLEU4 Score: 2.5735





In [184]:
df

Unnamed: 0,sentence,ground_truth,predicted_translation
0,綴じ眼のシオラ,bound eye siora,Wrapped-around-eyes Siola
1,朽鷹みつき,Mitsuki Kuchitaka,Rustling feathers of death
2,だからっ,I'm telling you!!,
3,知らないって言ってるだろっ,I don't know what you're talking about!,"You're saying that again, aren't you?"
4,そんな借金なんて!,i don't owe you!,That kind of debt isn't worth it!
...,...,...,...
1587,こりゃかなわん,you beat me,This can't be beat.
1588,さて,well!\n,
1589,さて明日は墓掃除だ!,tomorrow we clean the graves!,"So, tomorrow is a funeral duty!"
1590,こ、こら,hey!!!,


#### Frames dataset

In [185]:
dataset = dataset_frame
#dataset = dataset_frame[:5]

# ollama running on local
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')
df, bleu4 = evaluate_model_unimodal_frames(client, dataset, prompt_text_zeroshot_frames)
print(f"Zero shot:\nBLEU4 Score: {bleu4*100:.4f}")

Evaluating: 100%|██████████| 1069/1069 [05:11<00:00,  3.43it/s]

BLEU4 Score: 2.8556





In [186]:
df

Unnamed: 0,frame_path,bubble_ja,bubble_en,predicted_translation
0,open-mantra-dataset_frames\images\tojime_no_si...,綴じ眼のシオラ,bound eye siora,Siola's stitched eyes.
1,open-mantra-dataset_frames\images\tojime_no_si...,朽鷹みつき,Mitsuki Kuchitaka,Withered eagle beak.
2,open-mantra-dataset_frames\images\tojime_no_si...,だからっ,I'm telling you!!,Then?
3,open-mantra-dataset_frames\images\tojime_no_si...,知らないって言ってるだろっ,I don't know what you're talking about!,"You're telling me you don't know, right?"
4,open-mantra-dataset_frames\images\tojime_no_si...,そんな借金なんて!,i don't owe you!,Stuff like that debt? Not on my life!
...,...,...,...,...
1624,open-mantra-dataset_frames\images\rasetugari\j...,こりゃかなわん,you beat me,I'm not buying it.
1625,open-mantra-dataset_frames\images\rasetugari\j...,さて,well!,"So, then."
1626,open-mantra-dataset_frames\images\rasetugari\j...,さて明日は墓掃除だ!,,"Oh no, tomorrow's grave-cleaning duty!"
1627,open-mantra-dataset_frames\images\rasetugari\j...,こ、こら,tomorrow we clean the graves!,"Uh-oh, uh-oh."


#### Pages dataset

In [216]:
dataset = dataset_page
#dataset = dataset_page[:5]

# ollama running on local
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')
df, bleu4 = evaluate_model_unimodal_pages(client, dataset, prompt_text_zeroshot_pages)
print(f"Zero shot:\nBLEU4 Score: {bleu4*100:.4f}")

Evaluating: 100%|██████████| 214/214 [04:18<00:00,  1.21s/it]

Zero shot:
BLEU4 Score: 3.5923





In [218]:
df

Unnamed: 0,page_path,bubble_ja,bubble_en,predicted_translation
0,open-mantra-dataset\images/tojime_no_siora/ja/...,綴じ眼のシオラ,bound eye siora,Sioia's knitted eyes
1,open-mantra-dataset\images/tojime_no_siora/ja/...,朽鷹みつき,Mitsuki Kuchitaka,The rotting falcon's beak
2,open-mantra-dataset\images/tojime_no_siora/ja/...,だからっ,I'm telling you!!,That's why.
3,open-mantra-dataset\images/tojime_no_siora/ja/...,知らないって言ってるだろっ,I don't know what you're talking about!,He's just saying he doesn't know!
4,open-mantra-dataset\images/tojime_no_siora/ja/...,そんな借金なんて!,i don't owe you!,You can't say something like that!
...,...,...,...,...
1617,open-mantra-dataset\images/rasetugari/ja/053.jpg,こりゃかなわん,you beat me,It's impossible for that to happen.
1618,open-mantra-dataset\images/rasetugari/ja/053.jpg,さて,well!,Alright then.
1619,open-mantra-dataset\images/rasetugari/ja/053.jpg,さて明日は墓掃除だ!,,"Okay, tomorrow is going to be a funeral prepar..."
1620,open-mantra-dataset\images/rasetugari/ja/053.jpg,こ、こら,tomorrow we clean the graves!,"Hey, hey!"


### One-shot Prompt (Only text)

#### Bubbles dataset

In [203]:
dataset = dataset_bubble
#dataset = dataset_bubble[:5]

# ollama running on local
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')
df, bleu4 = evaluate_model_unimodal_bubbles(client, dataset, prompt_text_oneshot_bubbles)
print(f"One shot:\nBLEU4 Score: {bleu4*100:.4f}")

Evaluating: 100%|██████████| 1592/1592 [05:33<00:00,  4.77it/s]


One shot:
BLEU4 Score: 3.2316


In [217]:
df

Unnamed: 0,page_path,bubble_ja,bubble_en,predicted_translation
0,open-mantra-dataset\images/tojime_no_siora/ja/...,綴じ眼のシオラ,bound eye siora,Sioia's knitted eyes
1,open-mantra-dataset\images/tojime_no_siora/ja/...,朽鷹みつき,Mitsuki Kuchitaka,The rotting falcon's beak
2,open-mantra-dataset\images/tojime_no_siora/ja/...,だからっ,I'm telling you!!,That's why.
3,open-mantra-dataset\images/tojime_no_siora/ja/...,知らないって言ってるだろっ,I don't know what you're talking about!,He's just saying he doesn't know!
4,open-mantra-dataset\images/tojime_no_siora/ja/...,そんな借金なんて!,i don't owe you!,You can't say something like that!
...,...,...,...,...
1617,open-mantra-dataset\images/rasetugari/ja/053.jpg,こりゃかなわん,you beat me,It's impossible for that to happen.
1618,open-mantra-dataset\images/rasetugari/ja/053.jpg,さて,well!,Alright then.
1619,open-mantra-dataset\images/rasetugari/ja/053.jpg,さて明日は墓掃除だ!,,"Okay, tomorrow is going to be a funeral prepar..."
1620,open-mantra-dataset\images/rasetugari/ja/053.jpg,こ、こら,tomorrow we clean the graves!,"Hey, hey!"


#### Frames dataset

In [231]:
dataset = dataset_frame
#dataset = dataset_frame[:5]

# ollama running on local
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')
df, bleu4 = evaluate_model_unimodal_frames(client, dataset, prompt_text_oneshot_frames)
print(f"One shot:\nBLEU4 Score: {bleu4*100:.4f}")

Evaluating: 100%|██████████| 1069/1069 [06:39<00:00,  2.67it/s]

One shot:
BLEU4 Score: 2.9299





In [232]:
df

Unnamed: 0,frame_path,bubble_ja,bubble_en,predicted_translation
0,open-mantra-dataset_frames\images\tojime_no_si...,綴じ眼のシオラ,bound eye siora,A string of beads in her eye.
1,open-mantra-dataset_frames\images\tojime_no_si...,朽鷹みつき,Mitsuki Kuchitaka,Worm-eaten mitsuki.
2,open-mantra-dataset_frames\images\tojime_no_si...,だからっ,I'm telling you!!,So that's why?
3,open-mantra-dataset_frames\images\tojime_no_si...,知らないって言ってるだろっ,I don't know what you're talking about!,don't know that kind of thing!
4,open-mantra-dataset_frames\images\tojime_no_si...,そんな借金なんて!,i don't owe you!,that kind of debt doesn't exist!
...,...,...,...,...
1627,open-mantra-dataset_frames\images\rasetugari\j...,こりゃかなわん,you beat me,Helpless.
1628,open-mantra-dataset_frames\images\rasetugari\j...,さて,well!,"Ah, alright."
1629,open-mantra-dataset_frames\images\rasetugari\j...,さて明日は墓掃除だ!,,"Alright, tomorrow's grave cleaning duty awaits!"
1630,open-mantra-dataset_frames\images\rasetugari\j...,こ、こら,tomorrow we clean the graves!,"G-, go easy on me!"


#### Pages dataset

In [229]:
dataset = dataset_page
#dataset = dataset_page[:5]

# ollama running on local
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')
df, bleu4 = evaluate_model_unimodal_pages(client, dataset, prompt_text_oneshot_pages)
print(f"Zero shot:\nBLEU4 Score: {bleu4*100:.4f}")

Evaluating: 100%|██████████| 214/214 [04:15<00:00,  1.19s/it]

Zero shot:
BLEU4 Score: 3.7750





In [230]:
df

Unnamed: 0,page_path,bubble_ja,bubble_en,predicted_translation
0,open-mantra-dataset\images/tojime_no_siora/ja/...,綴じ眼のシオラ,bound eye siora,Siohra with a needle-like eye.
1,open-mantra-dataset\images/tojime_no_siora/ja/...,朽鷹みつき,Mitsuki Kuchitaka,Worm-eaten fingers and toes.
2,open-mantra-dataset\images/tojime_no_siora/ja/...,だからっ,I'm telling you!!,then is that so
3,open-mantra-dataset\images/tojime_no_siora/ja/...,知らないって言ってるだろっ,I don't know what you're talking about!,"he's saying he doesn't know, though?"
4,open-mantra-dataset\images/tojime_no_siora/ja/...,そんな借金なんて!,i don't owe you!,don't even mention such debt!
...,...,...,...,...
1625,open-mantra-dataset\images/rasetugari/ja/053.jpg,こりゃかなわん,you beat me,There's no way that can happen
1626,open-mantra-dataset\images/rasetugari/ja/053.jpg,さて,well!,Alright then
1627,open-mantra-dataset\images/rasetugari/ja/053.jpg,さて明日は墓掃除だ!,,"Anyway, tomorrow is gonna be a funeral day..."
1628,open-mantra-dataset\images/rasetugari/ja/053.jpg,こ、こら,tomorrow we clean the graves!,W-what?!


### Zero shot (Multimodal) 

#### Frames dataset

In [73]:
dataset = dataset_frame
dataset = dataset_frame[5:12]

# ollama running on local
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')
df, bleu4 = evaluate_model_multimodal_frames(client, dataset, prompt_multimodal_text_zeroshot_frames, prompt_multimodal_vision_zeroshot_frames)
print(f"One shot:\nBLEU4 Score: {bleu4*100:.4f}")

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Loaded as API: http://localhost:8080/ ✔


Evaluating:  14%|█▍        | 1/7 [00:19<01:59, 19.87s/it]

Loaded as API: http://localhost:8080/ ✔


Evaluating:  29%|██▊       | 2/7 [00:41<01:44, 20.85s/it]

Loaded as API: http://localhost:8080/ ✔


Evaluating:  43%|████▎     | 3/7 [01:02<01:24, 21.13s/it]

Loaded as API: http://localhost:8080/ ✔


Evaluating:  57%|█████▋    | 4/7 [01:22<01:02, 20.71s/it]

Loaded as API: http://localhost:8080/ ✔


Evaluating:  71%|███████▏  | 5/7 [01:43<00:41, 20.56s/it]

Loaded as API: http://localhost:8080/ ✔


Evaluating:  86%|████████▌ | 6/7 [02:04<00:20, 20.73s/it]

Loaded as API: http://localhost:8080/ ✔


Evaluating: 100%|██████████| 7/7 [02:27<00:00, 21.11s/it]


--------Text input: ---------
 Frame 1:
Description: Frame description: A close-up of a character with closed eyes and raised eyebrows, suggesting an expression of surprise or shock. There is text in the speech bubbles indicating the character's thoughts or reaction.
Text: 1. こっちにゃ借用書があんだよ
2. トルティヤーノに借りた金はちゃんと返して貰わねぇと

Frame 2:
Description: A close-up of a character with closed eyes and raised eyebrows, suggesting an expression of surprise or shock. There is text in the speech bubbles indicating the character's thoughts or reaction.
Text: 3. 知るもんかっ

Frame 3:
Description: Frame description: A character with closed eyes and raised eyebrows stands in the foreground, expressing surprise or shock. The character's thoughts or reaction are conveyed through speech bubbles with text. In the background, another character is visible, though details are minimal due to the close-up focus on the first character.
Text: 4. 父親がカジノで作った借金なんて...

Frame 4:
Description: The background character's face is pa