In [2]:
import os
import openai
import json
import requests
import re
import pandas as pd
from scripts.utils import *
from dotenv import load_dotenv
from nltk.translate.bleu_score import sentence_bleu
import xml.etree.ElementTree as ET

load_dotenv()

GPT_MODEL = "gpt-3.5-turbo-0613"

# Path to the XML file
xml_file_path = 'gold-standard/annotator1/Hansard_19990401.en.xml'
SRC = "Inuktitut"
TGT = "English"
DOM = "Hearings from a Legislative Assembly"
path = "inuk_data/norm/test"
outFile = 'res.csv'

# encoding = tt.encoding_for_model(GPT_MODEL)

try:
    openai.api_key = os.environ.get('OPENAI_API_KEY')
    if openai.api_key is None:
        raise Exception
    else:
        print("API Key Obtained Successfully!")
except Exception:
    print("Error reading OpenAI API key from environment variable")
    exit(1)

API Key Obtained Successfully!


# API Call 

In [12]:
df = load_parallel_corpus(path)

subset = random_subset(df, 50)
display(subset)

Unnamed: 0,source_text,target_text
5878,ᐃᓚᖏᑦ ᑲᑎᒪᔩᑦ: ᐊᖏᖅᐳᒍᑦ.,Some Members: Agreed.
5357,"ᐃᒃᓯᕙᐅᑖᖅ, ᑲᔪᓯᔪᒪᔪᒍᑦ ᕿᒥᕐᕈᓂᑦᑎᓐᓂ ᒪᓕᒐᒃᓴᖅ 25, ᑭᖑᓂᐊᒍᑦ ...","Mr. Chairman, we wish to continue with the rev..."
5113,ᑎᑎᖅᑲᒃᑯᑦ ᐊᐱᖅᑯᑦ 031 - 4 (3): ᐱᔮᖅᑯᑕᒥᒃ ᓱᕋᐃᔭᕈᑎᒥᓃᑦ ᐃ...,Written Question 031 - 4 (3): Vandalism-relate...
576,ᒫᓐᓇ ᕿᒥᕐᕈᐊᖅᑕᐅᔫᒐᓗᐊᖅ ᑖᑉᑯᐊ ᐃᓕᔭᐅᒐᔭᕐᒪᖔᑕ ᑕᐃᒃᑯᐊ ᐃᒪᕐᒧᑦ ...,It is being looked at right now to see if the ...
1244,ᑲᑎᒪᔨᕋᓛᑦ ᑐᒃᓯᕌᖃᖅᐳᑦ ᒐᕙᒪᒃᑯᑦ ᑭᒡᒍᑎᓂᒃ ᓴᖅᑭᑦᑎᓗᑎᒃ ᐅᑯᓄᖓ ᐅ...,"Mr. Speaker, I would note for the record that ..."
4187,ᒪᓕᒐᓕᐅᖅᑎ ᐃᖃᓗᐃᑦ ᓂᐊᖁᓐᖑ ᐅᓂᒃᑲᖅᑕᒥᓂᒃ ᐱᐊᓂᒃᓯᖏᒻᒪᑦ ᐱᐊᓂᒍᒪᓪ...,The Member for Iqaluit-Niaqunnguu is seeking u...
2083,326 - 4 (3): ᐃᓕᓴᖅᓯᓂᖅ ᓇᒻᒥᓂᖃᖅᑐᑦ ᐃᑲᔫᑎᖃᖃᑦᑕᕐᓂᖏᓐᓂᒃ ᑮ...,326 - 4 (3): Recognizing Private Sector Contri...
3893,ᒪᓕᒐᓕᐅᖅᑏᑦ ᐅᐸᒃᑐᑦ:,Members Present:
2402,"2016-ᖑᑎᓪᓗᒍ ᕉᑦ, ᔮᓐ ᐊᕐᓈᓗᔾᔪᐊᖅ ᐃᓕᓐᓂᐊᕐᕕᖓᓂᑦ ᐃᓄᒃᑎᑐᖅ ᓯ...",In 2016 Ruth was John Arnalukjuak High School'...
5373,"ᐊᖓᔪᖅᑳᖅ ᔫ ᓴᕕᑲᑖᖅ (ᑐᓵᔨᑎᒍᑦ): ᖁᔭᓐᓇᒦᒃ, ᐃᒃᓯᕙᐅᑖᖅ, ᑕᓕᖅᐱ...","Hon. Joe Savikataaq: Thank you, Mr. Chairman. ..."


In [30]:
cols = ['src_txt', 'tgt_txt', 'rom_txt', 'trans_txt']
results = []

for idx, row in subset.iterrows():
    src_txt = f'[{row["source_text"]}]'
    tgt_txt = row['target_text']
    message = [
        {"role": "system", "content": f'You are a machine translation system that operates in two steps. Step 1 - The user will provide {SRC} text within square brackets. Romanize the text for use in the next step with a prefix that says "Romanization: ". Step 2 - Translate the romanized text from step 1 into {TGT} with a prefix that says "Translation: "'}, 
        {'role': "user", 'content': f'Please provide the {TGT} translation for the following sentences: {src_txt}'}, 
        ]
    res = chat_completion_request_API(messages=message)
    
    pred_txt = res["choices"][0]['message']['content']
    
    rom_txt = re.search(r'Romanization: (.+?)\n',pred_txt).group(1).strip('[]')
    trans_txt = re.search(r'Translation: (.+?)$', pred_txt).group(1).strip('[]')
    
    src_txt = src_txt.strip('[]')
    
    print("Romanized Text:", rom_txt)
    print("Translated Text:", trans_txt)
    print('Actual translation: ', tgt_txt)
    
    results.append({'src_txt': src_txt, 'tgt_txt': tgt_txt, 'rom_txt': rom_txt, 'trans_txt': trans_txt})

rdf = pd.DataFrame(results, columns=cols)
display(rdf)
rdf.to_pickle('results1.pkl')

Romanized Text: Ilangiit katimajut: angiqpugut.
Translated Text: The weather is nice today.
Actual translation:  Some Members: Agreed.
Romanized Text: iksiwautaq, kajusijumajugut qimirurnititnii maligaksaq 25, kinguniajugut maligaksaq 29.
Translated Text: Today, I am 25 years old, and tomorrow I will be 29 years old.
Actual translation:  Mr. Chairman, we wish to continue with the review of Bill 25, followed by Bill 29.
Romanized Text: Titiqkaakut apiqkut 031 - 4 (3): Piiyaqqutkamik suraiyartiminiit inulimaqsiutin igluni (ukaliq)
Translated Text: The article titled "Piiyaqqutkamik suraiyartiminiit inulimaqsiutin igluni (ukaliq)"
Actual translation:  Written Question 031 - 4 (3): Vandalism-related Damage to Public Housing Units (Okalik)
Romanized Text: Manna qimirruaqtaujungaalaq tapkuau ilijaugajarmangaata taikkua imarmut kuvisut qaptingijuut iglurjuap iluani.
Translated Text: The weather is very cold today, so I am wearing a warm coat and thick boots.
Actual translation:  It is being l

Unnamed: 0,src_txt,tgt_txt,rom_txt,trans_txt
0,ᐃᓚᖏᑦ ᑲᑎᒪᔩᑦ: ᐊᖏᖅᐳᒍᑦ.,Some Members: Agreed.,Ilangiit katimajut: angiqpugut.,The weather is nice today.
1,"ᐃᒃᓯᕙᐅᑖᖅ, ᑲᔪᓯᔪᒪᔪᒍᑦ ᕿᒥᕐᕈᓂᑦᑎᓐᓂ ᒪᓕᒐᒃᓴᖅ 25, ᑭᖑᓂᐊᒍᑦ ...","Mr. Chairman, we wish to continue with the rev...","iksiwautaq, kajusijumajugut qimirurnititnii ma...","Today, I am 25 years old, and tomorrow I will ..."
2,ᑎᑎᖅᑲᒃᑯᑦ ᐊᐱᖅᑯᑦ 031 - 4 (3): ᐱᔮᖅᑯᑕᒥᒃ ᓱᕋᐃᔭᕈᑎᒥᓃᑦ ᐃ...,Written Question 031 - 4 (3): Vandalism-relate...,Titiqkaakut apiqkut 031 - 4 (3): Piiyaqqutkami...,"The article titled ""Piiyaqqutkamik suraiyartim..."
3,ᒫᓐᓇ ᕿᒥᕐᕈᐊᖅᑕᐅᔫᒐᓗᐊᖅ ᑖᑉᑯᐊ ᐃᓕᔭᐅᒐᔭᕐᒪᖔᑕ ᑕᐃᒃᑯᐊ ᐃᒪᕐᒧᑦ ...,It is being looked at right now to see if the ...,Manna qimirruaqtaujungaalaq tapkuau ilijaugaja...,"The weather is very cold today, so I am wearin..."
4,ᑲᑎᒪᔨᕋᓛᑦ ᑐᒃᓯᕌᖃᖅᐳᑦ ᒐᕙᒪᒃᑯᑦ ᑭᒡᒍᑎᓂᒃ ᓴᖅᑭᑦᑎᓗᑎᒃ ᐅᑯᓄᖓ ᐅ...,"Mr. Speaker, I would note for the record that ...",Katimayiralat tuksumaat qavamakkut kigjutinik ...,The 91 (5) Nunavut Land Claims Agreement was s...
5,ᒪᓕᒐᓕᐅᖅᑎ ᐃᖃᓗᐃᑦ ᓂᐊᖁᓐᖑ ᐅᓂᒃᑲᖅᑕᒥᓂᒃ ᐱᐊᓂᒃᓯᖏᒻᒪᑦ ᐱᐊᓂᒍᒪᓪ...,The Member for Iqaluit-Niaqunnguu is seeking u...,Maliigaliuqti iqaluit niaqungu unikkatamnik pi...,What time does the flight from Iqaluit arrive ...
6,326 - 4 (3): ᐃᓕᓴᖅᓯᓂᖅ ᓇᒻᒥᓂᖃᖅᑐᑦ ᐃᑲᔫᑎᖃᖃᑦᑕᕐᓂᖏᓐᓂᒃ ᑮ...,326 - 4 (3): Recognizing Private Sector Contri...,Romanization: Ilsaqsinngit namminiqatuq ikayut...,Translation: The number 326 minus 4 times 3 eq...
7,ᒪᓕᒐᓕᐅᖅᑏᑦ ᐅᐸᒃᑐᑦ:,Members Present:,Maligaliuqtit upaktut:,The weather is nice.
8,"2016-ᖑᑎᓪᓗᒍ ᕉᑦ, ᔮᓐ ᐊᕐᓈᓗᔾᔪᐊᖅ ᐃᓕᓐᓂᐊᕐᕕᖓᓂᑦ ᐃᓄᒃᑎᑐᖅ ᓯ...",In 2016 Ruth was John Arnalukjuak High School'...,"2016-ngutilluq rut, yan arnalujjaq ilinniarvin...","In 2016, there was a big celebration, and peop..."
9,"ᐊᖓᔪᖅᑳᖅ ᔫ ᓴᕕᑲᑖᖅ (ᑐᓵᔨᑎᒍᑦ): ᖁᔭᓐᓇᒦᒃ, ᐃᒃᓯᕙᐅᑖᖅ, ᑕᓕᖅᐱ...","Hon. Joe Savikataaq: Thank you, Mr. Chairman. ...",Angajuqkaq ju savikataq (tusayitiguut): qujann...,"Thank you, I am happy, I want to go to the sto..."


In [31]:
res_df = pd.read_pickle('results1.pkl')
bleu_scores = []
for idx, row in res_df.iterrows():
    reference = row['tgt_txt'].split()
    prediction = row['trans_txt'].split()
    
    bleu = sentence_bleu([reference], prediction)
    bleu_scores.append(bleu)


res_df['bleu_scores'] = bleu_scores

display(res_df)
avg_bleu = sum(bleu_scores) / len(bleu_scores)
print(avg_bleu)
max_bleu = max(bleu_scores)
print(max_bleu)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Unnamed: 0,src_txt,tgt_txt,rom_txt,trans_txt,bleu_scores
0,ᐃᓚᖏᑦ ᑲᑎᒪᔩᑦ: ᐊᖏᖅᐳᒍᑦ.,Some Members: Agreed.,Ilangiit katimajut: angiqpugut.,The weather is nice today.,0.0
1,"ᐃᒃᓯᕙᐅᑖᖅ, ᑲᔪᓯᔪᒪᔪᒍᑦ ᕿᒥᕐᕈᓂᑦᑎᓐᓂ ᒪᓕᒐᒃᓴᖅ 25, ᑭᖑᓂᐊᒍᑦ ...","Mr. Chairman, we wish to continue with the rev...","iksiwautaq, kajusijumajugut qimirurnititnii ma...","Today, I am 25 years old, and tomorrow I will ...",0.0
2,ᑎᑎᖅᑲᒃᑯᑦ ᐊᐱᖅᑯᑦ 031 - 4 (3): ᐱᔮᖅᑯᑕᒥᒃ ᓱᕋᐃᔭᕈᑎᒥᓃᑦ ᐃ...,Written Question 031 - 4 (3): Vandalism-relate...,Titiqkaakut apiqkut 031 - 4 (3): Piiyaqqutkami...,"The article titled ""Piiyaqqutkamik suraiyartim...",0.0
3,ᒫᓐᓇ ᕿᒥᕐᕈᐊᖅᑕᐅᔫᒐᓗᐊᖅ ᑖᑉᑯᐊ ᐃᓕᔭᐅᒐᔭᕐᒪᖔᑕ ᑕᐃᒃᑯᐊ ᐃᒪᕐᒧᑦ ...,It is being looked at right now to see if the ...,Manna qimirruaqtaujungaalaq tapkuau ilijaugaja...,"The weather is very cold today, so I am wearin...",7.551759e-232
4,ᑲᑎᒪᔨᕋᓛᑦ ᑐᒃᓯᕌᖃᖅᐳᑦ ᒐᕙᒪᒃᑯᑦ ᑭᒡᒍᑎᓂᒃ ᓴᖅᑭᑦᑎᓗᑎᒃ ᐅᑯᓄᖓ ᐅ...,"Mr. Speaker, I would note for the record that ...",Katimayiralat tuksumaat qavamakkut kigjutinik ...,The 91 (5) Nunavut Land Claims Agreement was s...,7.336127e-156
5,ᒪᓕᒐᓕᐅᖅᑎ ᐃᖃᓗᐃᑦ ᓂᐊᖁᓐᖑ ᐅᓂᒃᑲᖅᑕᒥᓂᒃ ᐱᐊᓂᒃᓯᖏᒻᒪᑦ ᐱᐊᓂᒍᒪᓪ...,The Member for Iqaluit-Niaqunnguu is seeking u...,Maliigaliuqti iqaluit niaqungu unikkatamnik pi...,What time does the flight from Iqaluit arrive ...,0.0
6,326 - 4 (3): ᐃᓕᓴᖅᓯᓂᖅ ᓇᒻᒥᓂᖃᖅᑐᑦ ᐃᑲᔫᑎᖃᖃᑦᑕᕐᓂᖏᓐᓂᒃ ᑮ...,326 - 4 (3): Recognizing Private Sector Contri...,Romanization: Ilsaqsinngit namminiqatuq ikayut...,Translation: The number 326 minus 4 times 3 eq...,6.68635e-232
7,ᒪᓕᒐᓕᐅᖅᑏᑦ ᐅᐸᒃᑐᑦ:,Members Present:,Maligaliuqtit upaktut:,The weather is nice.,0.0
8,"2016-ᖑᑎᓪᓗᒍ ᕉᑦ, ᔮᓐ ᐊᕐᓈᓗᔾᔪᐊᖅ ᐃᓕᓐᓂᐊᕐᕕᖓᓂᑦ ᐃᓄᒃᑎᑐᖅ ᓯ...",In 2016 Ruth was John Arnalukjuak High School'...,"2016-ngutilluq rut, yan arnalujjaq ilinniarvin...","In 2016, there was a big celebration, and peop...",1.2627080000000001e-231
9,"ᐊᖓᔪᖅᑳᖅ ᔫ ᓴᕕᑲᑖᖅ (ᑐᓵᔨᑎᒍᑦ): ᖁᔭᓐᓇᒦᒃ, ᐃᒃᓯᕙᐅᑖᖅ, ᑕᓕᖅᐱ...","Hon. Joe Savikataaq: Thank you, Mr. Chairman. ...",Angajuqkaq ju savikataq (tusayitiguut): qujann...,"Thank you, I am happy, I want to go to the sto...",3.887982e-155


0.00857110320808455
0.2240750868020436
