# 1. Load dataset

In [357]:
from datasets import list_datasets, load_dataset

# Load the "pqa_labeled" configuration of the "pubmed_qa" dataset (train split only).
dataset = load_dataset("pubmed_qa", "pqa_labeled", split="train")
# Print the dataset summary (feature names and row count).
print(dataset)

397 prompt tokens used.
Done >>> 2023-06-27 14:13:20.579767
566 prompt tokens used.
Done >>> 2023-06-27 14:13:21.059638
1649 prompt tokens used.
Done >>> 2023-06-27 14:13:22.731170
469 prompt tokens used.
Done >>> 2023-06-27 14:13:22.801132


Found cached dataset pubmed_qa (/Users/giangnguyen/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924)


1402 prompt tokens used.
Done >>> 2023-06-27 14:13:23.147895
Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
    num_rows: 1000
})


In [358]:
# Preview the first 10 records; to_list() converts the whole dataset before slicing.
dataset.to_list()[:10]

255 prompt tokens used.
Done >>> 2023-06-27 14:13:23.251909


[{'pubid': 21645374,
  'question': 'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?',
  'context': {'contexts': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.',
    'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD)

## Use third-party libs for translation & openai setup

In [359]:
import os
from dotenv import load_dotenv
import translators as ts
import openai

# Load variables from a .env file so the os.getenv() lookups below can see them.
load_dotenv()

# Take the OpenAI API key from the environment rather than hardcoding it.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [360]:
import requests
import uuid

# Azure Translator credentials and endpoint.
# SECURITY: the subscription key used to be hardcoded here. It is now read from
# the environment (set AZURE_TRANSLATION_KEY in your .env file), the same way
# the endpoint and the OpenAI key are. The previously committed key should be
# treated as leaked and rotated in the Azure portal.
key = os.getenv("AZURE_TRANSLATION_KEY")
endpoint = os.getenv("AZURE_TRANSLATION_ENDPOINT")

# Location, also known as region.
# Required if you're using a multi-service or regional (not global) resource.
# It can be found in the Azure portal on the Keys and Endpoint page.
location = "australiaeast"
print(location)

# Query-string parameters sent with every translation request: English -> Vietnamese.
params = {
    'api-version': '3.0',
    'from': 'en',
    'to': ['vi']
}

# HTTP headers sent with every translation request.
headers = {
    'Ocp-Apim-Subscription-Key': key,
    'Ocp-Apim-Subscription-Region': location,
    'Content-type': 'application/json',
    'X-ClientTraceId': str(uuid.uuid4())
}


def translate_text(text):
    """Translate `text` with the Azure Translator REST endpoint.

    The request is sent to the module-level `endpoint` with the module-level
    `params` (currently English -> Vietnamese) and `headers`.

    Args:
        text: The string to translate.

    Returns:
        The text of the first translation of the first item in the response.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status
            (e.g. bad credentials or rate limiting).
        requests.Timeout: If the service does not answer within the timeout.
    """
    body = [{'text': text}]

    # The trace id is meant to identify a single request, so generate a fresh
    # one per call instead of reusing the one created with `headers`.
    request_headers = {**headers, 'X-ClientTraceId': str(uuid.uuid4())}

    # A timeout keeps a stalled connection from blocking a worker thread forever.
    response = requests.post(
        endpoint,
        params=params,
        headers=request_headers,
        json=body,
        timeout=30,
    )
    # Error payloads are not a list of translations; fail with the HTTP status
    # instead of an opaque KeyError when indexing the payload below.
    response.raise_for_status()
    payload = response.json()

    return payload[0]['translations'][0]['text']

australiaeast


# 2. Chat Completion API

In [361]:
def build_chat_prompt(text):
    """Assemble the two-message chat prompt used to translate `text`.

    `text` is first passed through translate_text(); that result is placed in
    the "Context:" slot of the user message and the original `text` in the
    "Input:" slot.

    Args:
        text: English source text.

    Returns:
        A list with a system message dict followed by a user message dict.
    """
    draft_translation = translate_text(text)

    system_message = {"role": "system", "content": """
         Act as a professional translator that can translate scientific sentences from English to Vietnamese based on a context.
         Do not translate any terms/phrases/acronyms you don't know.
         Only return the Vietnamese text after translation and no additional text or explanation.
         """}
    user_message = {"role": "user", "content": f"""
        Context: {draft_translation}
        Input: {text}"""}

    return [system_message, user_message]

In [362]:
from datetime import datetime


def chat_translate(text):
    """Translate `text` through the OpenAI chat completion API.

    Prints the number of prompt tokens used and a completion timestamp.

    Args:
        text: English source text.

    Returns:
        The content of the first choice's message, or the string
        "Error: <message>" if anything in the process raises.
    """
    try:
        messages = build_chat_prompt(text)
        request_options = dict(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0.3,
            top_p=1.0,
            frequency_penalty=0.3,
            presence_penalty=0.0,
        )
        completion = openai.ChatCompletion.create(**request_options)

        prompt_tokens = completion["usage"]["prompt_tokens"]
        print(f'{prompt_tokens} prompt tokens used.')
        print(f"Done >>> {datetime.now()}")

        reply = completion.choices[0]["message"]
        return reply.content
    except Exception as error:
        # Best effort: the failure is reported to the caller as text, not raised.
        return f"Error: {error}"

### Sample translation

In [363]:
# Try the chat translation on the first question of the dataset and display the result.
chat_output = chat_translate(dataset[0]['question'])
chat_output

157 prompt tokens used.
Done >>> 2023-06-27 14:13:25.952811
133 prompt tokens used.
Done >>> 2023-06-27 14:13:26.142065


'Có phải ty thể đóng một vai trò trong việc tái cơ cấu lá cây ren trong quá trình chết tế bào được lập trình không?'

In [364]:
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

translated_dataset = []


def translate_sample(sample):
    """Translate the text fields of one dataset record.

    Prints the record's `pubid` as a progress marker. `question`, the joined
    context passages and `long_answer` are each sent through chat_translate();
    `pubid` and `final_decision` are copied unchanged. Because chat_translate()
    returns an "Error: ..." string on failure, a failed field carries that text.

    Args:
        sample: A record dict with keys 'pubid', 'question', 'context'
            (containing a 'contexts' list of strings), 'long_answer' and
            'final_decision'.

    Returns:
        A dict with the same five keys holding the translated/copied values.
    """
    print(sample['pubid'])

    # Separate the passages with a real newline. The previous "/n" was a typo
    # that glued them together with a literal slash followed by "n".
    joined_context = "\n".join(sample["context"]["contexts"])

    return {
        'pubid': sample['pubid'],
        'question': chat_translate(sample['question']),
        'context': chat_translate(joined_context),
        'long_answer': chat_translate(sample['long_answer']),
        'final_decision': sample['final_decision'],
    }


# Batch settings: translate up to `num_samples` records, `step` at a time.
step = 100
start = 0
num_samples = 1000
index = 1

# Convert the dataset once; doing it inside the loop repeated the same work
# on every round.
all_samples = dataset.to_list()
# Never schedule rounds past the end of the data.
total = min(num_samples, len(all_samples))

while start < total:
    end = min(start + step, total)
    samples_to_translate = all_samples[start:end]

    # Translate the records of this round concurrently; map() keeps input order.
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(translate_sample, samples_to_translate))

    # Append this round to the CSV so finished rounds survive an interruption.
    # Note: rows are appended without a header, and re-running the cell
    # appends to the existing file.
    df = pd.DataFrame(results)
    df.to_csv('translated_dataset_pubmedqa.csv', index=False, mode='a', header=False)
    start += step

    print("Done round " + str(index) + " ! 🚀")
    index += 1


2164537416418930

9488747
17208539
10808977
23831910
26037986
26852225
17113061
10966337
25432938
18847643
18239988
25957366
24866606
26578404
120 prompt tokens used.
Done >>> 2023-06-27 14:13:28.060972
117 prompt tokens used.
Done >>> 2023-06-27 14:13:28.177029
111 prompt tokens used.
Done >>> 2023-06-27 14:13:28.201693
139 prompt tokens used.
Done >>> 2023-06-27 14:13:28.377367
118 prompt tokens used.
Done >>> 2023-06-27 14:13:28.466010
128 prompt tokens used.
Done >>> 2023-06-27 14:13:28.530739
133 prompt tokens used.
Done >>> 2023-06-27 14:13:28.539869
130 prompt tokens used.
Done >>> 2023-06-27 14:13:28.723069
283 prompt tokens used.
Done >>> 2023-06-27 14:13:28.944111
143 prompt tokens used.
Done >>> 2023-06-27 14:13:28.944908
152 prompt tokens used.
Done >>> 2023-06-27 14:13:29.121160
146 prompt tokens used.
Done >>> 2023-06-27 14:13:29.181732
157 prompt tokens used.
Done >>> 2023-06-27 14:13:29.216390
168 prompt tokens used.
Done >>> 2023-06-27 14:13:29.503839
135 prompt tokens


KeyboardInterrupt



# 3. Davinci instruct beta API

In [None]:
def build_instruct_prompt(text, context):
    """Build the single-string prompt for the instruct completion model.

    Args:
        text: English source text, inserted after "Input: ".
        context: Accepted for interface compatibility; not used in the prompt.

    Returns:
        The instruction lines followed by the input and an open "Output: "
        slot for the model to complete.
    """
    indent = " " * 9
    prompt_lines = (
        "Act as a professional translator specialized in natural science that translates input text from English to Vietnamese concisely.",
        "Do not translate words/terms you don't know, keep them in English.",
        "Only return the Vietnamese output after translation and no additional text or explanation.",
        f"Input: {text} \n\n Output: ",
    )
    # Every line, including the first, is preceded by a newline and the indent.
    return "".join(f"\n{indent}{line}" for line in prompt_lines)


def instruct_translate(prompts):
    """Send `prompts` to the OpenAI completion API and return the generated text.

    Prints the number of prompt tokens used on success.

    Args:
        prompts: The prompt passed to the completion request.

    Returns:
        The text of the first choice, or None if the request raised (the
        error message is printed in that case).
    """
    try:
        completion_options = dict(
            model="davinci-instruct-beta",
            prompt=prompts,
            temperature=0,
            top_p=1.0,
            frequency_penalty=0.3,
            presence_penalty=0.0,
            max_tokens=1000,
        )
        response = openai.Completion.create(**completion_options)

        prompt_tokens = response["usage"]["prompt_tokens"]
        print(f'{prompt_tokens} prompt tokens used.')

        first_choice = response.choices[0]
        return first_choice.text
    except Exception as error:
        # Best effort: report the failure and fall through to a None result.
        print(f"Error: {error}")
        return None


### Sample translation

In [None]:
# Build a prompt from the first record's long answer (the first context passage
# is passed as the second argument) and display the instruct model's output.
instruct_messages = build_instruct_prompt(dataset[0]["long_answer"], dataset[0]["context"]["contexts"][0])
instruct_output = instruct_translate(instruct_messages)
instruct_output

Note: a single request seems to work, but sending many requests in quick succession (this library uses Bing Translate) returns a `too many requests` error.

In [None]:
# from concurrent.futures import ThreadPoolExecutor
# import pandas as pd
# from datetime import datetime
# translated_questions=[]
# translated_answers=[]
#
# for i in range(min(len(dataset), 100)):
#     try:
#         question = dataset[i]["question"]
#         answer = dataset[i]["long_answer"]
#         translated_question = ts.translate_text(question, from_language='en', to_language='vi')
#         translated_answer = ts.translate_text(answer, from_language='en', to_language='vi')
#         print('successsss 🎊')
#         translated_questions.append(translated_question)
#         translated_answers.append(translated_answer)
#     except:
#         translated_questions.append("Error")
#         translated_answers.append("Error")
# dataset_1=dataset.select(range(min(len(dataset), 100)))
#
# dataset_1 = dataset_1.add_column("translated_question", translated_questions)
# dataset_1 = dataset_1.add_column("translated_answers", translated_answers)
#
# df = pd.DataFrame(dataset_1)
# df.to_csv("translated_pubmedqa_1.csv", index=False)
