In [1]:
import json
import numpy as np
import pandas as pd
import pprint as p
from wtpsplit import SaT

**This notebook walks through the pipeline for processing the English (en) and Vietnamese (vi) text data.**

# Segment the text data into sentences so that each sentence can be translated into Vietnamese

In [None]:
# Path to the PubTator corpus JSON file.
json_path = "../data/json/20250509_corpus_pubtator_output.json"
# The context manager closes the handle as soon as parsing finishes (the bare
# open() leaked it); the explicit encoding avoids depending on the OS locale.
with open(json_path, encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Build the SaT sentence segmenter once; the segmentation loop below reuses it.
sat_model_name = "sat-12l-sm"
sat_sm = SaT(sat_model_name)

In [None]:
# Rewrite each record's text as one sentence per line ("\n"-separated),
# then persist the segmented corpus.

for idx in range(len(data)):
    print(idx)
    record_fields = data[idx]["data"]

    sentences = sat_sm.split(record_fields["text"], strip_whitespace=True)
    record_fields["text"] = "\n".join(sentences)

with open("../data/json/sentence_pubtator_output.json", "w") as outfile:
    outfile.write(json.dumps(data, indent=2))

# Translation. Do not re-run unless reproducing from scratch: the translation has already been done. Model: vinai/vinai-translate-en2vi-v2

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import sys
import json
import pandas as pd
import pprint
import torch

In [None]:
import os

# Make CUDA kernel launches synchronous so a device-side error is reported at
# the call that caused it. Set here, before the model is moved to the GPU.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# 1. Load the tokenizer and model for en→vi translation
EN2VI_MODEL_NAME = "vinai/vinai-translate-en2vi-v2"
tokenizer_en2vi = AutoTokenizer.from_pretrained(
    EN2VI_MODEL_NAME,
    src_lang="en_XX"
)
model_en2vi = AutoModelForSeq2SeqLM.from_pretrained(EN2VI_MODEL_NAME)

# 2. Pick the device. Using CPU is not recommended, but falling back to it
#    (with a warning) is better than crashing on a machine without CUDA.
if torch.cuda.is_available():
    device_en2vi = torch.device("cuda")
else:
    print("WARNING: CUDA is not available; falling back to CPU (translation will be very slow).")
    device_en2vi = torch.device("cpu")

# Assigning the result keeps the cell from printing the whole model repr.
model_en2vi = model_en2vi.to(device_en2vi)

In [None]:
def translate_en2vi(en_text: str, max_input_tokens: int = 1024) -> str:
    """
    Translate English text (normally a single sentence) into Vietnamese.

    Uses the notebook-level ``tokenizer_en2vi``, ``model_en2vi`` and
    ``device_en2vi`` objects.

    Args:
        en_text: English text to translate.
        max_input_tokens: Largest tokenized input sent to the model in one
            piece. Longer inputs are split on ". " and translated in halves;
            an input that cannot be split is truncated to this length.

    Returns:
        The Vietnamese translation. Empty or whitespace-only input returns ""
        without calling the model, so per-line sentence counts stay aligned.
    """
    if not en_text.strip():
        return ""

    # Tokenize the input text (a single sequence, so padding adds nothing).
    inputs = tokenizer_en2vi(
        en_text,
        return_tensors="pt",
        padding=True
    )

    if inputs["input_ids"].shape[1] > max_input_tokens:
        sents = en_text.split(". ")
        if len(sents) > 1:
            # Split at the midpoint so both halves are strictly shorter than
            # the input; the previous fixed split at 5 recursed forever when
            # the text had 5 or fewer pieces.
            mid = len(sents) // 2
            # split() dropped the ". " separators: restore the period that
            # ended the first half. The second half keeps its own ending.
            halves = [". ".join(sents[:mid]) + ".", ". ".join(sents[mid:])]
            return " ".join(
                translate_en2vi(half, max_input_tokens)
                for half in halves
                if half.strip()
            )
        # A single over-long piece cannot be split further: truncate it.
        inputs = tokenizer_en2vi(
            en_text,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_tokens
        )

    # Generate translation with beam search
    output_ids = model_en2vi.generate(
        inputs.input_ids.to(device_en2vi),
        decoder_start_token_id=tokenizer_en2vi.lang_code_to_id["vi_VN"],
        num_return_sequences=1,
        num_beams=5,
        early_stopping=True,
    )
    # Decode the generated IDs back to text
    return tokenizer_en2vi.batch_decode(
        output_ids,
        skip_special_tokens=True
    )[0]

In [None]:
path = "../data/json/sentence_pubtator_output.json"
# Context manager so the file handle is closed once the JSON is parsed.
with open(path, encoding="utf-8") as f:
    data = json.load(f)
# Sample: translate only the first 100 data points (each may hold several sentences)
translate = data[0:100]

In [None]:
total = len(translate)

# Translate each record sentence by sentence and store the result next to the
# English text, keeping one sentence per line.
for position, record in enumerate(translate, start=1):
    print(position / total * 100)
    fields = record["data"]
    fields["vi_text"] = "\n".join(
        [translate_en2vi(sentence) for sentence in fields["text"].split("\n")]
    )

In [None]:
# Current chunk: records 0 -> 99

out_path = "../data/json/translation0-99.json"
serialized = json.dumps(translate, indent=2)
with open(out_path, "w") as outfile:
    outfile.write(serialized)

# Replace "\n" with "\n\n" for better rendering in Label Studio, and shift the label indices accordingly

**Tips for reproducing the result:**

The three data points containing the following phrases were translated manually afterward: "Differential transcriptome expression in human nucleus", "Patient - Physician Discordance in Global Assessment", and "Bioinformatic analysis of RNA-seq data".

For reproduction, start from the beginning of this notebook, and redo the translation.

In [2]:
# Load and concatenate all translation chunks, in order.
json_path0 = "../data/json/translation0-99.json"
json_path1 = "../data/json/translation100-2999.json"
json_path2 = "../data/json/translation3000-all.json"

data = []
for json_path in (json_path0, json_path1, json_path2):
    # Context manager closes each handle (three were leaked before). UTF-8 is
    # explicit because some records were edited by hand after translation.
    with open(json_path, encoding="utf-8") as f:
        data.extend(json.load(f))
len(data)

4392

In [3]:
# Rename the "text" field to "en_text" and point every predicted label at the
# "en_label" / "en_text" names.
for data_point in data:
    fields = data_point["data"]
    # Guarded so that re-running this cell does not raise KeyError once the
    # field has already been renamed.
    if "text" in fields:
        fields["en_text"] = fields.pop("text")
    for labeling in data_point["predictions"][0]['result']:
        labeling['from_name'] = "en_label"
        labeling['to_name'] = "en_text"

In [4]:
# Report every data point whose English and Vietnamese texts do not contain
# the same number of "\n"-separated sentences.
for idx, record in enumerate(data):
    fields = record["data"]
    n_en = len(fields["en_text"].split("\n"))
    n_vi = len(fields["vi_text"].split("\n"))
    if n_en == n_vi:
        continue
    print("Length mismatch")
    print(f"en: {n_en}")
    print(f"vi: {n_vi}")
    print(f"i: {idx}")

In [None]:
# Shift the start and end index of each label to match the text after every
# "\n" has been doubled to "\n\n" (done in the next cell).
# Doubling inserts one extra character per newline, so an offset moves right
# by the number of "\n" characters that precede it. All newlines are counted,
# not only ".\n", because the replacement doubles every "\n".
# NOTE: run this cell exactly once, and BEFORE the replacement cell; it reads
# the single-newline text and mutates the offsets in place.
for data_point in data:
    single_newline_text = data_point["data"]["en_text"]

    for labeling in data_point["predictions"][0]['result']:
        span = labeling['value']
        # Read both offsets before writing either of them.
        start_index, end_index = span['start'], span['end']
        span['start'] = start_index + single_newline_text.count("\n", 0, start_index)
        # Counted up to `end` so a label that contains a newline stays aligned.
        span['end'] = end_index + single_newline_text.count("\n", 0, end_index)

In [None]:
# Double every "\n" into "\n\n" in both the English and the Vietnamese text.
for record in data:
    fields = record["data"]
    for key in ("en_text", "vi_text"):
        fields[key] = fields[key].replace("\n", "\n\n")

In [None]:
# Write next to the other JSON artefacts using the same relative data directory
# as the rest of the notebook (the Excel section reads this file back from
# this exact relative path) instead of a user-specific absolute path.
with open("../data/json/relabeled_corpus_pubtator_output.json", "w", encoding="utf-8") as outfile:
    json.dump(data, outfile, indent=2)

# Formatting as an excel file

In [None]:
import pandas as pd
import os
import json
import re

In [None]:
# Alternative input: "../data/json/mapped_corpus_pubtator_output.json"
# Context manager so the file handle is closed once the JSON is parsed.
with open("../data/json/relabeled_corpus_pubtator_output.json", encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Build one list per spreadsheet column, with one entry per sentence.
codes = []
editor_names = []
english = []
vietnamese = []
en_word_counts = []
vi_word_counts = []
edited_versions = []
prof_chien_checking = []
notes = []
urls = []

word_pattern = re.compile(r'\b\w+\b')

for (i, data_point) in enumerate(data):
    en_text = data_point["data"]["en_text"]
    vi_text = data_point["data"]["vi_text"]
    en_sents = en_text.split("\n\n")
    vi_sents = vi_text.split("\n\n")

    # Validate BEFORE appending anything: previously the sentence and word-count
    # lists were extended first, so stopping here left them longer than the
    # other columns and the resulting table was misaligned.
    if len(vi_sents) != len(en_sents):
        print(f"Error: {i}th data point has different number of sentences")
        print(f"{len(en_sents)} English: {en_text}")
        print(f"{len(vi_sents)} Vietnamese: {vi_text}")
        break

    sents_len = len(en_sents)
    english.extend(en_sents)
    vietnamese.extend(vi_sents)
    en_word_counts.extend(len(word_pattern.findall(en_sent)) for en_sent in en_sents)
    vi_word_counts.extend(len(word_pattern.findall(vi_sent)) for vi_sent in vi_sents)

    # Every sentence of a data point shares the same item code.
    codes.extend([f"item {i+1:04d}"] * sents_len)

    # Columns filled in manually later start out blank.
    blanks = [''] * sents_len
    editor_names.extend(blanks)
    edited_versions.extend(blanks)
    prof_chien_checking.extend(blanks)
    notes.extend(blanks)
    urls.extend(blanks)

In [None]:
# Create pd DataFrame
# `columns` was not defined anywhere in this notebook, so this cell raised a
# NameError on a fresh kernel.
# NOTE(review): the headers below are placeholders derived from the list names,
# in the same order as the lists passed to pd.DataFrame — replace them with the
# headers the spreadsheet consumers expect.
columns = [
    "en_word_count",
    "vi_word_count",
    "editor_name",
    "english",
    "vietnamese",
    "edited_version",
    "prof_chien_checking",
    "notes",
    "code",
    "url",
]
df = pd.DataFrame([en_word_counts, vi_word_counts, editor_names, english, vietnamese, edited_versions, prof_chien_checking, notes, codes, urls]).T
df.columns = columns
df.head()

In [None]:
# Write pd DataFrame to an excel file
# Relative to the same data directory used for the JSON files, rather than a
# user-specific absolute path.
xlsx_path = "../data/xlsx/translated.xlsx"
sheet_name = "Sheet1"
with pd.ExcelWriter(xlsx_path, engine='xlsxwriter') as writer:
    # The sheet name is passed explicitly so the lookup below cannot drift
    # from what to_excel actually created.
    df.to_excel(writer, sheet_name=sheet_name, index=False)
    workbook = writer.book
    worksheet = writer.sheets[sheet_name]

    # Wrap long sentences and centre cell content vertically.
    format_wrap = workbook.add_format({'text_wrap': True, 'valign': 'vcenter'})

    # One width per column, in DataFrame column order.
    column_widths = [12, 12, 12, 50, 50, 50, 50, 20, 8, 20]

    # zip() stops at the shorter of the two, as before.
    for col_num, width in zip(range(len(df.columns)), column_widths):
        worksheet.set_column(col_num, col_num, width, format_wrap)

    # +1 so the header row is covered as well as the data rows.
    for row_num in range(len(df) + 1):
        worksheet.set_row(row_num, None, format_wrap)