In [1]:
from datasets import load_dataset
# Load the "en-id" configuration of the Helsinki-NLP/opus-100 dataset.
raw_datasets = load_dataset(path="Helsinki-NLP/opus-100", name="en-id")
# Bare trailing expression so the notebook renders the DatasetDict summary.
raw_datasets

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

In [2]:
from transformers import MBart50TokenizerFast

# Checkpoint identifier handed to from_pretrained below.
model_mbart = 'facebook/mbart-large-50-one-to-many-mmt'

# Tokenizer configured with English as source and Indonesian as target language.
tokenizer = MBart50TokenizerFast.from_pretrained(
    model_mbart,
    src_lang="en_XX",
    tgt_lang="id_ID",
)

# Data Augmentation #

In [None]:
from nltk.corpus import wordnet
import string
import random

def ocr_error(text, probability=0.1, tokenize=None, rng=None):
  """
  Introduces random character errors (substitutions, insertions, deletions) to simulate OCR errors.

  The text is split into tokens; each token longer than one character is
  corrupted with the given probability by substituting, inserting, or
  deleting a single character at a random position.

  Args:
    text: Input string to corrupt.
    probability: Per-token chance of introducing an error (default 0.1).
    tokenize: Optional callable mapping ``text`` to a list of string tokens.
      When omitted, the notebook-level ``tokenizer.tokenize`` is used with
      ``max_length=512, truncation=True``.
    rng: Optional source of randomness exposing ``random()``, ``choice()``
      and ``randrange()`` (e.g. ``random.Random(42)``) for reproducible
      augmentation. Defaults to the global ``random`` module.

  Returns:
    The corrupted text, with SentencePiece word-boundary markers turned back
    into spaces.
  """
  word_marker = "\u2581"  # SentencePiece word-boundary symbol ("▁")
  error_types = ("substitute", "insert", "delete")

  if rng is None:
    rng = random
  if tokenize is None:
    tokens = tokenizer.tokenize(text, max_length=512, truncation=True)
  else:
    tokens = list(tokenize(text))

  for i, token in enumerate(tokens):
    if rng.random() >= probability:
      continue
    if len(token) <= 1:
      # Skip single-character tokens so a deletion can never empty a token.
      continue

    error_type = rng.choice(error_types)
    position = rng.randrange(len(token))

    if error_type == "substitute":
      # Replace the character at `position` with a random ASCII letter.
      tokens[i] = token[:position] + rng.choice(string.ascii_letters) + token[position + 1:]
    elif error_type == "insert":
      # Insert a random ASCII letter before the character at `position`.
      tokens[i] = token[:position] + rng.choice(string.ascii_letters) + token[position:]
    else:
      # Delete the character at `position`; the token keeps >= 1 character.
      tokens[i] = token[:position] + token[position + 1:]

  # Joining the pieces leaves word-boundary markers in the string; convert
  # them to ordinary spaces so the result is plain text again.
  return "".join(tokens).replace(word_marker, " ").strip()

# Text Normalization #

In [14]:
import re
def normalize_text(text):
    """Lowercase ``text`` and return it; no other normalization is applied."""
    lowered = text.lower()
    # Optional extra steps, currently disabled:
    #   lowered = re.sub(r"[^\w\s]", "", lowered)  # drop punctuation
    #   lowered = re.sub(r"\s+", " ", lowered)     # collapse whitespace
    return lowered

In [11]:
# Slice the rows first and pick the column second: this yields the same list
# of ten translation dicts without selecting the entire 1M-row column.
raw_datasets["train"][10:20]["translation"]

[{'en': 'Hut!', 'id': 'Pondok!'},
 {'en': 'Whoa, whoa, whoa.', 'id': 'Whoa, whoa, whoa.'},
 {'en': 'Cut it out.', 'id': 'Hentikan itu.'},
 {'en': 'The law is clear.', 'id': 'Hukumnya sangat jelas.'},
 {'en': 'You coming up?', 'id': 'Kau mau ikutan?'},
 {'en': 'Cheers.', 'id': 'Bersulang.'},
 {'en': 'And to Him belongs whoever is in the heavens and earth.',
  'id': 'Dan kepunyaan-Nya-lah siapa saja yang ada di langit dan di bumi.'},
 {'en': 'From your pocket.', 'id': 'Dari sakumu.'},
 {'en': 'Mm.', 'id': 'Hmm.'},
 {'en': 'My men will attack his southern border.',
  'id': 'Pasukanku akan menyerang perbatasan selatan negaranya.'}]

In [15]:
# Build the normalized source/target lists in a single pass over the training
# pairs instead of traversing the full "translation" column twice.
inputs, targets = [], []
for pair in raw_datasets["train"]["translation"]:
    inputs.append(normalize_text(pair["en"]))
    targets.append(normalize_text(pair["id"]))

# PreProcessing #

In [None]:
# Keys of each translation dict: English is the source, Indonesian the target.
source_lang, target_lang = "en", "id"
def preprocess(data):
  """Tokenize one batch of translation pairs for seq2seq training.

  Args:
    data: Batch whose ``"translation"`` entry is a list of dicts keyed by
      language code (``source_lang`` / ``target_lang``).

  Returns:
    The tokenizer output for the OCR-noised source sentences, with the
    token ids of the target sentences stored under ``"labels"``.
  """
  pairs = data["translation"]
  inputs = [pair[source_lang] for pair in pairs]
  targets = [pair[target_lang] for pair in pairs]

  # Apply augmentation to source language text only; targets stay clean.
  augmented_inputs = [ocr_error(text) for text in inputs]

  # `text_target` tokenizes the targets in target-language mode and stores
  # their ids under "labels"; it replaces the deprecated
  # `tokenizer.as_target_tokenizer()` context manager.
  model_inputs = tokenizer(augmented_inputs, text_target=targets, truncation=True)
  return model_inputs

# Run `preprocess` over every split, handing it batches of examples.
tokenized_datasets = raw_datasets.map(function=preprocess, batched=True)

In [None]:
# Training subset: shuffle with a fixed seed, then keep the first 500,000 rows.
small_train_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)
    .select(range(500_000))
)
# Evaluation set: the whole test split, shuffled with the same seed.
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42)

# Batch translation with prefix #

In [None]:
from transformers import MarianMTModel, MarianTokenizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import LineTokenizer
import math

import torch

# Raw string: the Windows-style backslashes are kept literally instead of
# being parsed as (invalid) escape sequences such as "\M" or "\o".
model_path = r'..\Machine Learning\model\opus-mt-en-id-finetuned-en-to-id'

# NOTE: this rebinds the global `tokenizer` (the mBART tokenizer from the
# earlier cell); re-running `preprocess` after this cell would use Marian's.
tokenizer = MarianTokenizer.from_pretrained(model_path)
model = MarianMTModel.from_pretrained(model_path)

# Use the GPU when one is available, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Sample English paragraph to translate, written one sentence per literal;
# adjacent string literals are concatenated into a single line of text.
input_text = (
    "In the development of our translation application, we have decided to use a web-based approach. "
    "We will implement this application with a client-server architecture, where the client will interact with the application through a user-friendly User Interface (UI), while the server will handle requests from the client, process them, and return the results. "
    "We chose web development frameworks, such as Flask for the backend and Streamlit for the frontend, to speed up and simplify the application development process. "
    "The UI will be designed with a focus on ease of use and intuitive navigation, with responsive design in mind so that the application can be used comfortably on a variety of devices, including desktop and mobile devices."
)
# input_text = "In the development of our translation application, we have decided to use a web-based approach. We will implement this application with a client-server architecture, where the client will interact with the application through a user-friendly User Interface (UI), while the server will handle requests from the client, process them, and return the results. We chose a web development framework, Streamlit UI will be designed with a focus on ease of use and intuitive navigation, with responsive design in mind so that the application can be used comfortably on various devices, including desktop and mobile devices. Our app will offer three main services: OCR, Machine Translation (MT), and a combination of OCR & MT. Each service will have its own UI that allows users to select the desired model and submit input. For OCR services, users will be able to select the desired OCR model and upload PDF or Word format documents as input. The application will process the uploaded file using the selected OCR model and display the results to the user or allow the user to download the results. Next, for the Machine Translation (MT) service, the user will select the desired MT model and select the source language and target language for translation. The user will then enter text as input through the UI, and the application will translate the text using the selected MT model, displaying the results to the user. Lastly is the combination service (OCR → MT), where the user will select the desired OCR and MT models, as well as the source language and target language for translation. The user will upload a PDF or Word format document as input, and the application will use the OCR model to recognize the text from the uploaded file, then translate the text using the selected MT model. The translation results will be displayed to the user or can be downloaded."

lt = LineTokenizer()
batch_size = 8  # number of sentences sent to the model per generate() call

# Translate paragraph by paragraph so the original line breaks survive.
paragraphs = lt.tokenize(input_text)
translated_paragraphs = []
prefix = ">>ind<< "  # prepended to every source sentence

for paragraph in paragraphs:
    sentences = sent_tokenize(paragraph)
    translated = []
    for start in range(0, len(sentences), batch_size):
        sent_batch = [prefix + sentence for sentence in sentences[start:start + batch_size]]
        # Place the inputs on whatever device the model lives on rather than
        # assuming CUDA is present.
        model_inputs = tokenizer(
            sent_batch, return_tensors="pt", padding=True, truncation=True, max_length=500
        ).to(model.device)
        translated_batch = model.generate(**model_inputs)
        # Decode each batch right away instead of accumulating raw tensors.
        translated.extend(tokenizer.batch_decode(translated_batch, skip_special_tokens=True))
    translated_paragraphs.append(" ".join(translated))

translated_text = "\n".join(translated_paragraphs)