In [None]:
import os

# The Hugging Face libraries resolve their cache directory from HF_HOME when
# they are first imported, so the variable must be set BEFORE `transformers`
# is loaded; assigning it in a later cell does not relocate the cache.
os.environ['HF_HOME'] = 'D:/huggingface_cache'

import torch
import ctranslate2
import transformers
from transformers import AutoTokenizer
import nltk

# Sentence-tokenizer data needed by nltk.tokenize.sent_tokenize.
# 'punkt_tab' is the resource requested by recent NLTK releases; 'punkt' is
# downloaded as well so older NLTK versions keep working.
for resource in ('punkt_tab', 'punkt'):
    nltk.download(resource)

# Quick environment report: GPU visibility plus the optional Hugging Face
# helpers (Accelerate is what `low_cpu_mem_usage=True` relies on when loading).
print(f"Is CUDA available? {torch.cuda.is_available()}")
print(f"Is Accelerate available? {transformers.utils.is_accelerate_available()}")
print(f"Is BitsAndBytes available? {transformers.utils.is_bitsandbytes_available()}")

In [None]:
import os

# NOTE: huggingface_hub resolves HF_HOME when it is first imported, so this
# assignment only relocates the download cache if it runs before
# `transformers` has been imported in the current kernel.
os.environ['HF_HOME'] = 'D:/huggingface_cache'

model_name = "facebook/nllb-200-distilled-1.3B"

# NOTE(review): the directory name says "int8", but the weights are written as
# float16 (see `quantization` below). The path is kept unchanged so anything
# already pointing at it keeps working.
ct2_output_dir = "D:/nllb-1.3B-ct2-int8"

# Conversion downloads and rewrites a 1.3B-parameter checkpoint, which takes
# minutes. Reuse an existing converted model unless a rebuild is requested.
force_reconvert = False

# A finished CTranslate2 conversion always contains `model.bin`.
already_converted = os.path.isfile(os.path.join(ct2_output_dir, "model.bin"))

if already_converted and not force_reconvert:
    print(f"Converted model already present at {ct2_output_dir}; skipping conversion.")
else:
    print("Starting safe conversion... (This will take a few minutes)")

    converter = ctranslate2.converters.TransformersConverter(
        model_name_or_path=model_name,
        load_as_float16=True,     # load the checkpoint in half precision to halve RAM use
        low_cpu_mem_usage=True,   # stream weights instead of materialising them twice
    )

    converter.convert(
        output_dir=ct2_output_dir,
        quantization="float16",
        force=True,               # overwrite a partial or stale output directory
    )

    print(f"Success! Model cleanly saved to {ct2_output_dir}")

In [None]:
ct2_model_path = ct2_output_dir

# Use the GPU when CTranslate2 can see one; otherwise fall back to the CPU
# instead of failing at load time. CTranslate2 converts the stored float16
# weights to a compute type the selected device supports.
device = "cuda" if ctranslate2.get_cuda_device_count() > 0 else "cpu"

print("Loading Tokenizer and CTranslate2 Engine...")
print(f"CTranslate2 device: {device}")

# The tokenizer is configured for English input; the target language is chosen
# per call through the decoder prefix, not here.
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="eng_Latn")
translator = ctranslate2.Translator(ct2_model_path, device=device)

In [None]:
def translate_long_text(text, target_lang="tgl_Latn"):
    """Translate English text of arbitrary length, one sentence at a time.

    The text is split into sentences with NLTK, every sentence is tokenized
    with the module-level ``tokenizer`` and the whole batch is translated by
    the module-level CTranslate2 ``translator``. The translated sentences are
    joined with single spaces.

    Args:
        text: Source text. ``None``, empty or whitespace-only input yields "".
        target_lang: NLLB language code used as the decoder prefix
            (e.g. ``"tgl_Latn"``).

    Returns:
        The translated text as a single string with no leading or trailing
        whitespace.

    Raises:
        ValueError: If ``target_lang`` is not a token known to the tokenizer.
    """
    # Nothing to translate: return before touching the model.
    if not text or not text.strip():
        return ""

    # An unknown language code would be fed to the decoder as an unknown token
    # and silently produce garbage, so reject it up front.
    if tokenizer.convert_tokens_to_ids(target_lang) == tokenizer.unk_token_id:
        raise ValueError(f"Unknown target language code: {target_lang!r}")

    sentences = nltk.tokenize.sent_tokenize(text)

    # CTranslate2 consumes token strings, not ids. `encode` adds the source
    # language tag and the end-of-sentence marker the model expects.
    source_tokens = [
        tokenizer.convert_ids_to_tokens(tokenizer.encode(sent))
        for sent in sentences
    ]
    # One independent prefix list per sentence (no shared inner list).
    target_prefix = [[target_lang] for _ in sentences]

    results = translator.translate_batch(
        source_tokens,
        target_prefix=target_prefix,
        beam_size=1,               # greedy decoding
        repetition_penalty=1.1,    # mild penalty against repeated tokens
        max_decoding_length=200,   # hard cap on generated tokens per sentence
    )

    translated_chunks = []
    for result in results:
        # The first generated token is the forced target-language prefix.
        tokens = result.hypotheses[0][1:]

        text_chunk = tokenizer.decode(
            tokenizer.convert_tokens_to_ids(tokens),
            skip_special_tokens=True,
        )
        text_chunk = text_chunk.replace("<unk>", "").strip()

        # Skip empty chunks so the output never contains doubled spaces.
        if text_chunk:
            translated_chunks.append(text_chunk)

    return " ".join(translated_chunks)

# Sample input: two paragraphs of English text to translate.
# The apostrophe in "program's" was previously stored as the mojibake
# sequence produced by decoding UTF-8 as cp1252; it is restored here so the
# tokenizer receives clean text.
long_article = """
The Ateneo de Manila University School of Law continues to trailblaze in specialized legal education in the Philippines. The Master of Laws (LL.M.) Program allows students to choose a field of concentration from the following areas: Intellectual Property, International Corporate and Business Law, International Economic Law, and Law International Human Rights Law.
 
The LL.M. Program serves as the focal point by which national and international legal scholars can pursue legal research and scholarship, with the LL.M. thesis as the program's major output.
"""

print("\nTranslating...")
# "pag_Latn" is the NLLB code for Pangasinan.
final_result = translate_long_text(long_article, target_lang="pag_Latn")
print(f"\nResult:\n{final_result}")