In [1]:
#!pip install transformers sentencepiece

In [2]:
from transformers import MarianMTModel, MarianTokenizer

def get_model_tokenizer(src_lang="en", tgt_lang="hi"):
    """Load the MarianMT tokenizer and model for one translation direction.

    The checkpoint name is built as
    ``Helsinki-NLP/opus-mt-<src_lang>-<tgt_lang>`` and passed to
    ``from_pretrained`` for the tokenizer first and then the model.

    Args:
        src_lang: Source language code used in the checkpoint name.
        tgt_lang: Target language code used in the checkpoint name.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint = "Helsinki-NLP/opus-mt-{}-{}".format(src_lang, tgt_lang)
    marian_tokenizer = MarianTokenizer.from_pretrained(checkpoint)
    marian_model = MarianMTModel.from_pretrained(checkpoint)
    return marian_tokenizer, marian_model

In [3]:
def translate_text(text, tokenizer, model):
    """Translate ``text`` with the given tokenizer/model pair.

    Args:
        text: Text to translate.
        tokenizer: Callable that encodes ``text`` and exposes ``decode``.
        model: Object exposing ``generate`` that accepts the encoded input
            as keyword arguments.

    Returns:
        The decoded first generated sequence, or ``""`` when ``text`` is an
        empty or whitespace-only string.
    """
    # A blank string carries nothing to translate. Skip the model call so the
    # result is an empty string rather than whatever the model generates.
    if isinstance(text, str) and not text.strip():
        return ""

    # NOTE(review): truncation=True is requested, so input longer than the
    # tokenizer's limit is presumably cut rather than split — confirm before
    # feeding long passages.
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translation = model.generate(**tokens)

    # Only the first generated sequence is decoded.
    return tokenizer.decode(translation[0], skip_special_tokens=True)

In [4]:
# Build the English -> Hindi translator (tokenizer + model).
tokenizer, model = get_model_tokenizer(src_lang="en", tgt_lang="hi")

# Other target codes the author noted as candidates for the same "en-<code>" pattern:
#   hi (Hindi), ta (Tamil), fr / de / es (European), zh (Chinese)

# Sample insurance sentence used as a quick check of the pipeline.
text = "Please submit your claim form within 30 days from the date of incident."
translated = translate_text(text, tokenizer=tokenizer, model=model)

print(f"Original: {text}")
print(f"Hindi: {translated}")



Original: Please submit your claim form within 30 days from the date of incident.
Hindi: इस घटना की तारीख से 30 दिन के अंदर अपना दावा ज़ाहिर कीजिए ।


In [5]:
#!pip install transformers sentencepiece python-docx PyMuPDF

In [6]:
import fitz  # PyMuPDF
import docx
from docx import Document
from transformers import MarianMTModel, MarianTokenizer

In [7]:
# ---------- Load Translation Model ----------
def load_translation_model(src_lang="en", tgt_lang="hi"):
    """Return the ``(tokenizer, model)`` pair for a MarianMT language direction.

    Both objects are loaded with ``from_pretrained`` from the checkpoint
    ``Helsinki-NLP/opus-mt-<src_lang>-<tgt_lang>``; the tokenizer is loaded
    before the model.

    Args:
        src_lang: Source language code inserted into the checkpoint name.
        tgt_lang: Target language code inserted into the checkpoint name.
    """
    hub_id = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    return (
        MarianTokenizer.from_pretrained(hub_id),
        MarianMTModel.from_pretrained(hub_id),
    )

In [8]:
# ---------- Translate Text ----------
def translate_text(text, tokenizer, model):
    """Translate ``text`` and return the decoded result.

    Args:
        text: Text to translate.
        tokenizer: Callable that encodes ``text`` and exposes ``decode``.
        model: Object exposing ``generate`` that accepts the encoded input
            as keyword arguments.

    Returns:
        The decoded first generated sequence, or ``""`` when ``text`` is an
        empty or whitespace-only string.
    """
    # Nothing to translate: return early instead of asking the model to
    # generate output for a blank input.
    if isinstance(text, str) and not text.strip():
        return ""

    # NOTE(review): truncation=True is requested, so over-long input is
    # presumably cut at the tokenizer's limit rather than split — confirm.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**inputs)

    # Only the first generated sequence is decoded.
    return tokenizer.decode(translated[0], skip_special_tokens=True)

In [9]:
# ---------- Extract Text ----------
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.

    Returns:
        Each page's ``get_text()`` output, joined with a newline between pages.
    """
    # The context manager closes the document even if extraction raises;
    # previously the document was opened and never closed.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)

def extract_text_from_docx(docx_path):
    """Return the text of every paragraph of a .docx file as one string.

    Args:
        docx_path: Path of the document, passed to ``docx.Document``.

    Returns:
        The ``text`` of each paragraph (empty ones included), joined with
        a newline between paragraphs.
    """
    word_file = docx.Document(docx_path)
    paragraph_texts = []
    for paragraph in word_file.paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)

In [10]:
# ---------- Save to New PDF ----------
def save_to_pdf(text, output_path):
    """Write ``text`` into a new PDF, one input line per printed line.

    Lines are drawn top to bottom starting 72 points from the top-left corner
    and advancing 14 points per line. When the next line would fall inside the
    bottom margin a new page is started, so long text is no longer drawn past
    the bottom of a single page.

    Args:
        text: Text to write; it is split on newline characters.
        output_path: Destination path handed to ``doc.save``.

    Notes:
        Lines are not wrapped, so a very long line can still run past the
        right edge of the page.
        NOTE(review): text is drawn with ``insert_text``'s default font, which
        may not contain glyphs for non-Latin scripts such as Hindi or Chinese
        — confirm the rendered output for those targets.
    """
    margin = 72        # offset from the left, top and bottom edges, in points
    line_height = 14   # vertical advance per printed line, in points

    doc = fitz.open()
    try:
        page = doc.new_page()
        y = margin
        for line in text.split("\n"):
            # Continue on a fresh page once this line would land in the
            # bottom margin.
            if y > page.rect.height - margin:
                page = doc.new_page()
                y = margin
            page.insert_text((margin, y), line, fontsize=11)
            y += line_height
        doc.save(output_path)
    finally:
        # Release the document whether or not saving succeeded.
        doc.close()

In [11]:
# ---------- Save to New DOCX ----------
def save_to_docx(text, output_path):
    """Write ``text`` to a new .docx file, one paragraph per input line.

    Args:
        text: Text to write; it is split on newline characters and each piece
            (including empty ones) becomes its own paragraph.
        output_path: Destination path handed to the document's ``save``.
    """
    new_document = Document()
    paragraph_texts = text.split("\n")
    for paragraph_text in paragraph_texts:
        new_document.add_paragraph(paragraph_text)
    new_document.save(output_path)

In [12]:
def _translated_output_path(file_path, target_lang, default_ext):
    """Return ``file_path`` with ``_<target_lang>`` inserted before its extension."""
    # Imported locally so the helper does not rely on another cell's imports.
    import os

    root, ext = os.path.splitext(file_path)
    # Keep the file's own extension (whatever its case); fall back to the
    # declared type when the path has none, so output never equals the input.
    return f"{root}_{target_lang}{ext or default_ext}"


def translate_and_save(file_path, file_type, target_lang="hi"):
    """Translate an English PDF/DOCX line by line and save a translated copy.

    The translated file is written next to the input with ``_<target_lang>``
    inserted before the extension (``policy.pdf`` -> ``policy_fr.pdf``).

    Args:
        file_path: Path of the document to translate.
        file_type: ``"pdf"`` or ``"docx"``; selects the extractor and writer.
        target_lang: Target language code; one of ``hi``, ``fr``, ``es``,
            ``de``, ``zh``.

    Raises:
        ValueError: If ``target_lang`` or ``file_type`` is not supported. Both
            are checked before the translation model is loaded.
    """
    # Step 1: Validate the arguments before doing any expensive work
    supported_langs = ["hi", "fr", "es", "de", "zh"]  # Hindi, French, Spanish, German, Chinese

    if target_lang not in supported_langs:
        raise ValueError(
            f"❌ '{target_lang}' is not currently supported.\n"
            f"✅ Try one of: {', '.join(supported_langs)}"
        )

    if file_type not in ("pdf", "docx"):
        raise ValueError("❌ Unsupported file type! Use 'pdf' or 'docx'.")

    print(f"🔄 Translating to: {target_lang.upper()} using HuggingFace model...")

    # Step 2: Load the model/tokenizer
    tokenizer, model = load_translation_model("en", target_lang)

    # Step 3: Extract text from file
    if file_type == "pdf":
        text = extract_text_from_pdf(file_path)
    else:
        text = extract_text_from_docx(file_path)

    # Step 4: Translate line-by-line, skipping blank lines
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    translated = [translate_text(line, tokenizer, model) for line in lines]
    full_translated_text = "\n".join(translated)

    # Step 5: Save to translated output.
    # The path is derived from the real extension instead of str.replace, which
    # was case-sensitive and rewrote every ".pdf"/".docx" occurrence in the
    # path; a name like "X.PDF" used to map onto the input file itself.
    output_path = _translated_output_path(file_path, target_lang, f".{file_type}")

    if file_type == "pdf":
        save_to_pdf(full_translated_text, output_path)
    else:
        save_to_docx(full_translated_text, output_path)

    print(f"✅ Translated content saved to: {output_path}")

In [13]:
# Source document and its format
file_path = r"C:\Users\Hxtreme\Jupyter_Notebook_Learning\Final_Project\Dataset\Aspire_Policy_Wordings.pdf"
file_type = "pdf"

# Display name -> language code accepted by translate_and_save
language_options = dict(
    Hindi="hi",
    French="fr",
    Spanish="es",
    German="de",
    Chinese="zh",
)
selected_language = "French"  # edit this to pick another target language
target_lang = language_options[selected_language]

# Run the translation and write the translated copy next to the source file
translate_and_save(file_path, file_type, target_lang=target_lang)


🔄 Translating to: FR using HuggingFace model...
✅ Translated content saved to: C:\Users\Hxtreme\Jupyter_Notebook_Learning\Final_Project\Dataset\Aspire_Policy_Wordings_fr.pdf


In [15]:
#!pip install pdfplumber

Defaulting to user installation because normal site-packages is not writeable
Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
     ---------------------------------------- 0.0/42.8 kB ? eta -:--:--
     --------- ------------------------------ 10.2/42.8 kB ? eta -:--:--
     ------------------ ------------------- 20.5/42.8 kB 217.9 kB/s eta 0:00:01
     -------------------------------------- 42.8/42.8 kB 260.1 kB/s eta 0:00:00
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-win_amd64.whl.metadata (48 kB)
     ---------------------------------------- 0.0/48.2 kB ? eta -:--:--
     ---------------------------------------- 48.2/48.2 kB 1.2 MB/s eta 0:00:00
Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
   ---------------------------------------- 0.0/60.2 kB ? eta -:--:--
   --------



In [16]:
#!pip install googletrans==4.0.0-rc1

Defaulting to user installation because normal site-packages is not writeable
Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Down

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
conda-repo-cli 1.0.75 requires requests_mock, which is not installed.
jupyterlab 4.3.6 requires httpx>=0.25.0, but you have httpx 0.13.3 which is incompatible.
conda-repo-cli 1.0.75 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.75 requires requests==2.31.0, but you have requests 2.32.3 which is incompatible.
notebook 7.0.8 requires jupyterlab<4.1,>=4.0.2, but you have jupyterlab 4.3.6 which is incompatible.


In [19]:
!pip install pdfplumber python-docx deep-translator

Defaulting to user installation because normal site-packages is not writeable
Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
   ---------------------------------------- 0.0/42.3 kB ? eta -:--:--
   --------- ------------------------------ 10.2/42.3 kB ? eta -:--:--
   ---------------------------------------- 42.3/42.3 kB 681.8 kB/s eta 0:00:00
Installing collected packages: deep-translator
Successfully installed deep-translator-1.11.4




In [23]:
!pip install pdfplumber python-docx deep-translator

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import os
import pdfplumber
import docx
import time
from deep_translator import GoogleTranslator

# --------- Extract Text ---------
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF that yields text, as one string.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        The extracted text of each page, joined with a newline between pages.
        Pages whose ``extract_text()`` result is empty or ``None`` are skipped.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract each page once; the previous form called extract_text()
        # twice per page (once to filter, once to join).
        page_texts = (page.extract_text() for page in pdf.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)

def extract_text_from_docx(docx_path):
    """Return the non-blank paragraphs of a .docx file as one string.

    Args:
        docx_path: Path of the document, passed to ``docx.Document``.

    Returns:
        The ``text`` of every paragraph that contains non-whitespace
        characters, joined with a newline between paragraphs.
    """
    word_file = docx.Document(docx_path)
    kept_paragraphs = []
    for paragraph in word_file.paragraphs:
        content = paragraph.text
        if content.strip():
            kept_paragraphs.append(content)
    return "\n".join(kept_paragraphs)

# --------- Safe Translation ---------
def translate_line_safe(line, src='en', dest='hi', retries=3, delay=1):
    """Translate one line, retrying on failure and falling back to the input.

    Args:
        line: Text to translate.
        src: Source language code passed to ``GoogleTranslator``.
        dest: Target language code passed to ``GoogleTranslator``.
        retries: Maximum number of attempts.
        delay: Seconds to wait between two attempts.

    Returns:
        The translated text, or ``line`` unchanged when every attempt fails
        or the translator returns ``None``.
    """
    for attempt in range(retries):
        try:
            translator = GoogleTranslator(source=src, target=dest, timeout=5)
            result = translator.translate(line)
        except Exception as e:
            # Deliberately broad: this is best-effort, and any failure falls
            # back to the original line once the attempts are used up.
            print(f"⚠️ Retry {attempt+1} failed for line: {line[:30]}... ➜ {e}")
            # Wait only when another attempt follows; sleeping after the last
            # failure just delays the fallback.
            if attempt < retries - 1:
                time.sleep(delay)
        else:
            # Never hand None to callers that join the results into a string.
            return line if result is None else result
    return line  # fallback to original

# --------- Translate Full File ---------
def translate_file(input_path, output_path, src_lang='en', dest_lang='hi'):
    """Translate a PDF or DOCX file line by line into a UTF-8 text file.

    Args:
        input_path: Document to translate; its extension (case-insensitive)
            must be ``.pdf`` or ``.docx``.
        output_path: Path of the text file to write.
        src_lang: Source language code forwarded to ``translate_line_safe``.
        dest_lang: Target language code forwarded to ``translate_line_safe``.

    Raises:
        ValueError: If the input extension is neither ``.pdf`` nor ``.docx``.
    """
    _, extension = os.path.splitext(input_path)
    extension = extension.lower()

    # Reject unknown formats up front, then pick the matching extractor.
    if extension not in (".pdf", ".docx"):
        raise ValueError("❌ Unsupported file format. Use PDF or DOCX.")
    if extension == ".pdf":
        raw_text = extract_text_from_pdf(input_path)
    else:
        raw_text = extract_text_from_docx(input_path)

    print(f"🔄 Translating from {src_lang} ➜ {dest_lang}")

    # Translate each non-blank line, stripped of surrounding whitespace.
    translated_lines = []
    for raw_line in raw_text.split("\n"):
        stripped = raw_line.strip()
        if not stripped:
            continue
        translated_lines.append(translate_line_safe(stripped, src_lang, dest_lang))

    # Translated lines are separated by a blank line in the output.
    translated_text = "\n\n".join(translated_lines)

    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(translated_text)

    print(f"\n✅ Translated file saved to: {output_path}")

# --------- Example Usage ---------
# Translate the sample policy PDF from English to Hindi. translate_file picks
# the extractor from the input extension and writes the translated lines as
# UTF-8 plain text to output_path.
translate_file(
    input_path=r"C:\Users\Hxtreme\Jupyter_Notebook_Learning\Final_Project\Dataset\Aspire_Policy_Wordings.pdf",
    output_path=r"C:\Users\Hxtreme\Jupyter_Notebook_Learning\Final_Project\Dataset\Aspire_Policy_Wordings_hi_v2.txt",
    src_lang="en",
    dest_lang="hi"  # target language code; other codes such as "ta", "fr", "de" can be used
)


🔄 Translating from en ➜ hi

✅ Translated file saved to: C:\Users\Hxtreme\Jupyter_Notebook_Learning\Final_Project\Dataset\Aspire_Policy_Wordings_hi_v2.txt
