In [None]:
!pip install indic-transliteration deep-translator pandas

Collecting indic-transliteration
  Downloading indic_transliteration-2.3.78-py3-none-any.whl.metadata (1.6 kB)
Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting backports.functools_lru_cache (from indic-transliteration)
  Downloading backports.functools_lru_cache-2.0.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting roman (from indic-transliteration)
  Downloading roman-5.2-py3-none-any.whl.metadata (4.3 kB)
Downloading indic_transliteration-2.3.78-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.5/160.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading backports.functools_lru_cache-2.0.0-py2.py3-none-any.whl (6.7 kB)
Downloading roman-5.2-py3-none-any.whl (6.0 kB)
Installing collected pac

In [None]:
import pandas as pd
import json
import time
from deep_translator import GoogleTranslator
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

In [None]:
# 1. Load your existing 10k JSON
try:
    df = pd.read_json("nepali_alpaca_10k.json")
except Exception as e:
    print(f"Error loading file: {e}")
    # Re-raise: every later cell needs `df`, so carrying on would only
    # resurface as an unrelated NameError further down the notebook.
    raise
else:
    # Shuffle randomly (fixed seed, so the same rows land in each half on every run)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    print("Dataset loaded and shuffled.")

Dataset loaded and shuffled.


In [None]:
# 2. Split into two halves
# The split point is derived from the data instead of a fixed 5000, so the
# two groups stay balanced even if the input is not exactly 10,000 rows.
# (With an odd row count the second half receives the extra row.)
half = len(df) // 2
df_roman_only = df.iloc[:half].copy()
df_mixed_en = df.iloc[half:].copy()

In [None]:
# Helper function for Romanization (using ITRANS scheme)
def to_roman(text):
    """Transliterate Devanagari text to lower-cased, ITRANS-based Roman text.

    Args:
        text: The value to romanize. Falsy values (``None``, ``""``, ``0``)
            and whitespace-only strings are treated as empty.

    Returns:
        ``""`` for empty input; otherwise the ITRANS transliteration,
        lower-cased, with every ``"."`` removed and surrounding whitespace
        stripped. If transliteration raises, ``text`` is returned unchanged
        (best-effort: one bad cell must not abort a whole column).
    """
    if not text or str(text).strip() == "":
        return ""
    try:
        # Transliterate Devanagari to ITRANS
        roman_text = transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS)
        # Clean up: ITRANS uses caps for long vowels; .lower() makes it look like 'Chat' Nepali
        return roman_text.lower().replace(".", "").strip()
    except Exception:
        # `Exception` rather than a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate while a long `.apply()` is running.
        return text

In [None]:
# --- Process Group 1: 5k Full Roman Nepali ---
print("Group 1: Converting all fields to Roman...")
# All three text fields of this half are romanized, one column at a time.
for field in ('instruction', 'input', 'output'):
    romanized_values = df_roman_only[field].apply(to_roman)
    df_roman_only[field] = romanized_values

Group 1: Converting all fields to Roman...


In [None]:
# --- Process Group 2: 5k English Instruction + Roman Output ---
print("Group 2: Translating instructions to English (API process)...")
# Language pair handed to the translator: 'ne' as source, 'en' as target.
language_pair = {'source': 'ne', 'target': 'en'}
translator = GoogleTranslator(**language_pair)

Group 2: Translating instructions to English (API process)...


In [None]:
# Romanize Input and Output first
# ('instruction' is deliberately left out here; it is handled separately.)
for field in ('input', 'output'):
    romanized_values = df_mixed_en[field].apply(to_roman)
    df_mixed_en[field] = romanized_values

In [None]:
# Translate Instructions to English
# One API request per row. A row whose request fails keeps its original
# (untranslated) instruction so the column length always matches the frame.
total_rows = len(df_mixed_en)
instructions_en = []
failed_rows = []  # (position, error repr) for every row that could not be translated

for i, text in enumerate(df_mixed_en['instruction']):
    try:
        translated = translator.translate(text)
    except Exception as exc:
        # `Exception`, not a bare `except:`, so interrupting the kernel
        # actually stops this long-running loop instead of being swallowed.
        failed_rows.append((i, repr(exc)))
        translated = text
    instructions_en.append(translated)

    # Progress is reported on every 100th row, whether or not that row succeeded.
    if (i + 1) % 100 == 0:
        print(f"Translated {i + 1} / {total_rows}...")

    # Safety delay for Google API; also applied after a failure so a
    # rate-limited service is not hit again immediately.
    time.sleep(0.15)

df_mixed_en['instruction'] = instructions_en

if failed_rows:
    # Surface the fallbacks: these rows still hold the untranslated instruction.
    print(
        f"{len(failed_rows)} of {total_rows} instructions could not be translated "
        f"and were left unchanged. First error (row {failed_rows[0][0]}): {failed_rows[0][1]}"
    )

Translated 100 / 5000...
Translated 200 / 5000...
Translated 300 / 5000...
Translated 400 / 5000...
Translated 500 / 5000...
Translated 600 / 5000...
Translated 700 / 5000...
Translated 800 / 5000...
Translated 900 / 5000...
Translated 1000 / 5000...
Translated 1100 / 5000...
Translated 1200 / 5000...
Translated 1300 / 5000...
Translated 1400 / 5000...
Translated 1500 / 5000...
Translated 1600 / 5000...
Translated 1700 / 5000...
Translated 1800 / 5000...
Translated 1900 / 5000...
Translated 2000 / 5000...
Translated 2100 / 5000...
Translated 2200 / 5000...
Translated 2300 / 5000...
Translated 2400 / 5000...
Translated 2500 / 5000...
Translated 2600 / 5000...
Translated 2700 / 5000...
Translated 2800 / 5000...
Translated 2900 / 5000...
Translated 3000 / 5000...
Translated 3100 / 5000...
Translated 3200 / 5000...
Translated 3300 / 5000...
Translated 3400 / 5000...
Translated 3500 / 5000...
Translated 3600 / 5000...
Translated 3700 / 5000...
Translated 3800 / 5000...
Translated 3900 / 500

In [None]:
# 3. Combine and Shuffle
# Seeded with the same value as the initial shuffle so the final row order
# (and therefore the saved file) is reproducible across runs.
df_final = (
    pd.concat([df_roman_only, df_mixed_en])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# 4. Save Final Dataset
output_filename = "final_cross_lingual_indic_10k.json"
df_final.to_json(
    output_filename,
    orient="records",    # one JSON object per row
    force_ascii=False,   # write non-ASCII characters as-is rather than escaped
    indent=4,
)

In [None]:
print(f"\nDone! Saved to: {output_filename}")

# Final check: show the first row of the combined frame as a plain dict
print("\n--- Example Record ---")
example_record = df_final.iloc[0].to_dict()
print(example_record)


Done! Saved to: final_cross_lingual_indic_10k.json

--- Example Record ---
{'instruction': "Describe Amazon's company culture", 'input': '', 'output': 'amazon ko kampani samskrriti grahaka jununa, navinata, uchcha pradarshana, ra svamitva ma kendrita cha| tiniharu drridhatapurvaka grahaka-pahilo manovrrittima vishvasa garchan ra uniharuko kamama agragami ra navina banna prayasa garchan, sathai svamitvako mulya~nkana gardai ra pahala liiraheka chan| amejanale baliyo toliharu nirmana garna ra sahakarya ra nirantara sikailai protsahana garne vatavarana sirjana garne prayasa garcha|'}


In [None]:
# Final check: show the first row of the combined frame as a plain dict
print("\n--- Example Record ---")
example_record = df_final.iloc[0].to_dict()
print(example_record)


--- Example Record ---
{'instruction': "Describe Amazon's company culture", 'input': '', 'output': 'amazon ko kampani samskrriti grahaka jununa, navinata, uchcha pradarshana, ra svamitva ma kendrita cha| tiniharu drridhatapurvaka grahaka-pahilo manovrrittima vishvasa garchan ra uniharuko kamama agragami ra navina banna prayasa garchan, sathai svamitvako mulya~nkana gardai ra pahala liiraheka chan| amejanale baliyo toliharu nirmana garna ra sahakarya ra nirantara sikailai protsahana garne vatavarana sirjana garne prayasa garcha|'}
