### Loading Bhasha-Abhijnaanam Datasets

In [1]:
import os, zipfile, requests

urls = {
    "native": "https://github.com/AI4Bharat/IndicLID/releases/download/v1.0/native_script_train_valid_data.zip",
    "roman": "https://github.com/AI4Bharat/IndicLID/releases/download/v1.0/roman_script_train_valid_data.zip",
    "native-roman": "https://github.com/AI4Bharat/IndicLID/releases/download/v1.0/parallel_romanized_train_data.zip",
    "abhijnaanam": "https://github.com/AI4Bharat/IndicLID/releases/download/v1.0/bhasha-abhijnaanam_test_set.zip"
}

os.makedirs("ba_training", exist_ok=True)

for name, url in urls.items():
    zip_path = f"ba_training/{name}.zip"
    if not os.path.exists(zip_path):
        print(f"Downloading {name} dataset...")
        r = requests.get(url)
        with open(zip_path, "wb") as f:
            f.write(r.content)
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(f"ba_training/")
        os.remove(zip_path)
        print("Zip file removed.")
os.listdir("ba_training/")

Downloading native dataset...
Zip file removed.
Downloading roman dataset...
Zip file removed.
Downloading native-roman dataset...
Zip file removed.
Downloading abhijnaanam dataset...
Zip file removed.


['parallel_romanized_train_data.json',
 'Native_script_data',
 'Roman_script_data',
 'bhasha-abhijnaanam.json']

In [2]:
import pandas as pd

def load_and_sample(path, frac=1, seed=42):
    """Load the text file, split label and text, and sample a portion of data."""
    data = []
    with open(path, 'r') as f:
        for line in f:
            parts = line.strip().split(maxsplit=1)  
            if len(parts) == 2:
                label, text = parts
                label = label.replace("__label__","")
                data.append([label, text])  

    df = pd.DataFrame(data, columns=["label", "text"])
    return df

roman_train = load_and_sample("ba_training/Roman_script_data/train_combine.txt")
roman_valid = load_and_sample("ba_training/Roman_script_data/valid_combine.txt")
roman = pd.concat([roman_train, roman_valid], axis=0, ignore_index=True)
native_train = load_and_sample("ba_training/Native_script_data/train_combine.txt")
native_valid = load_and_sample("ba_training/Native_script_data/valid_combine.txt")
native = pd.concat([native_train, native_valid], axis=0, ignore_index=True)

In [3]:
import json

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        obj = json.load(f)   # one JSON object
    data = obj["data"]
    df = pd.DataFrame([{
        "id": d["unique_identifier"],
        "native": d["native sentence"],
        "roman": d["romanized sentence"],
        "label": d["language"],
        "script": d["script"],
        "source": d["source"]
    } for d in data])

    return df

# usage
benchmark = load_json("ba_training/bhasha-abhijnaanam.json")
roman_native = load_json("ba_training/parallel_romanized_train_data.json")

In [4]:
benchmark_native = benchmark[['native', 'label']].rename(columns={'native': 'text'})
benchmark_roman = benchmark[['roman', 'label']].rename(columns={'roman': 'text'})
benchmark_roman = benchmark_roman[benchmark_roman['text'] != '']
benchmark = pd.concat([benchmark_native, benchmark_roman], axis=0, ignore_index=True)
benchmark.to_csv('bhasha-abhijnaanam.csv', index=False)

In [5]:
benchmark_native.to_csv('native.csv', index=False)
benchmark_roman.to_csv('roman.csv', index=False)

In [6]:
native = native[native['label'].isin(['Dogri', 'Santali', 'Manipuri_Beng', 'Manipuri_Mei'])]
native = native.rename(columns = {'text': 'native'})
native.label.value_counts()     # adding these classes to solve class imbalance and add missing classes

label
Manipuri_Beng    100997
Manipuri_Mei     100500
Santali          100345
Dogri            100120
Name: count, dtype: int64

### Using custom transliteration(Aksharmukha + Character mapping) for creating roman scripts

In [7]:
from aksharamukha.transliterate import process
import re
import unidecode

In [8]:
santhali_map = {
    '᱐': '0', '᱑': '1', '᱒': '2', '᱓': '3', '᱔': '4', '᱕': '5', '᱖': '6', '᱗': '7', '᱘': '8', '᱙': '9',
    'ᱚ': 'a', 'ᱛ': 'b', 'ᱜ': 'c', 'ᱝ': 'd', 'ᱞ': 'e', 'ᱟ': 'f', 'ᱠ': 'g', 'ᱡ': 'h',
    'ᱢ': 'i', 'ᱣ': 'j', 'ᱤ': 'k', 'ᱥ': 'l', 'ᱦ': 'm', 'ᱧ': 'n', 'ᱨ': 'o', 'ᱩ': 'p',
    'ᱪ': 'q', 'ᱫ': 'r', 'ᱬ': 's', 'ᱭ': 't', 'ᱮ': 'u', 'ᱯ': 'v', 'ᱰ': 'w', 'ᱱ': 'x',
    'ᱲ': 'y', 'ᱳ': 'z', 'ᱴ': 'ṭ', 'ᱵ': 'ḍ', 'ᱶ': 'ṅ', 'ᱷ': 'ṭh', 'ᱸ': 'ḍh', 'ᱹ': 'ñ',
    'ᱺ': 'ŋ', 'ᱻ': 'ś', 'ᱼ': 'ṣ', 'ᱽ': 'ḷ'
}

language_to_script = {
    'Dogri': 'Devanagari',
    'Manipuri_Beng': 'Bengali',
    'Manipuri_Mei': 'MeeteiMayek'
}
custom_mappings = {
    'Santali': santhali_map,
}

In [9]:
def transliterate_row(row):
    lang = row['label']
    sentence = str(row['native'])
    
    # custom mapping first
    if lang in custom_mappings:
        mapping = custom_mappings[lang]
        return ''.join([mapping.get(c, c) for c in sentence])
    
    # Aksharamukha for others
    source_script = language_to_script.get(lang)
    if not source_script:
        return '[Unknown language]'
    
    try:
        # Step 1: IAST transliteration
        iast = process(source_script, "IAST", sentence)
        
        # Step 2: Convert diacritics → plain ASCII
        ascii_out = unidecode.unidecode(iast)
        return ascii_out
    
    except Exception as e:
        return f"[Error: {e}]"

In [10]:
native['roman'] = native.apply(transliterate_row, axis=1)
native

Unnamed: 0,label,native,roman
3,Manipuri_Mei,ꯍꯤꯕꯤ ꯅꯠꯇ꯭ꯔꯒ ꯍꯦꯕꯦ ꯑꯁꯤ ꯒ꯭ꯔꯤꯛꯀꯤ ꯃꯤ ꯂꯥꯢ ꯇꯤꯟꯒꯤ ꯋꯥꯔꯤ...,hibi nattraga hebe asi grikki mi lay tingi var...
9,Manipuri_Beng,মতম অদুদা লৈরম্বা য়ুম্নাক ৩৬১ অদুদা চপ মান্নন...,matama aduda lairamba yumnaka 361 aduda capa m...
14,Manipuri_Beng,খূদম ওইনা নুপাগী সেক্স ওর্গানদগী অঙাং ওইহনবদা ...,khudama oina nupagi seksa organadagi anam oiha...
30,Manipuri_Mei,ꯀꯣꯚꯦꯂꯦꯟꯠ ꯂꯤꯄꯨꯟ ꯅꯠꯇ꯭ꯔꯒ ꯃꯣꯂꯦꯀꯨꯂꯔ ꯂꯤꯄꯨꯟ ( ꯏꯪꯂꯤꯁ :...,kobhelent lipun nattraga molekulara lipun ( im...
37,Dogri,जे दक्खना च,je dakkhana ca
...,...,...,...
2710676,Manipuri_Beng,1965কী মার্চ 18দা মহাক্না স্পেসক্রাফকী মপান্দা...,1965ki marca 18da mahakna spesakraphaki mapand...
2710714,Manipuri_Mei,ꯏꯪ ꯱꯹꯹꯹ ꯗ ꯕꯣꯕꯤꯅ ꯃꯍꯥꯛꯀꯤ ꯃꯌꯥꯝꯕ ꯁꯅꯤ ꯗꯤꯑꯣꯜꯅ ꯂꯝꯖꯤꯡ...,im 1999 da bobina mahakki mayamba sani diolna...
2710805,Manipuri_Mei,ꯚꯥꯔꯇꯅꯥ ꯏꯪ ꯲꯰꯰꯱ꯗꯥ ꯈꯥ ꯑꯐ꯭ꯔꯤꯀꯥꯗꯥ ꯆꯠꯈꯤꯕꯥ ꯈꯣꯡꯆꯠ ꯑꯗꯨ...,bharatana im 2001da kha aphrikada catkhiba kho...
2710849,Manipuri_Mei,ꯃꯥꯂꯦꯝꯒꯤ ꯊꯥꯛꯇ ꯃꯃꯤꯡ ꯆꯠꯂꯕ ꯃꯤꯇꯝ ꯁꯥꯕ ꯃꯤꯑꯣꯏ ꯁꯨꯁꯤꯜ ꯁꯈ...,malemgi thakta mamin catlaba mitam saba mioi s...


In [11]:
native.label.value_counts()

label
Manipuri_Beng    100997
Manipuri_Mei     100500
Santali          100345
Dogri            100120
Name: count, dtype: int64

In [12]:
manipuri_beng = native[native['label'] == 'Manipuri_Beng']
manipuri_mei = native[native['label'] == 'Manipuri_Mei']

n_each = 100_500 // 2  # 50,250 from each script

manipuri_beng_sampled = manipuri_beng.sample(n=n_each, random_state=42)
manipuri_mei_sampled = manipuri_mei.sample(n=n_each, random_state=42)
manipuri_balanced = pd.concat([manipuri_beng_sampled, manipuri_mei_sampled], ignore_index=True)
manipuri_balanced['label'] = 'Manipuri'
native_rest = native[~native['label'].isin(['Manipuri_Beng', 'Manipuri_Mei'])]
native_final = pd.concat([native_rest, manipuri_balanced], ignore_index=True)
native_final.label.value_counts()

label
Manipuri    100500
Santali     100345
Dogri       100120
Name: count, dtype: int64

### Merging datasets to solve class imbalance

In [13]:
merged = pd.concat([native_final, roman_native], ignore_index=True)
merged.label.value_counts()

label
Telugu       299033
Gujarati     298984
Bangla       298926
Malayalam    298812
Marathi      298795
Hindi        298612
Oriya        296294
Tamil        292926
Kannada      289949
Assamese     279066
Punjabi      235848
Nepali       234596
Sanskrit     201462
Maithili     156921
Sindhi       150751
Manipuri     131028
Bodo         114102
Konkani      110001
Urdu         105704
Kashmiri     105654
Santali      100345
Dogri        100120
Name: count, dtype: int64

### Creating triplets

In [19]:
sampled_dfs = []

for lang, group in merged.groupby("label"): 
    sampled = group.sample(n=100120, random_state=42)
    sampled_dfs.append(sampled)

balanced_pair = pd.concat(sampled_dfs, ignore_index=True)
print("Samples per language (after undersampling):")
print(balanced_pair["label"].value_counts())

# Calculate size in bytes → MB / GB
df_size_bytes = balanced_pair.memory_usage(deep=True).sum()
df_size_mb = df_size_bytes / (1024 ** 2)
df_size_gb = df_size_bytes / (1024 ** 3)

print(f"\nTotal samples: {len(balanced_pair):,}")
print(f"Languages included: {balanced_pair['label'].nunique()}")
print(f"DataFrame size: {df_size_mb:.2f} MB ({df_size_gb:.3f} GB)")

Samples per language (after undersampling):
label
Assamese     100120
Bangla       100120
Bodo         100120
Dogri        100120
Gujarati     100120
Hindi        100120
Kannada      100120
Kashmiri     100120
Konkani      100120
Maithili     100120
Malayalam    100120
Manipuri     100120
Marathi      100120
Nepali       100120
Oriya        100120
Punjabi      100120
Sanskrit     100120
Santali      100120
Sindhi       100120
Tamil        100120
Telugu       100120
Urdu         100120
Name: count, dtype: int64

Total samples: 2,202,640
Languages included: 22
DataFrame size: 1386.63 MB (1.354 GB)


In [20]:
balanced_pair.drop(['id', 'script', 'source'], axis=1, inplace=True)
balanced_pair

Unnamed: 0,label,native,roman
0,Assamese,আনফালে আৰক্ষীৰ অভিযোগ একাংশ লোকে পৰিকল্পিত ভাৱ...,unfale aarokhyir obhijug ekangxo luke porikolp...
1,Assamese,"যেনেদৰে শাৰিৰীক দুখ , কষ্ট সহ্য কৰিবলগীয়া হয়...","jenedore xaririk dukh , kosto xohyo koribologi..."
2,Assamese,প্ৰবায়োটিকছ কেপছুলো বজাৰত পোৱা যায় ।,probayutiks capsulu bojarot puwa yaay .
3,Assamese,"জানিব পৰা মতে , মটৰচাইকেল আৰোহীজন বকো ২নং ছেখা...","janibo poraa mote , motorsaikel aaruhijon boku..."
4,Assamese,অন্যথা অনাগত দিনত ইয়াতকৈ জংগী আন্দোলনৰ কাৰ্যস...,onyotha onagoto dinot iyatkoi jongi andulonor ...
...,...,...,...
2202635,Urdu,آپ سوچ رہیں ہوں گیں کہ عمران خان ابھی تک کیا ک...,aap soch rahein hoan gain kahh amraan khaan ub...
2202636,Urdu,انگلینڈ جرنل آف میڈیسن میں شائع ہونے ہونے وال...,england journal aff medison mein shaye honay h...
2202637,Urdu,خزاں میں تم کو خرید لیں گے,khazaan mein tamm kuu khareed lain gay
2202638,Urdu,سردار گل محمد خان جوگیزئی ( پیدائش : ، ضلع لور...,sardaar gull mohammad khaan jogeezi ( paidaish...


In [21]:
import numpy as np

df = balanced_pair.copy()

all_labels = df['label'].unique().tolist()
n_classes = len(all_labels)

other_class_sentences = {
    label: df[df['label'] != label]['roman'].reset_index(drop=True)
    for label in all_labels
}

sentences_by_class = {
    label: df[df['label'] == label]['roman'].reset_index(drop=True)
    for label in all_labels
}


def sample_negative(curr_label, n_samples):
    """
    Return n_samples negatives for curr_label using stratified sampling
    across the remaining 21 classes.
    """
    negative_classes = [c for c in all_labels if c != curr_label]

    per_class = n_samples // len(negative_classes)
    remainder = n_samples % len(negative_classes)
    neg_list = []

    for cls in negative_classes:
        neg = sentences_by_class[cls].sample(
            per_class, replace=True, random_state=None
        )
        neg_list.append(neg)

    if remainder > 0:
        extra_classes = np.random.choice(negative_classes, remainder, replace=True)
        for cls in extra_classes:
            neg_extra = sentences_by_class[cls].sample(1, replace=True, random_state=None)
            neg_list.append(neg_extra)

    return pd.concat(neg_list, ignore_index=True)


triplets = []

for label in all_labels:
    group = df[df['label'] == label]
    n = len(group)
    negatives = sample_negative(label, n)

    out = pd.DataFrame({
        'anchor': group['native'].values,
        'positive': group['roman'].values,
        'negative': negatives.values,
        'label': group['label'].values  # (optional)
    })

    triplets.append(out)

triplet_df = pd.concat(triplets, ignore_index=True)
triplet_df

Unnamed: 0,anchor,positive,negative,label
0,আনফালে আৰক্ষীৰ অভিযোগ একাংশ লোকে পৰিকল্পিত ভাৱ...,unfale aarokhyir obhijug ekangxo luke porikolp...,ekoi soor worled phedareshana obo public healt...,Assamese
1,"যেনেদৰে শাৰিৰীক দুখ , কষ্ট সহ্য কৰিবলগীয়া হয়...","jenedore xaririk dukh , kosto xohyo koribologi...",money money bollol aritra .,Assamese
2,প্ৰবায়োটিকছ কেপছুলো বজাৰত পোৱা যায় ।,probayutiks capsulu bojarot puwa yaay .,bachchader jaanaa uchito jey sadharan melamesh...,Assamese
3,"জানিব পৰা মতে , মটৰচাইকেল আৰোহীজন বকো ২নং ছেখা...","janibo poraa mote , motorsaikel aaruhijon boku...",unnoyoner aggrogoti ekhono jaander kachhe pouc...,Assamese
4,অন্যথা অনাগত দিনত ইয়াতকৈ জংগী আন্দোলনৰ কাৰ্যস...,onyotha onagoto dinot iyatkoi jongi andulonor ...,"taanr maranottar "" parris inn dya tuentieth se...",Assamese
...,...,...,...,...
2202635,آپ سوچ رہیں ہوں گیں کہ عمران خان ابھی تک کیا ک...,aap soch rahein hoan gain kahh amraan khaan ub...,kammartod monghvarina yugmaa garib tathaa jaru...,Urdu
2202636,انگلینڈ جرنل آف میڈیسن میں شائع ہونے ہونے وال...,england journal aff medison mein shaye honay h...,ima rasigi punsida khbaidagi mayapakapa puknim...,Urdu
2202637,خزاں میں تم کو خرید لیں گے,khazaan mein tamm kuu khareed lain gay,yadyad bhavyam bhavatu bhagavan purvakarmaanur...,Urdu
2202638,سردار گل محمد خان جوگیزئی ( پیدائش : ، ضلع لور...,sardaar gull mohammad khaan jogeezi ( paidaish...,stadium aakar mein andakar haye aur isamen ter...,Urdu


In [17]:
triplet_df.to_csv('phase1.csv', index=False)

In [23]:
triplet_df[triplet_df['label'] == 'Bangla']

Unnamed: 0,anchor,positive,negative,label
100120,নাজিল হওয়ার পরিপ্রেক্ষিত .,nazil howar poriprekkhito .,nogaoh xaxokiyo bijepi dolor bhatri saatro xon...,Bangla
100121,নাম না করে কটাক্ষ করেছেন বাম - কংগ্রেসকেও ।,naam naa korey kotakkho korechen baam - congre...,ejon byoktir xoriror ujon borhaboloi xukhadyo ...,Bangla
100122,সে ক্ষেত্রে গ্যাসের বিকল্প হিসেবে যদি সরকার আম...,seey khettre gasser bikalpa hisebe jodi sarkar...,khamothu hoba laage .,Bangla
100123,আর জলবায়ু হলো কোনো নির্দিষ্ট স্থানের দীর্ঘ সম...,arr jalbayu hollo cono nirdishto sthaner deerg...,pisot teu jodiu ekhon bixal xamrajyor odhikari...,Bangla
100124,মোটামুটি হিসাব অনুযায়ী শহরাঞ্চলে প্রায় ৩ কোট...,motamuti hisab anujayi shaharanchale pray 3 ko...,xongkotjonok owosthat thoka aghatprapto ejonok...,Bangla
...,...,...,...,...
200235,ঢাকা : আটটি সাধারণ শিক্ষা বোর্ড এবং মাদ্রাসা ও...,ddhaka : atti sadharan shikkha board ebong mad...,laal kille dey nerhe yamuna nadee dey tatt ' t...,Bangla
200236,ইকোস্যানকে রিসোর্স - ওরিয়েন্টেড স্যনিটেশনও ( ...,ecosanke resource - oriented sanitationo ( res...,kudimakkal thodarpaana nadaimuraigalil myyamaa...,Bangla
200237,"তার অংশগ্রহণ ছিলো মুক্তিযুদ্ধ , বাষট্টির ছাত্র...","taar ongshogrohon chhilo muktijuddho , bashott...",apane nirana laine de prakriya ca apanai gei v...,Bangla
200238,এটি দ্বিতীয় শতাব্দীর জ্যোতির্বিদ টলেমি দ্বারা...,etty dwitiya shatabdir jyotirbid tolemi dwara ...,"1 - bavajud keh, herr chand, goke.",Bangla


### Creating sentence label paired dataset

In [25]:
sampled_dfs = []

for lang, group in merged.groupby("label"): 
    sampled = group.sample(n=21454, random_state=24)
    sampled_dfs.append(sampled)

balanced = pd.concat(sampled_dfs, ignore_index=True)
print("Samples per language (after undersampling):")
print(balanced["label"].value_counts())

Samples per language (after undersampling):
label
Assamese     21454
Bangla       21454
Bodo         21454
Dogri        21454
Gujarati     21454
Hindi        21454
Kannada      21454
Kashmiri     21454
Konkani      21454
Maithili     21454
Malayalam    21454
Manipuri     21454
Marathi      21454
Nepali       21454
Oriya        21454
Punjabi      21454
Sanskrit     21454
Santali      21454
Sindhi       21454
Tamil        21454
Telugu       21454
Urdu         21454
Name: count, dtype: int64


In [27]:
balanced.drop(['id', 'script', 'source'], axis=1, inplace=True)
phase2_native = balanced[['native','label']].rename(columns={'native':'text'})
phase2_roman = balanced[['roman','label']].rename(columns={'roman':'text'})
phase2 = pd.concat([phase2_native, phase2_roman], axis=0, ignore_index=True)
phase2

Unnamed: 0,text,label
0,গুৱাহাটী উচ্চ ন্যায়ালয় অসমৰ সৰ্বোচ্চ ন্যায়া...,Assamese
1,যিবোৰ কাৰখানাৰ মেচিন আৰু তালৈ অহা যোৱা শ শ ডাম...,Assamese
2,ই অতি বিশাল ৰূপ ধাৰণ কৰে ।,Assamese
3,১৯৬৩ চনৰ নাজিৰা অধিৱেশনৰ অসম সাহিত্য সভাৰ সভাপ...,Assamese
4,"সৰু লৰা - ছোৱালী , ১২ বছৰৰ তলৰ হ ' লে , তুলসীৰ...",Assamese
...,...,...
943971,( maktubaat amaam rabanio; jeej 2 sas 225 ),Urdu
943972,karaachi mein korangi aur old stee airiya mein...,Urdu
943973,elawah azein iss say qabal bhee rawaan saal ag...,Urdu
943974,kaazimi saahib kay kees say inn kuu pehchaan m...,Urdu


In [28]:
# Calculate size in bytes → MB / GB
df_size_bytes = phase2.memory_usage(deep=True).sum()
df_size_mb = df_size_bytes / (1024 ** 2)
df_size_gb = df_size_bytes / (1024 ** 3)

print(f"\nTotal samples: {len(phase2):,}")
print(f"Languages included: {phase2['label'].nunique()}")
print(f"DataFrame size: {df_size_mb:.2f} MB ({df_size_gb:.3f} GB)")


Total samples: 943,976
Languages included: 22
DataFrame size: 248.47 MB (0.243 GB)


In [29]:
phase2.to_csv('phase2.csv', index=False)