In [2]:
import json

# Step 1: Parse the dataset JSON (expects a top-level "examples" list)
with open("Dataset/flores_en_mni_dev.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Step 2: Split every example into its "source" and "target" sentence
examples = data["examples"]
sources = [pair["source"] for pair in examples]
targets = [pair["target"] for pair in examples]

# Step 3: Write each side to its own file, one sentence per line.
# "\n".join puts newlines only between sentences, so there is no trailing newline.
for out_path, sentences in (
    ("source_sentences.txt", sources),
    ("target_sentences.txt", targets),
):
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.write("\n".join(sentences))


In [3]:
len(sources)

997

In [4]:
sources[0]

'Although most agencies are willing to take on most regular bookings, many agents specialise in particular types of travel, budget ranges or destinations.'

In [5]:
len(targets)

997

In [6]:
targets[0]

'অদুমওইনমক, অয়াম্বা এজেন্সীশিংনা মহৌশাগী বুকিংগুম্না অদুমক লৌনবা হোৎনৈ, এজেন্ত কয়ানা ত্রাভেল, বজেৎকী রেঞ্জ নৎত্রগা লমথুংফমগী অখন্নবা মওংদা হেন্না খুৎলোইবসু য়াওই।'

# Transliteration from Bengali target script to Meitei Mayek script

In [7]:
def load_mapping(file1, file2):
    """Build a lookup table by pairing two UTF-8 text files line by line.

    Line k of ``file1`` (whitespace-stripped) becomes a key whose value is
    line k of ``file2`` (whitespace-stripped).

    Args:
        file1: Path of the file supplying the keys, one per line.
        file2: Path of the file supplying the values, one per line.

    Returns:
        dict mapping each key line to the value line at the same position.
        If the files differ in length, the surplus lines of the longer file
        are ignored; if a key occurs more than once, the last pairing wins.
    """
    with open(file1, 'r', encoding='utf-8') as key_file:
        with open(file2, 'r', encoding='utf-8') as value_file:
            # zip stops at the shorter file, exactly like zipping two line lists.
            return {
                key_line.strip(): value_line.strip()
                for key_line, value_line in zip(key_file, value_file)
            }

def transliterate_syllable_aware(line, mapi=None, lonsum=None, vowels=None, virama=None):
    """Transliterate one line of text character by character using lookup tables.

    The line is scanned left to right and each position is handled by the
    first rule that matches:

    1. Conjunct: ``X + virama + Y`` where ``X`` is in ``lonsum`` and ``Y`` is
       in ``mapi`` -> ``lonsum[X] + mapi[Y]`` (all three characters consumed).
    2. Vowel: a character in ``vowels``. If the following character is also in
       ``vowels``, ``lonsum[ch]`` is used when it exists, otherwise
       ``vowels[ch]``.
    3. Full consonant: a character in ``mapi`` -> ``mapi[ch]``.
    4. Half consonant: a character in ``lonsum`` -> ``lonsum[ch]``.
    5. Anything else (spaces, punctuation, unmapped characters) is copied
       unchanged.

    Args:
        line: The string to transliterate.
        mapi: Full-consonant table. Defaults to the notebook-level ``mapi_map``.
        lonsum: Half-consonant table. Defaults to the notebook-level
            ``lonsum_map``.
        vowels: Vowel table. Defaults to the notebook-level ``vowel_map``.
        virama: The joining character checked by the conjunct rule. Defaults
            to the notebook-level ``hasanta``.

    Returns:
        The transliterated string.
    """
    # Fall back to the notebook globals only when a table is not supplied, so
    # existing one-argument calls behave exactly as before while explicit
    # tables make the function usable (and testable) without hidden state.
    mapi = mapi_map if mapi is None else mapi
    lonsum = lonsum_map if lonsum is None else lonsum
    vowels = vowel_map if vowels is None else vowels
    virama = hasanta if virama is None else virama

    pieces = []  # collected output fragments; joined once at the end
    length = len(line)
    i = 0
    while i < length:
        ch = line[i]

        # Conjunct consonant: C + virama + C. Checked first so that it takes
        # priority over every single-character rule below.
        if (
            i + 2 < length
            and line[i + 1] == virama
            and ch in lonsum
            and line[i + 2] in mapi
        ):
            pieces.append(lonsum[ch] + mapi[line[i + 2]])
            i += 3
            continue

        if ch in vowels:
            followed_by_vowel = i + 1 < length and line[i + 1] in vowels
            if followed_by_vowel:
                # Consecutive vowels: prefer the lonsum form when one exists.
                pieces.append(lonsum.get(ch, vowels[ch]))
            else:
                pieces.append(vowels[ch])
        elif ch in mapi:
            # Full consonant
            pieces.append(mapi[ch])
        elif ch in lonsum:
            # Half consonant
            pieces.append(lonsum[ch])
        else:
            # Space or unknown character: pass through untouched
            pieces.append(ch)

        i += 1

    return "".join(pieces)

# Bengali sign virama (U+09CD); the transliterator looks for it between consonants
hasanta = '\u09CD'

# Lookup tables: line k of the first file maps to line k of the second file
mapi_map = load_mapping("complete_bengali.txt", "mapi_mayek.txt")    # full consonants
lonsum_map = load_mapping("half_consonant.txt", "lonsum.txt")        # half consonants
vowel_map = load_mapping("vowel.txt", "cheitap.txt")                 # vowels

# Transliterate every target sentence, keeping the original order
targets_mni = list(map(transliterate_syllable_aware, targets))

In [8]:
# Code point U+ABED, printed here to see how it renders
apun_iyek = chr(0xABED)
print("Apun Iyek symbol is:", apun_iyek)


Apun Iyek symbol is: ꯭


In [9]:
sources[4]

"A triceratops' teeth would have been able to crush not only leaves but even very tough branches and roots."

In [10]:
targets[4]

'ত্রাইসেরাতোপ অমগী ময়ানা উনাশিংদা নত্তনা য়াম্না কনবা উশাশিং অমসুং উরাশিংসু তকখাইবা ঙম্লম্মী।'

In [11]:
targets_mni[4]

'ꯇ্ꯔꯥꯢꯁꯦꯔꯥꯇꯣꯄ ꯑꯃꯒꯤ ꯃꯌꯥꯅꯥ ꯎꯅꯥꯁꯤꯪꯗꯥ ꯅꯇ্ꯇꯅꯥ ꯌꯥꯃ্ꯅꯥ ꯀꯅꯕꯥ ꯎꯁꯥꯁꯤꯪ ꯑꯃꯁꯨꯪ ꯎꯔꯥꯁꯤꯪꯁꯨ ꯇꯀꯈꯥꯢꯕꯥ ꯉꯃ্ꯂꯃ্ꯃꯤ꯫'

In [12]:
len(targets_mni)

997

# Manually edit all the incorrectly transliterated words.

### Load back into Python after editing

In [13]:
# Read the manually edited transliterations: one entry per line of the file,
# with leading/trailing whitespace (including the newline) removed.
with open("Transliterated_texts.txt", "r", encoding="utf-8") as f:
    edited_targets_mni = [row.strip() for row in f]

In [14]:
sources[0]

'Although most agencies are willing to take on most regular bookings, many agents specialise in particular types of travel, budget ranges or destinations.'

In [15]:
targets[0]

'অদুমওইনমক, অয়াম্বা এজেন্সীশিংনা মহৌশাগী বুকিংগুম্না অদুমক লৌনবা হোৎনৈ, এজেন্ত কয়ানা ত্রাভেল, বজেৎকী রেঞ্জ নৎত্রগা লমথুংফমগী অখন্নবা মওংদা হেন্না খুৎলোইবসু য়াওই।'

In [16]:
targets_mni[0]

'ꯑꯗꯨꯃꯑꯣꯢꯅꯃꯀ, ꯑꯌꯥꯃ্ꯕꯥ এꯖꯦꯅ্ꯁꯤꯁꯤꯪꯅꯥ ꯃꯍꯧꯁꯥꯒꯤ ꯕꯨꯀꯤꯪꯒꯨꯃ্ꯅꯥ ꯑꯗꯨꯃꯀ ꯂꯧꯅꯕꯥ ꯍꯣꯠꯅꯩ, এꯖꯦꯅ্ꯇ ꯀꯌꯥꯅꯥ ꯇ্ꯔꯥꯚꯦꯂ, ꯕꯖꯦꯠꯀꯤ ꯔꯦঞ্ꯖ ꯅꯠꯇ্ꯔꯒꯥ ꯂꯃꯊꯨꯪꯐꯃꯒꯤ ꯑꯈꯅ্ꯅꯕꯥ ꯃꯑꯣꯪꯗꯥ ꯍꯦꯅ্ꯅꯥ ꯈꯨꯠꯂꯣꯢꯕꯁꯨ ꯌꯥꯑꯣꯢ꯫'

In [17]:
edited_targets_mni[0]

'ꯑꯗꯨꯝꯑꯣꯢꯅꯃꯛ, ꯑꯌꯥꯝꯕ ꯑꯦꯖꯦꯟꯁꯤꯁꯤꯡꯅ ꯃꯍꯧꯁꯥꯒꯤ ꯕꯨꯀꯤꯡꯒꯨꯝꯅ ꯑꯗꯨꯃꯛ ꯂꯧꯅꯕꯥ ꯍꯣꯠꯅꯩ, ꯑꯦꯖꯦꯟꯠ ꯀꯌꯥꯅ ꯇ꯭ꯔꯥꯚꯦꯜ, ꯕꯖꯦꯠꯀꯤ ꯔꯦꯟꯖ ꯅꯠꯇ꯭ꯔꯒ ꯂꯝꯊꯨꯡꯐꯝꯒꯤ ꯑꯈꯟꯅꯕ ꯃꯑꯣꯡꯗ ꯍꯦꯟꯅ ꯈꯨꯠꯂꯣꯢꯕꯁꯨ ꯌꯥꯎꯢ꯫'

# Saving sources[] and edited_targets_mni[] to en_mni_dataset files
### .tsv (tab-separated values)

In [18]:
# One "<source><TAB><target>" record per line. zip pairs the lists by position
# and stops at the shorter one.
with open("en_mni_dataset.tsv", "w", encoding="utf-8") as f:
    f.writelines(
        f"{src}\t{tgt}\n" for src, tgt in zip(sources, edited_targets_mni)
    )


# Save as .csv (Comma-Separated)

In [19]:
import csv

# newline='' lets the csv module control line endings itself.
with open("en_mni_dataset.csv", "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    # Header row first, then one (English, MeiteiMayek) row per sentence pair.
    writer.writerow(["English", "MeiteiMayek"])
    writer.writerows(zip(sources, edited_targets_mni))


# Save as JSON lines

In [20]:
import json

# Write one JSON object per line with "source" and "target" keys.
with open("en_mni_dataset.jsonl", "w", encoding="utf-8") as f:
    for src, tgt in zip(sources, edited_targets_mni):
        # ensure_ascii=False writes non-ASCII text as real UTF-8 characters
        # instead of \uXXXX escapes; the parsed values are identical, but the
        # file stays human-readable and considerably smaller.
        record = json.dumps({"source": src, "target": tgt}, ensure_ascii=False)
        f.write(record + "\n")


In [21]:
import pandas as pd

In [22]:
# Read en_mni_dataset.csv back into a DataFrame and show the first rows
# to check that it parses into the expected columns.
df = pd.read_csv("en_mni_dataset.csv")
df.head()

Unnamed: 0,English,MeiteiMayek
0,Although most agencies are willing to take on ...,"ꯑꯗꯨꯝꯑꯣꯢꯅꯃꯛ, ꯑꯌꯥꯝꯕ ꯑꯦꯖꯦꯟꯁꯤꯁꯤꯡꯅ ꯃꯍꯧꯁꯥꯒꯤ ꯕꯨꯀꯤꯡꯒꯨꯝ..."
1,"If you're feeling more adventurous, take the o...","ꯑꯗꯣꯝꯅ ꯋꯥꯈꯜ ꯌꯥꯝꯅ ꯅꯨꯡꯉꯥꯢꯕ ꯑꯣꯢꯔꯒꯗꯤ, ꯁ꯭ꯃꯨꯊꯤ ꯅꯠꯇ꯭ꯔꯒ..."
2,"Airlines that offer these include Air Canada, ...",ꯍꯥꯢꯔꯤꯕ ꯀꯥꯟꯅꯕꯥ ꯑꯁꯤ ꯄꯤꯔꯤꯕ ꯑꯦꯌꯔꯂꯥꯢꯟꯁꯤꯡ ꯑꯗꯨꯗꯤ ꯑꯦꯌꯔ...
3,The number of users of the Yahoo! and Microsof...,ꯌꯥꯍꯨ! ꯑꯃꯁꯨꯡ ꯃꯥꯢꯀ꯭ꯔꯣꯁꯣꯐ ꯁꯔꯚꯤꯁꯦꯁꯀꯤ ꯌꯨꯖꯔꯁꯤꯡꯒꯤ ꯃꯁꯤ...
4,A triceratops' teeth would have been able to c...,ꯇ꯭ꯔꯥꯢꯁꯦꯔꯥꯇꯣꯞ ꯑꯃꯒꯤ ꯃꯌꯥꯅ ꯎꯅꯥꯁꯤꯡꯗ ꯅꯠꯇꯅ ꯌꯥꯝꯅ ꯀꯟꯕ ꯎ...


In [23]:
df.describe()

Unnamed: 0,English,MeiteiMayek
count,997,997
unique,997,997
top,Although most agencies are willing to take on ...,"ꯑꯗꯨꯝꯑꯣꯢꯅꯃꯛ, ꯑꯌꯥꯝꯕ ꯑꯦꯖꯦꯟꯁꯤꯁꯤꯡꯅ ꯃꯍꯧꯁꯥꯒꯤ ꯕꯨꯀꯤꯡꯒꯨꯝ..."
freq,1,1
