#Membuat Korpus Paralel Menggunakan RBMT

##Import Library

In [None]:
import csv
import requests
import io

##Inisialisasi Variabel dan Dataset Penerjemah

In [None]:
sunda_csv_url = "https://raw.githubusercontent.com/Bhayazeed/MT-Sunda/main/indo_sunda_dictionary.csv"
input_csv_url = "https://raw.githubusercontent.com/Bhayazeed/MT-Sunda/main/target_MT.csv"
output_csv_path = "translated_output.csv"

##Fungsi Load Lexicon/Kamus

In [None]:
def load_indonesian_sunda_lexicon(url, timeout=30):
    """Download a dictionary CSV and build an Indonesian -> Sundanese lookup.

    The CSV is read with ``csv.DictReader`` and must have ``Indonesian`` and
    ``Sundanese`` header columns. An Indonesian cell may list several
    comma-separated variants; each one becomes its own key. A Sundanese cell
    may list '/'-separated alternatives; only the first non-empty one is used.
    Keys and values are lower-cased. When the same Indonesian variant occurs
    more than once, the last row wins.

    Args:
        url: Address of the CSV file, fetched with an HTTP GET.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        dict mapping Indonesian words to Sundanese words. Loading is
        best-effort: on any error a message is printed and whatever was
        collected so far (possibly an empty dict) is returned.
    """
    lexicon = {}
    try:
        # Without a timeout requests may wait forever on an unresponsive host.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        text = response.text
        reader = csv.DictReader(io.StringIO(text))

        for row in reader:
            # DictReader fills the missing cells of a short row with None;
            # treat those as empty instead of failing on .strip().
            sunda_word = (row['Sundanese'] or '').strip()
            indonesian_word = (row['Indonesian'] or '').strip()

            # Use the first Sundanese word when alternatives are separated
            # by '/', ignoring empty pieces such as a leading '/'.
            sunda_variants = [part.strip().lower() for part in sunda_word.split('/')]
            first_sunda = next((part for part in sunda_variants if part), '')
            if not first_sunda:
                # Mapping a word to '' would make the translator erase it.
                continue

            for variant in indonesian_word.split(','):
                indonesian_variant = variant.strip().lower()
                if indonesian_variant:
                    lexicon[indonesian_variant] = first_sunda

    except Exception as e:
        # Deliberately broad: the caller receives the (partial) lexicon and
        # the problem is reported on stdout rather than raised.
        print(f"❌ Gagal memuat lexicon: {e}")
    return lexicon

`def load_indonesian_sunda_lexicon(url):`

def: mendefinisikan fungsi bernama load_indonesian_sunda_lexicon

url: parameter string berisi alamat CSV publik (mis. GitHub raw URL)

    lexicon = {}
Membuat dictionary kosong untuk menampung pasangan indonesian → sundanese.

    try:
        response = requests.get(url)
        response.raise_for_status()

requests.get(url): kirim HTTP GET untuk ambil isi CSV dari url.

raise_for_status(): jika kode status menandakan kegagalan (4xx atau 5xx), akan memunculkan `HTTPError` sehingga eksekusi masuk ke blok except.


        text = response.text
Ambil isi teks respons (CSV) sebagai satu string.


        reader = csv.DictReader(io.StringIO(text))
io.StringIO(text): ubah string CSV jadi file-like object.

csv.DictReader(...): baca CSV, tiap baris jadi dict dengan key sesuai header: ['Indonesian','Sundanese'].


        for row in reader:
            sunda_word = row['Sundanese'].strip()
            indonesian_word = row['Indonesian'].strip()
Loop tiap baris, ambil:

row['Sundanese']: kata dalam bahasa Sunda

row['Indonesian']: kata dalam Bahasa Indonesia

strip(): hapus spasi ekstra di awal/akhir.

            # Gunakan kata Sunda pertama jika dipisah '/'
            sunda_variants = sunda_word.split('/')
            first_sunda = sunda_variants[0].strip().lower()
Jika di CSV satu cell Sunda berisi “neda/sangu”, kita split by / dan ambil varian pertama, lalu lowercase.

            for variant in indonesian_word.split(','):
                indonesian_variant = variant.strip().lower()
                if indonesian_variant:
                    lexicon[indonesian_variant] = first_sunda
Untuk cell Indonesia yang mungkin berisi banyak variant misalnya “makan, makanlah”:

1. split per `,`

2. `strip()` + `lower()`

3. Simpan ke lexicon: `lexicon[indonesian_variant] = first_sunda` (varian Indonesia → kata Sunda terpilih)

    except Exception as e:
        print(f"❌ Gagal memuat lexicon: {e}")
    return lexicon
    
Jika error (URL salah, CSV corrupt, dsb.) akan dicetak pesan.

Fungsi selalu mengembalikan dictionary lexicon; jika terjadi error, isinya bisa kosong atau hanya terisi sebagian.

##Fungsi RBMT

In [None]:
def rbmt(sentence, lexicon):
    """Translate a sentence word by word using a lookup table.

    The sentence is lower-cased and split on whitespace. Every token found
    in ``lexicon`` is replaced by its mapped value; tokens without an entry
    are kept as they are (in lower case). The result is joined with single
    spaces, so runs of whitespace in the input collapse.

    Args:
        sentence: Text to translate.
        lexicon: Mapping from source word to target word.

    Returns:
        The translated sentence as a single string.
    """
    output_words = []
    for word in sentence.lower().split():
        # Fall back to the word itself when the lexicon has no entry.
        output_words.append(lexicon.get(word, word))
    return ' '.join(output_words)

##Kode Utama

In [2]:
# Script entry point: download the lexicon and the input CSV, translate every
# row's 'text' column with rbmt(), and write id / original / translation to
# output_csv_path while echoing each result to stdout.
if __name__ == "__main__":
    try:
        # Load the lexicon from its URL.
        indonesian_sunda_lexicon = load_indonesian_sunda_lexicon(sunda_csv_url)
        if not indonesian_sunda_lexicon:
            # The loader reports failures by printing and returning an empty
            # dict. Translating with no lexicon would only copy the input in
            # lower case and still report success, so stop here instead.
            raise RuntimeError("lexicon kosong, terjemahan dibatalkan")

        # Fetch the input file from GitHub; the timeout keeps an unresponsive
        # host from blocking the run forever.
        response = requests.get(input_csv_url, timeout=30)
        response.raise_for_status()
        input_text = response.text

        reader = csv.DictReader(io.StringIO(input_text))

        # newline='' lets the csv module control line endings itself.
        with open(output_csv_path, mode='w', encoding='utf-8', newline='') as outfile:
            fieldnames = ['id', 'original_text', 'translated_text']
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()

            print("=== TERJEMAHAN DIMULAI ===\n")
            for row in reader:
                original_text = row['text']
                translated_text = rbmt(original_text, indonesian_sunda_lexicon)

                writer.writerow({
                    'id': row['id'],
                    'original_text': original_text,
                    'translated_text': translated_text
                })

                print(f"ID {row['id']}:")
                print(f"Original: {original_text}")
                print(f"Translated: {translated_text}")
                print("-" * 50)

        print(f"\n✅ Hasil terjemahan disimpan di: {output_csv_path}")

    except Exception as e:
        # Top-level boundary: report any failure instead of showing a traceback.
        print(f"❌ Terjadi kesalahan: {e}")


Streaming output truncated to the last 5000 lines.
Translated: truk . they hanya started operating there , bangunan hareup hotel acapella . kedai asal deukeut susukan buloh . emang ti heubeul makanan barat manehna tk pernah mengecewakan !
--------------------------------------------------
ID 11512:
Original: anda ke bandung ? jika tidak coba masakan queen restorant ? sangat-sangat sayang . queen restoran sudah sejak lama sekali . mungkin di atas 40 tahun lalu sudah ada
Translated: anda ke bandung ? jika teu cobi masakan queen restorant ? sangat-sangat tresna . queen restoran enggeus sejak lila sekali . meureun di luhur 40 tahun lalu enggeus aya
--------------------------------------------------
ID 11513:
Original: minta dalil agama soal haram nya lgbt , jawaban karni ilyas bungkam ade armando
Translated: pundut dalil agama soal haram nya lgbt , jawaban karni ilyas cicing ade armando
--------------------------------------------------
ID 11514:
Original: karena enggan kelua

In [1]:
import pandas as pd
from collections import defaultdict

# 1. Read the parallel corpus CSV
df = pd.read_csv('https://raw.githubusercontent.com/Bhayazeed/MT-Sunda/refs/heads/main/translated_output.csv')  # Replace with the path to your file

# 2. Nested counter: translation_dict[source_word][target_word] -> number of
#    times the two words appeared in the same corpus row
translation_dict = defaultdict(lambda: defaultdict(int))

# 3. Walk the corpus row by row and pair every source token with every
#    target token of the same row
for _, row in df.iterrows():
    source_words = str(row['original_text']).lower().split()
    target_words = str(row['translated_text']).lower().split()
    if not target_words:
        # Nothing to pair with; skipping avoids creating empty entries.
        continue
    for source_word in source_words:
        pair_counts = translation_dict[source_word]
        for target_word in target_words:
            pair_counts[target_word] += 1

# 4. Print P(w2|w1): each pair count divided by the total count recorded for
#    the source word
for source_word, pair_counts in translation_dict.items():
    denominator = sum(pair_counts.values())
    for target_word, count in pair_counts.items():
        print(f"P({target_word}|{source_word}) = {count / denominator}")


Streaming output truncated to the last 5000 lines.
P(masih|sasirangan) = 0.022727272727272728
P(loba|sasirangan) = 0.022727272727272728
P(lagi|sasirangan) = 0.022727272727272728
P(yang|sasirangan) = 0.022727272727272728
P(sejena|sasirangan) = 0.022727272727272728
P(.|sasirangan) = 0.022727272727272728
P(di|jukung) = 0.022727272727272728
P(pesona|jukung) = 0.022727272727272728
P(festival|jukung) = 0.06818181818181818
P(pasar|jukung) = 0.022727272727272728
P(terapung|jukung) = 0.022727272727272728
P(,|jukung) = 0.1590909090909091
P(aya|jukung) = 0.022727272727272728
P(juga|jukung) = 0.022727272727272728
P(permainan|jukung) = 0.022727272727272728
P(tradisional|jukung) = 0.045454545454545456
P(balogo|jukung) = 0.022727272727272728
P(kaen|jukung) = 0.022727272727272728
P(sasirangan|jukung) = 0.022727272727272728
P(kompetisi|jukung) = 0.022727272727272728
P(chef|jukung) = 0.022727272727272728
P(hotel|jukung) = 0.022727272727272728
P(dan|jukung) = 0.045454545454545456
P(restoran