In [1]:
import os
from pprint import pprint

import polars as pl
from datasets import load_dataset, VerificationMode

In [2]:
# Location of the combined, cleaned parquet shards. Set the
# COMBINED_SHARDS_PATH environment variable to point at another copy of the
# file; when it is unset the original development path is used unchanged.
raw_file = os.environ.get(
    "COMBINED_SHARDS_PATH",
    "/home/alif/Codes/ml-eng-assessment/src/data/processed/cleaned_shards/combined_shards.parquet",
)

# scan_parquet returns a LazyFrame; later cells call show()/collect() on
# frames derived from it.
lf_raw = pl.scan_parquet(raw_file)

lf_raw.show()

src,tgt,prefix
str,str,str
"""Amboih, lahabau punye skandal …","""How has the 1MDB scandal damag…","""terjemah ke Inggeris: """
"""The following MERGE statement …","""Pernyataan MERGE berikut denga…","""terjemah ke Melayu: """
"""Pertama, pasang kebergantungan…","""Bah, kunuk sia mau kasi tau ma…","""terjemah ke sabah: """
"""对于任何混淆或误解，我深表歉意。根据提供的知识库，它似乎是在…","""I apologize for any confusion …","""terjemah ke Inggeris: """
"""Anda diberi tugas untuk melaks…","""You are tasked with implementi…","""terjemah ke Inggeris: """


In [3]:
# The displayed prefixes look like "terjemah ke <dialect>: ". Removing the
# colon and splitting on single spaces leaves an empty string as the last
# element (from the trailing space), so the dialect name is the
# second-to-last element. NOTE: this depends on the prefix ending with
# exactly one trailing space.
target_dialect_expr = (
    pl.col("prefix")
    .str.replace(":", "")
    .str.split(" ")
    .list.get(-2)
    .alias("dialects")
)

dialects = lf_raw.with_columns(target_dialect_expr)
dialects.show(20)


src,tgt,prefix,dialects
str,str,str,str
"""Amboih, lahabau punye skandal …","""How has the 1MDB scandal damag…","""terjemah ke Inggeris: ""","""Inggeris"""
"""The following MERGE statement …","""Pernyataan MERGE berikut denga…","""terjemah ke Melayu: ""","""Melayu"""
"""Pertama, pasang kebergantungan…","""Bah, kunuk sia mau kasi tau ma…","""terjemah ke sabah: ""","""sabah"""
"""对于任何混淆或误解，我深表歉意。根据提供的知识库，它似乎是在…","""I apologize for any confusion …","""terjemah ke Inggeris: ""","""Inggeris"""
"""Anda diberi tugas untuk melaks…","""You are tasked with implementi…","""terjemah ke Inggeris: ""","""Inggeris"""
"""Kami ni tengah dok usya aplika…","""We are working with an Ember.j…","""terjemah ke Inggeris: ""","""Inggeris"""
"""Create a function in Python th…","""Cipta fungsi dalam Python yang…","""terjemah ke Melayu: ""","""Melayu"""
"""You are tasked with implementi…","""Anda bertanggungjawab untuk me…","""terjemah ke Melayu: ""","""Melayu"""
"""You are given a Python script …","""Anda diberikan skrip Python ya…","""terjemah ke Melayu: ""","""Melayu"""
"""You are tasked with creating a…","""Anda diminta untuk membuat pro…","""terjemah ke Melayu: ""","""Melayu"""


In [4]:
# `dialects_only` is the same lazy frame as `dialects`; later cells refer
# to it by this name.
dialects_only = dialects
dialects_only.show(20)

# Total number of rows, obtained from the length of the "dialects" column.
total_rows = dialects_only.select(pl.col("dialects").len()).collect().item()
print(total_rows)




src,tgt,prefix,dialects
str,str,str,str
"""Amboih, lahabau punye skandal …","""How has the 1MDB scandal damag…","""terjemah ke Inggeris: ""","""Inggeris"""
"""The following MERGE statement …","""Pernyataan MERGE berikut denga…","""terjemah ke Melayu: ""","""Melayu"""
"""Pertama, pasang kebergantungan…","""Bah, kunuk sia mau kasi tau ma…","""terjemah ke sabah: ""","""sabah"""
"""对于任何混淆或误解，我深表歉意。根据提供的知识库，它似乎是在…","""I apologize for any confusion …","""terjemah ke Inggeris: ""","""Inggeris"""
"""Anda diberi tugas untuk melaks…","""You are tasked with implementi…","""terjemah ke Inggeris: ""","""Inggeris"""
"""Kami ni tengah dok usya aplika…","""We are working with an Ember.j…","""terjemah ke Inggeris: ""","""Inggeris"""
"""Create a function in Python th…","""Cipta fungsi dalam Python yang…","""terjemah ke Melayu: ""","""Melayu"""
"""You are tasked with implementi…","""Anda bertanggungjawab untuk me…","""terjemah ke Melayu: ""","""Melayu"""
"""You are given a Python script …","""Anda diberikan skrip Python ya…","""terjemah ke Melayu: ""","""Melayu"""
"""You are tasked with creating a…","""Anda diminta untuk membuat pro…","""terjemah ke Melayu: ""","""Melayu"""


1247965


In [13]:
import re
from collections import Counter, defaultdict
import polars as pl
import json

# Materialise the frame when it exposes `collect` (i.e. it is lazy);
# otherwise use it as it is.
if hasattr(dialects_only, "collect"):
    df = dialects_only.collect()
else:
    df = dialects_only

def normalize_words(text):
    """Split ``text`` into lowercase ASCII word tokens of 3 to 12 letters.

    The text is lower-cased and every maximal run of the letters a-z is taken
    as a token; digits, punctuation and non-ASCII characters act as
    separators. Tokens shorter than 3 or longer than 12 letters are dropped.

    Args:
        text: The string to tokenise. ``None`` (for example a null cell
            yielded while iterating rows) and the empty string are accepted.

    Returns:
        A list of tokens in order of appearance, duplicates included; an
        empty list when ``text`` is ``None`` or empty.
    """
    # A missing value has no ``.lower()``; treat it as containing no words
    # instead of raising AttributeError in the middle of a long row loop.
    if not text:
        return []
    return [
        token
        for token in re.findall(r"[a-z]+", text.lower())
        if 3 <= len(token) <= 12
    ]

# Per-dialect vocabulary: for each value of the "dialects" column, how often
# every normalised token occurs in the "tgt" text of rows with that label.
dialect_word_counts = defaultdict(Counter)
for record in df.iter_rows(named=True):
    dialect_word_counts[record["dialects"]].update(normalize_words(record["tgt"]))

# For every token, the number of distinct dialects whose vocabulary has it.
# Each vocabulary yields a token at most once, so the tally counts dialects.
word_dialect_count = Counter(
    token
    for vocab in dialect_word_counts.values()
    for token in vocab
)

# Selection thresholds for the per-dialect word lists.
MIN_FREQ = 30      # minimum occurrences of a token within one dialect
MAX_DIALECTS = 2   # a token may appear in at most this many dialects
MAX_WORDS = 150    # cap on the number of tokens kept per dialect

# most_common() orders tokens by descending frequency and keeps first-seen
# order among ties; filtering afterwards preserves that order.
dialect_dict = {
    name: [
        token
        for token, freq in vocab.most_common()
        if freq >= MIN_FREQ and word_dialect_count[token] <= MAX_DIALECTS
    ][:MAX_WORDS]
    for name, vocab in dialect_word_counts.items()
}

print("Dialect dictionaries:\n")
for name in sorted(dialect_dict):
    kept_words = dialect_dict[name]
    print(f"{name:15s} | {len(kept_words):4d} words | sample -> {kept_words[:15]}")


# Persist the word lists next to the notebook.
with open("dialect_dict.json", "w") as out_file:
    json.dump(dialect_dict, out_file, indent=4)


Dialect dictionaries:

Inggeris        |  150 words | sample -> ['given', 'have', 'these', 'why', 'there', 'knowledge', 'will', 'provide', 'which', 'their', 'also', 'its', 'need', 'such', 'been']
Melayu          |  150 words | sample -> ['bagaimanakah', 'bolehkah', 'terdapat', 'tetapi', 'bertanya', 'melakukannya', 'dahulu', 'menyaring', 'sahaja', 'ditambahkan', 'jujukan', 'terkecil', 'menjana', 'faktorial', 'mengurutkan']
johor           |   53 words | sample -> ['selapnyer', 'mencangut', 'kemahak', 'bingit', 'hempok', 'dusa', 'mengkang', 'hampang', 'lokoh', 'akak', 'biol', 'menggelupor', 'tuto', 'apenye', 'ponge']
kedah           |   75 words | sample -> ['macamana', 'pasaipa', 'baloq', 'loqlaq', 'kawaq', 'awat', 'satgi', 'hampa', 'punyalaa', 'ligan', 'gelong', 'mampoih', 'punyala', 'belajaq', 'pasempa']
kelantan        |  150 words | sample -> ['bakpo', 'ambo', 'kelate', 'cakno', 'pehe', 'macey', 'tubik', 'guano', 'mugo', 'rhoyak', 'gedebe', 'selok', 'nnapah', 'kelih', 'nnakut']
mela

In [14]:
# Set form of every dialect's word list, used by detect_dialect for
# intersection with a text's tokens.
dialect_dict_sets = {name: set(word_list) for name, word_list in dialect_dict.items()}

def detect_dialect(text):
    """Pick the dialect whose word set overlaps most with ``text``.

    The text is tokenised with ``normalize_words`` and the distinct tokens
    are intersected with every entry of ``dialect_dict_sets``. The dialect
    with the largest overlap is returned; when several dialects tie, the one
    that comes first in ``dialect_dict_sets`` wins. If no dialect shares a
    single token with the text, the string "Unknown" is returned.
    """
    tokens = set(normalize_words(text))
    overlap_by_dialect = {
        name: len(tokens & vocab) for name, vocab in dialect_dict_sets.items()
    }
    if not overlap_by_dialect:
        return "Unknown"
    # max() returns the first maximal key in iteration order, which matches
    # a strict "greater than" scan over the same dictionary.
    winner = max(overlap_by_dialect, key=overlap_by_dialect.get)
    return winner if overlap_by_dialect[winner] > 0 else "Unknown"


In [18]:
# Run the vocabulary-based detector over both sides of every pair.
# "predicted_tgt" has to be created here: the select below and the later
# "Unknown" filter both read it, and no earlier cell defines it.
# return_dtype is given explicitly because the frame is lazy and the output
# type of a Python callback cannot be inferred from the query plan.
transformed = (
    dialects_only.with_columns(
        pl.col("src").map_elements(detect_dialect, return_dtype=pl.String).alias("predicted_src"),
        pl.col("tgt").map_elements(detect_dialect, return_dtype=pl.String).alias("predicted_tgt"),
    ).select(
        "src",
        "tgt",
        "dialects",
        "predicted_tgt",
        "predicted_src",
    )
)

In [16]:
# Keep only rows where the detector produced a dialect for both the source
# and the target text.
src_is_known = pl.col("predicted_src") != 'Unknown'
tgt_is_known = pl.col("predicted_tgt") != 'Unknown'
lel = transformed.filter(src_is_known & tgt_is_known)

lel.show()

src,tgt,dialects,predicted_tgt,predicted_src
str,str,str,str,str
"""Amboih, lahabau punye skandal …","""How has the 1MDB scandal damag…","""Inggeris""","""Inggeris""","""melaka"""
"""The following MERGE statement …","""Pernyataan MERGE berikut denga…","""Melayu""","""Melayu""","""Inggeris"""
"""You are tasked with implementi…","""Anda bertanggungjawab untuk me…","""Melayu""","""Melayu""","""Inggeris"""
"""You are tasked with creating a…","""Anda diminta untuk membuat pro…","""Melayu""","""Melayu""","""Inggeris"""
"""In Python, I have an enumerati…","""Dalam Python, saya mempunyai s…","""Melayu""","""Melayu""","""Inggeris"""


In [17]:
# Number of rows that survived the "Unknown" filter.
known_pair_rows = lel.select(pl.col("predicted_src").len())
known_pair_rows.collect().item()

343652

In [21]:
# Label every row "<predicted source dialect>_<target dialect from prefix>"
# and count how many rows fall under each label, largest first.
pair_label_expr = pl.concat_str(
    [pl.col("predicted_src"), pl.col("dialects")],
    separator="_",
).alias("src_tgt_pair")

pair_counts = (
    lel.with_columns(pair_label_expr)
    .group_by("src_tgt_pair")
    .agg(pl.col("src_tgt_pair").count().alias("src_tgt_pair_count"))
    .sort("src_tgt_pair_count", descending=True)
)
pair_counts.show(100)

src_tgt_pair,src_tgt_pair_count
str,u32
"""Melayu_Inggeris""",123160
"""Inggeris_Melayu""",122040
"""Melayu_Melayu""",52471
"""Inggeris_Inggeris""",3722
"""sarawak_Inggeris""",2017
"""Inggeris_sarawak""",1989
"""terengganu_Inggeris""",1984
"""sabah_Inggeris""",1791
"""kelantan_Inggeris""",1760
"""Inggeris_sabah""",1738
