In [1]:
from datasets import load_dataset

ds = load_dataset("shenasa/English-Persian-Parallel-Dataset")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

dataset.tsv:   0%|          | 0.00/872M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3960172 [00:00<?, ? examples/s]

In [3]:
# Basic dataset info
print(ds)

# Check available splits
print(ds.keys())

# Check column names
print(ds["train"].column_names)

# Show dataset size
print("Train size:", len(ds["train"]))


DatasetDict({
    train: Dataset({
        features: ['flash fire .', 'فلاش آتش .'],
        num_rows: 3960172
    })
})
dict_keys(['train'])
['flash fire .', 'فلاش آتش .']
Train size: 3960172


In [5]:
# Print a few samples using raw column names
col_en, col_fa = ds["train"].column_names

for i in range(5):
    print("EN:", ds["train"][i][col_en])
    print("FA:", ds["train"][i][col_fa])
    print("-" * 50)


EN: superheats the air . burns the lungs like rice paper .
FA: هوا را فوق العاده گرم می کند . ریه ها را مثل کاغذ برنج می سوزاند .
--------------------------------------------------
EN: hey , guys . down here . down here .
FA: سلام بچه ها . این پایین . این پایین .
--------------------------------------------------
EN: what do you got down this corridor is the bow , right .
FA: چه چیزی در این راهرو پایین آمده است ، درست است .
--------------------------------------------------
EN: theres an access hatch right there that puts us into the bowthruster room .
FA: یک دریچه دسترسی درست در آنجا وجود دارد که ما را وارد اتاق کمان می کند .
--------------------------------------------------
EN: we get into the propeller tubes and the only thing between us and the outside .
FA: وارد لوله های پروانه می شویم و تنها چیزی که بین ما و بیرون است .
--------------------------------------------------


In [6]:
ds = ds.rename_column(col_en, "en")
ds = ds.rename_column(col_fa, "fa")

print(ds["train"].column_names)


['en', 'fa']


In [8]:
from datasets import DatasetDict

# Shuffle once with a fixed seed for reproducibility
ds_shuffled = ds["train"].shuffle(seed=42)

# Select only what we need: 24k total
small_ds = ds_shuffled.select(range(24000))

print("Subset size:", len(small_ds))


Subset size: 24000


In [10]:
for i in range(3):
    print("EN:", train_set[i]["en"])
    print("FA:", train_set[i]["fa"])
    print("-" * 50)


EN: http://www.youtradefx.com/ 29-Sep-2015 17:55:12 GMT
FA: http :// www . youtradefx . com / 29 - سپتامبر - 2015 17 : 55 : 12 GMT
--------------------------------------------------
EN: B.158
FA: B.158
--------------------------------------------------
EN: ...and maybe a mobile,you know, over the crib...
FA: ، و شاید از این چیز آویزون ها می دونی ، بالای گهواره
--------------------------------------------------


In [11]:
import re

def is_good_pair(en, fa):
    en = en.strip()
    fa = fa.strip()

    # 1. empty or identical
    if not en or not fa:
        return False
    if en == fa:
        return False

    # 2. URLs, emails
    if re.search(r"http|www\.|\.com|\.org|\.net", en.lower()):
        return False
    if re.search(r"http|www\.|\.com|\.org|\.net", fa.lower()):
        return False

    # 3. too many numbers (dates, IDs, timestamps)
    if len(re.findall(r"\d", en)) > 6:
        return False
    if len(re.findall(r"\d", fa)) > 6:
        return False

    # 4. token length
    en_len = len(en.split())
    fa_len = len(fa.split())

    if en_len < 4 or fa_len < 4:
        return False
    if en_len > 50 or fa_len > 50:
        return False

    # 5. language sanity
    # Persian must contain Persian characters
    if not re.search(r"[\u0600-\u06FF]", fa):
        return False

    # English should contain Latin letters
    if not re.search(r"[a-zA-Z]", en):
        return False

    return True


In [12]:
print("Before cleaning:", len(small_ds))

cleaned_ds = small_ds.filter(
    lambda x: is_good_pair(x["en"], x["fa"])
)

print("After cleaning:", len(cleaned_ds))
print("Removed:", len(small_ds) - len(cleaned_ds))
print("Remaining %:", round(len(cleaned_ds) / len(small_ds) * 100, 2))


Before cleaning: 24000


Filter:   0%|          | 0/24000 [00:00<?, ? examples/s]

After cleaning: 17240
Removed: 6760
Remaining %: 71.83


In [13]:
for i in range(5):
    print("EN:", cleaned_ds[i]["en"])
    print("FA:", cleaned_ds[i]["fa"])
    print("-" * 60)


EN: 1 Renmin Road (S), Section 2
FA: 1 جاده رنمین ( S )، بخش 2
------------------------------------------------------------
EN: After the chicken was put on paper towels, she went on out to the back porch with her guitar, sat down, and began to play.
FA: بعد از اینکه مرغ را روی دستمال کاغذی گذاشتند ، با گیتار به ایوان پشتی رفت و نشست و شروع به نواختن کرد .
------------------------------------------------------------
EN: To purchase the goods please choose the preferable way.
FA: برای خرید کالا لطفا راه ترجیحی را انتخاب کنید .
------------------------------------------------------------
EN: Ottawa-Hull 819994 **** Phone
FA: Ottawa - Hull 819994 **** تلفن
------------------------------------------------------------
EN: A venture capital fund management company is a company that only manages venture capital funds.
FA: شرکت مدیریت صندوق سرمایه گذاری خطرپذیر شرکتی است که فقط صندوق های سرمایه گذاری خطرپذیر را مدیریت می کند .
------------------------------------------------------------


In [14]:
def is_better_pair(en, fa):
    # reuse previous filter
    if not is_good_pair(en, fa):
        return False

    # ratio of letters to total length
    en_letters = len(re.findall(r"[a-zA-Z]", en))
    fa_letters = len(re.findall(r"[\u0600-\u06FF]", fa))

    if en_letters / max(len(en), 1) < 0.4:
        return False
    if fa_letters / max(len(fa), 1) < 0.4:
        return False

    return True


print("Before extra cleaning:", len(cleaned_ds))

cleaned_ds_v2 = cleaned_ds.filter(
    lambda x: is_better_pair(x["en"], x["fa"])
)

print("After extra cleaning:", len(cleaned_ds_v2))
print("Removed:", len(cleaned_ds) - len(cleaned_ds_v2))
print("Remaining %:", round(len(cleaned_ds_v2) / len(cleaned_ds) * 100, 2))


Before extra cleaning: 17240


Filter:   0%|          | 0/17240 [00:00<?, ? examples/s]

After extra cleaning: 16347
Removed: 893
Remaining %: 94.82


In [15]:
for i in range(5):
    print("EN:", cleaned_ds_v2[i]["en"])
    print("FA:", cleaned_ds_v2[i]["fa"])
    print("-" * 60)


EN: 1 Renmin Road (S), Section 2
FA: 1 جاده رنمین ( S )، بخش 2
------------------------------------------------------------
EN: After the chicken was put on paper towels, she went on out to the back porch with her guitar, sat down, and began to play.
FA: بعد از اینکه مرغ را روی دستمال کاغذی گذاشتند ، با گیتار به ایوان پشتی رفت و نشست و شروع به نواختن کرد .
------------------------------------------------------------
EN: To purchase the goods please choose the preferable way.
FA: برای خرید کالا لطفا راه ترجیحی را انتخاب کنید .
------------------------------------------------------------
EN: A venture capital fund management company is a company that only manages venture capital funds.
FA: شرکت مدیریت صندوق سرمایه گذاری خطرپذیر شرکتی است که فقط صندوق های سرمایه گذاری خطرپذیر را مدیریت می کند .
------------------------------------------------------------
EN: he can he can stay .
FA: او می تواند او می تواند بماند .
------------------------------------------------------------


In [17]:
# First split: test set (2k)
train_val_test = cleaned_ds_v2.train_test_split(test_size=2000, seed=42)

temp_train_val = train_val_test["train"]
test_set = train_val_test["test"]

# Second split: train / validation
train_val = temp_train_val.train_test_split(
    test_size=2000,
    seed=42
)

train_set = train_val["train"]
validation_set = train_val["test"]

print("Train size:", len(train_set))
print("Validation size:", len(validation_set))
print("Test size:", len(test_set))


Train size: 12347
Validation size: 2000
Test size: 2000


In [18]:
import os

os.makedirs("smt_data", exist_ok=True)

def write_parallel(dataset, src_lang, tgt_lang, prefix):
    with open(f"smt_data/{prefix}.{src_lang}", "w", encoding="utf-8") as f_src, \
         open(f"smt_data/{prefix}.{tgt_lang}", "w", encoding="utf-8") as f_tgt:
        for x in dataset:
            f_src.write(x[src_lang].strip() + "\n")
            f_tgt.write(x[tgt_lang].strip() + "\n")

# Write files
write_parallel(train_set, "en", "fa", "train")
write_parallel(validation_set, "en", "fa", "valid")
write_parallel(test_set, "en", "fa", "test")

print("Files written:")
print(os.listdir("smt_data"))


Files written:
['valid.en', 'valid.fa', 'test.en', 'train.fa', 'test.fa', 'train.en']


In [19]:
# Show first 3 lines from each file
for split in ["train", "valid", "test"]:
    print(f"\n--- {split.upper()} ---")
    with open(f"smt_data/{split}.en", encoding="utf-8") as f:
        print("EN:", f.readline().strip())
    with open(f"smt_data/{split}.fa", encoding="utf-8") as f:
        print("FA:", f.readline().strip())



--- TRAIN ---
EN: New artistic software is now available for download! AKVIS is glad to announce the release of AKVIS Draw.
FA: نرم افزار هنری جدید هم اکنون برای دانلود در دسترس است ! AKVIS خوشحال است که انتشار AKVIS Draw را اعلام می کند .

--- VALID ---
EN: Guadalajara 331492 **** Phone
FA: گوادالاخارا 331492 **** تلفن

--- TEST ---
EN: UVS-10 devices - products to transform the applied force of gravity to the mass of the weighed load.
FA: دستگاه های UVS - 10 - محصولاتی برای تبدیل نیروی گرانش اعمال شده به جرم بار وزن شده .


In [20]:
!apt-get update -qq
!apt-get install -y mosesdecoder


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
E: Unable to locate package mosesdecoder


In [22]:
%%bash

for split in train valid test
do
  cat smt_data/$split.en | \
  perl /usr/share/mosesdecoder/scripts/tokenizer/tokenizer.perl \
    -l en \
    -threads 2 \
  > smt_data/$split.tok.en
done

echo "English tokenization done."


English tokenization done.


Can't open perl script "/usr/share/mosesdecoder/scripts/tokenizer/tokenizer.perl": No such file or directory
Can't open perl script "/usr/share/mosesdecoder/scripts/tokenizer/tokenizer.perl": No such file or directory
Can't open perl script "/usr/share/mosesdecoder/scripts/tokenizer/tokenizer.perl": No such file or directory


In [23]:
!echo "Before:"
!head -n 2 smt_data/train.en

!echo -e "\nAfter:"
!head -n 2 smt_data/train.tok.en


Before:
New artistic software is now available for download! AKVIS is glad to announce the release of AKVIS Draw.
However I believe that the

After:


In [24]:
!pip install sacremoses


Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [25]:
from sacremoses import MosesTokenizer

mt = MosesTokenizer(lang="en")

def tokenize_en_file(in_path, out_path):
    with open(in_path, encoding="utf-8") as fin, \
         open(out_path, "w", encoding="utf-8") as fout:
        for line in fin:
            fout.write(" ".join(mt.tokenize(line.strip())) + "\n")

for split in ["train", "valid", "test"]:
    tokenize_en_file(
        f"smt_data/{split}.en",
        f"smt_data/{split}.tok.en"
    )

print("English tokenization done (sacremoses).")


English tokenization done (sacremoses).


In [26]:
print("Before:")
with open("smt_data/train.en", encoding="utf-8") as f:
    for _ in range(2):
        print(f.readline().strip())

print("\nAfter:")
with open("smt_data/train.tok.en", encoding="utf-8") as f:
    for _ in range(2):
        print(f.readline().strip())


Before:
New artistic software is now available for download! AKVIS is glad to announce the release of AKVIS Draw.
However I believe that the

After:
New artistic software is now available for download ! AKVIS is glad to announce the release of AKVIS Draw .
However I believe that the


In [28]:
import os

os.makedirs("align_data", exist_ok=True)

def write_fastalign_file(dataset, path):
    with open(path, "w", encoding="utf-8") as f:
        for ex in dataset:
            en = ex["en"].strip()
            fa = ex["fa"].strip()
            if en and fa:
                f.write(f"{en} ||| {fa}\n")

write_fastalign_file(train_set, "align_data/train.fa_en")
write_fastalign_file(validation_set, "align_data/valid.fa_en")
write_fastalign_file(test_set,  "align_data/test.fa_en")

print("Files created:")
!wc -l align_data/*.fa_en


Files created:
   2000 align_data/test.fa_en
  12347 align_data/train.fa_en
   2000 align_data/valid.fa_en
  16347 total


In [29]:
%%bash
git clone https://github.com/clab/fast_align.git
cd fast_align
mkdir build
cd build
cmake ..
make


-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Could NOT find SparseHash (missing: SPARSEHASH_INCLUDE_DIR) 
-- Configuring done (1.3s)
-- Generating done (0.0s)
-- Build files have been written to: /content/fast_align/build
[ 16%] Building CXX object CMakeFiles/fast_align.dir/src/fast_align.cc.o
[ 33%] Building CXX object CMakeFiles/fast_align.dir/src/ttables.cc.o
[ 50%] Linking CXX executable fast_align
[ 50%] Built target fast_align
[ 66%] Building CXX object CMakeFiles/atools.dir/src/alignment_io.cc.o
[ 83%] Building CXX object CM

Cloning into 'fast_align'...
  cmake_minimum_required() should be called prior to this top-level project()
  call.  Please see the cmake-commands(7) manual for usage documentation of
  both commands.

  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value.  Or, use the <min>...<max> syntax
  to tell CMake that the project requires at least <min> but has been updated
  to work with policies introduced by <max> or earlier.




In [30]:
%%bash
./fast_align/build/fast_align \
  -i align_data/train.fa_en \
  -d -o -v > align_data/forward.align


ARG=i
ARG=d
ARG=o
ARG=v
INITIAL PASS 
............
expected target length = source length * 1.17485
ITERATION 1
............
  log_e likelihood: -4.5414e+06
  log_2 likelihood: -6.55186e+06
     cross entropy: 29.8974
        perplexity: 1e+09
      posterior p0: 0.08
 posterior al-feat: -0.17219
       size counts: 921
ITERATION 2
............
  log_e likelihood: -1.35102e+06
  log_2 likelihood: -1.94911e+06
     cross entropy: 8.89414
        perplexity: 475.775
      posterior p0: 0.0734648
 posterior al-feat: -0.135132
       size counts: 921
  1  model al-feat: -0.147893 (tension=4)
  2  model al-feat: -0.142688 (tension=4.25521)
  3  model al-feat: -0.139728 (tension=4.40631)
  4  model al-feat: -0.137971 (tension=4.49823)
  5  model al-feat: -0.136903 (tension=4.55501)
  6  model al-feat: -0.136242 (tension=4.59042)
  7  model al-feat: -0.135831 (tension=4.61262)
  8  model al-feat: -0.135573 (tension=4.62658)
     final tension: 4.63539
ITERATION 3
............
  log_e likeliho

In [31]:
%%bash
./fast_align/build/fast_align \
  -i align_data/train.fa_en \
  -d -o -v -r > align_data/reverse.align


ARG=i
ARG=d
ARG=o
ARG=v
ARG=r
INITIAL PASS 
............
expected target length = source length * 0.897335
ITERATION 1
............
  log_e likelihood: -3.96434e+06
  log_2 likelihood: -5.71933e+06
     cross entropy: 29.8974
        perplexity: 1e+09
      posterior p0: 0.08
 posterior al-feat: -0.173789
       size counts: 921
ITERATION 2
............
  log_e likelihood: -1.36233e+06
  log_2 likelihood: -1.96543e+06
     cross entropy: 10.2741
        perplexity: 1238.29
      posterior p0: 0.0637674
 posterior al-feat: -0.125729
       size counts: 921
  1  model al-feat: -0.196448 (tension=4)
  2  model al-feat: -0.161182 (tension=5.41437)
  3  model al-feat: -0.147287 (tension=6.12344)
  4  model al-feat: -0.139829 (tension=6.55458)
  5  model al-feat: -0.135309 (tension=6.83658)
  6  model al-feat: -0.132387 (tension=7.02818)
  7  model al-feat: -0.130423 (tension=7.16134)
  8  model al-feat: -0.129071 (tension=7.25522)
     final tension: 7.32205
ITERATION 3
............
  log_e

In [32]:
with open("align_data/train.fa_en", encoding="utf-8") as f_data, \
     open("align_data/forward.align") as f_align:

    for i in range(5):
        print("SENTENCE:")
        print(f_data.readline())
        print("ALIGNMENT:")
        print(f_align.readline())
        print("-" * 50)


SENTENCE:
New artistic software is now available for download! AKVIS is glad to announce the release of AKVIS Draw. ||| نرم افزار هنری جدید هم اکنون برای دانلود در دسترس است ! AKVIS خوشحال است که انتشار AKVIS Draw را اعلام می کند .

ALIGNMENT:
2-0 2-1 1-2 0-3 4-4 4-5 6-6 8-7 5-8 5-9 9-10 8-11 8-12 10-13 9-14 10-15 12-16 16-17 16-18 14-19 15-20 14-22 15-23

--------------------------------------------------
SENTENCE:
However I believe that the ||| با این حال من معتقدم که

ALIGNMENT:
0-0 0-1 0-2 1-3 2-4 3-5

--------------------------------------------------
SENTENCE:
Rochester (New York) 585521 **** Phone ||| روچستر ( نیویورک ) 585521 **** تلفن

ALIGNMENT:
0-0 1-1 2-2 2-3 3-4 4-5 5-6

--------------------------------------------------
SENTENCE:
The book includes a full range of parts and Assembly. [Read More] ||| این کتاب شامل طیف کاملی از قطعات و مونتاژ است . [ ادامه مطلب ]

ALIGNMENT:
0-0 1-1 2-2 5-3 2-4 4-5 7-6 8-7 7-8 7-9 9-10 10-11 10-12 10-13 11-14

-------------------------------

In [33]:
def load_alignments(path):
    aligns = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            pairs = set(tuple(map(int, p.split("-"))) for p in line.strip().split())
            aligns.append(pairs)
    return aligns

forward = load_alignments("align_data/forward.align")
reverse = load_alignments("align_data/reverse.align")

intersection = [f & r for f, r in zip(forward, reverse)]

# Save intersection alignment
with open("align_data/intersection.align", "w", encoding="utf-8") as f:
    for sent in intersection:
        f.write(" ".join(f"{i}-{j}" for i, j in sorted(sent)) + "\n")

print("Intersection alignment saved.")


Intersection alignment saved.


In [34]:
!head -n 3 align_data/intersection.align


0-3 1-2 2-1 4-5 5-9 6-6 8-12 9-14 10-13 12-16 16-17
0-2 1-3 2-4 3-5
0-0 2-2 3-4 4-5 5-6


In [36]:
from sacremoses import MosesTokenizer

mt_fa = MosesTokenizer(lang="fa")

def tokenize_fa_file(in_path, out_path):
    with open(in_path, encoding="utf-8") as fin, \
         open(out_path, "w", encoding="utf-8") as fout:
        for line in fin:
            fout.write(" ".join(mt_fa.tokenize(line.strip())) + "\n")

for split in ["train", "valid", "test"]:
    tokenize_fa_file(
        f"smt_data/{split}.fa",
        f"smt_data/{split}.tok.fa"
    )

print("Persian tokenization done.")


Persian tokenization done.


In [37]:
print("BEFORE:")
print(open("smt_data/train.fa", encoding="utf-8").readline())

print("\nAFTER:")
print(open("smt_data/train.tok.fa", encoding="utf-8").readline())


BEFORE:
نرم افزار هنری جدید هم اکنون برای دانلود در دسترس است ! AKVIS خوشحال است که انتشار AKVIS Draw را اعلام می کند .


AFTER:
نرم افزار هنری جدید هم اکنون برای دانلود در دسترس است ! AKVIS خوشحال است که انتشار AKVIS Draw را اعلام می کند .



In [38]:
def load_lines(path):
    with open(path, encoding="utf-8") as f:
        return [line.strip().split() for line in f]

en_sents = load_lines("smt_data/train.tok.en")
fa_sents = load_lines("smt_data/train.tok.fa")

print("EN sentences:", len(en_sents))
print("FA sentences:", len(fa_sents))


EN sentences: 12347
FA sentences: 12347


In [39]:
def load_alignment_pairs(path):
    alignments = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            pairs = []
            for p in line.strip().split():
                i, j = p.split("-")
                pairs.append((int(i), int(j)))
            alignments.append(pairs)
    return alignments

alignments = load_alignment_pairs("align_data/intersection.align")

print("Alignments:", len(alignments))


Alignments: 12347


In [40]:
from collections import defaultdict

MAX_PHRASE_LEN = 7
phrase_counts = defaultdict(int)
en_counts = defaultdict(int)

for en, fa, aligns in zip(en_sents, fa_sents, alignments):
    if not aligns:
        continue

    for e_start in range(len(en)):
        for e_end in range(e_start, min(e_start + MAX_PHRASE_LEN, len(en))):
            aligned_f = [j for i, j in aligns if e_start <= i <= e_end]
            if not aligned_f:
                continue

            f_start, f_end = min(aligned_f), max(aligned_f)

            # alignment consistency check
            consistent = True
            for i, j in aligns:
                if f_start <= j <= f_end and not (e_start <= i <= e_end):
                    consistent = False
                    break
            if not consistent:
                continue

            en_phrase = " ".join(en[e_start:e_end + 1])
            fa_phrase = " ".join(fa[f_start:f_end + 1])

            phrase_counts[(en_phrase, fa_phrase)] += 1
            en_counts[en_phrase] += 1


In [41]:
phrase_table = []

for (en_p, fa_p), count in phrase_counts.items():
    prob = count / en_counts[en_p]
    phrase_table.append((en_p, fa_p, prob))

print("Total phrase pairs learned:", len(phrase_table))


Total phrase pairs learned: 681115


In [42]:
for en_p, fa_p, prob in phrase_table[:10]:
    print(f"{en_p} ||| {fa_p} ||| {prob:.3f}")


New ||| جدید ||| 0.209
New artistic ||| هنری جدید ||| 1.000
New artistic software ||| افزار هنری جدید ||| 1.000
New artistic software is ||| افزار هنری جدید ||| 1.000
New artistic software is now ||| افزار هنری جدید هم اکنون ||| 1.000
New artistic software is now available for ||| افزار هنری جدید هم اکنون برای دانلود در دسترس ||| 1.000
artistic ||| هنری ||| 0.500
artistic software ||| افزار هنری ||| 1.000
artistic software is ||| افزار هنری ||| 1.000
software ||| افزار ||| 0.161


In [43]:
from collections import defaultdict

phrase_dict = defaultdict(list)

for en_p, fa_p, prob in phrase_table:
    phrase_dict[en_p].append((fa_p, prob))


In [44]:
def decode_sentence(en_sentence, max_len=7):
    tokens = en_sentence.split()
    i = 0
    output = []

    while i < len(tokens):
        found = False

        for l in range(min(max_len, len(tokens) - i), 0, -1):
            phrase = " ".join(tokens[i:i+l])
            if phrase in phrase_dict:
                # pick highest probability translation
                fa_phrase = max(phrase_dict[phrase], key=lambda x: x[1])[0]
                output.append(fa_phrase)
                i += l
                found = True
                break

        if not found:
            # fallback: copy word
            output.append(tokens[i])
            i += 1

    return " ".join(output)


In [46]:
valid_en = [ex["en"] for ex in validation_set]
valid_fa = [ex["fa"] for ex in validation_set]


In [47]:
for i in range(5):
    en = valid_en[i]
    print("EN:", en)
    print("SMT:", decode_sentence(en))
    print("REF:", valid_fa[i])
    print("-" * 50)


EN: Guadalajara 331492 **** Phone
SMT: گوادالاخارا 331492 **** تلفن
REF: گوادالاخارا 331492 **** تلفن
--------------------------------------------------
EN: Okay, so this mind flamer thing...
SMT: Okay, خیلی این ذهن flamer thing...
REF: خیلی خب ، پس این ذهن گول زن
--------------------------------------------------
EN: And charged them that they should not make him known:
SMT: و هزینه که آن ها نباید . او را known:
REF: و به آن ها دستور داد که او را نشناسند :
--------------------------------------------------
EN: for upon every man and the cattle which be found in the field, and shall not be brought home, the hail shall come down upon them, and they shall die.
SMT: برای بر ملاقات و و که در را field, قوی نخواهد آوردند home, را تگرگ خواهد بر them, شنید die.
REF: زیرا بر هر انسان و چهارپایانی که در مزرعه یافت می شوند و به خانه باز نمی گردند ، تگرگ بر آن ها فرود می آید و خواهند مرد .
--------------------------------------------------
EN: outskirts of town, quiet location, meadowlands, right 

In [51]:
from IPython.testing import test
test_en = [ex["en"] for ex in test_set]
test_fa = [ex["fa"] for ex in test_set]


In [52]:
from sacrebleu import corpus_bleu

hypotheses = [decode_sentence(en) for en in test_en]
references = [[fa] for fa in test_fa]

bleu = corpus_bleu(hypotheses, references)
print("SMT EN→FA BLEU:", bleu.score)


SMT EN→FA BLEU: 5.050026728173538


In [49]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [55]:
from sacrebleu.metrics import CHRF, BLEU
from bert_score import score

# CHRF++
chrf_metric = CHRF(char_order=6, word_order=2)
chrf_score = chrf_metric.corpus_score(hypotheses, [test_fa])
print("SMT EN→FA chrF++:", chrf_score.score)

# BERTScore F1
P, R, F1 = score(hypotheses, test_fa, lang="fa", verbose=True)
print("SMT EN→FA BERTScore F1:", F1.mean().item())


SMT EN→FA chrF++: 23.727489705026855


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/63 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/32 [00:00<?, ?it/s]

done in 8.56 seconds, 233.55 sentences/sec
SMT EN→FA BERTScore F1: 0.748884916305542


In [54]:
!pip install bert-score


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13
