# Step1: Install Librarys

In [1]:
!pip install pyspark



In [2]:
!pip install datasets



# Step2: Import Librarys

In [3]:
import pyspark
from datasets import load_dataset
import pandas as pd
from collections import Counter
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, collect_list, concat_ws, udf
from pyspark.sql.types import StringType, ArrayType, MapType

# Step3: Download Dataset

In [4]:
dataset = load_dataset("persiannlp/parsinlu_translation_en_fa")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Step4: Preprocessing Dataset

In [5]:
train_data = dataset['train']
test_data = dataset['test']
validation_data = dataset['validation']

#Step5: Spark Config

In [6]:
spark = SparkSession.builder.appName("TranslationWordCount")\
                            .config("spark.driver.memory", "10g")\
                            .config("spark.driver.maxResultSize", "10g")\
                            .getOrCreate()

# Step6: Convert Dataset to Spark DF

In [7]:
def dataset_to_spark_df(dataset):
    data = []
    for item in dataset:
        source = item['source']
        target = item['targets'][0]
        data.append((source, target))
    return spark.createDataFrame(data, ["English", "Persian"])

In [8]:
df_train = dataset_to_spark_df(train_data)
df_test = dataset_to_spark_df(test_data)
df_validation = dataset_to_spark_df(validation_data)

# Step7: Merge Datasets

In [9]:
df_combined = df_train.union(df_test).union(df_validation)

# Step8: Split English Words

In [10]:
df_words = df_combined.withColumn("EnglishWord", explode(split(col("English"), " ")))
df_grouped = df_words.groupBy("EnglishWord").agg(collect_list("Persian").alias("Meanings"))
df_grouped = df_grouped.withColumn("Meanings", concat_ws(" ", col("Meanings")))

# Step9: Calculate top 10 Meaning words

In [11]:
def split_and_count(text):
    from collections import Counter
    words = text.split()
    counter = Counter(words)
    total_words = sum(counter.values())
    top_10_words = sorted(counter.items(), key=lambda item: item[1] / total_words, reverse=True)[:10]
    return dict(top_10_words)

split_and_count_udf = udf(split_and_count, MapType(StringType(), StringType()))

df_grouped = df_grouped.withColumn("TopMeaningCounts", split_and_count_udf(col("Meanings")))

# Step10: Show Result

In [16]:
pd.set_option('display.max_rows', None)

In [25]:
df_English_Dictionary = df_grouped.limit(100000).toPandas()

In [27]:
df_English_Dictionary[:1000]

Unnamed: 0,EnglishWord,Meanings,TopMeaningCounts
0,!m,به او کمک کن تا بتواند از ما حمایت کند! پروردگ...,"{'کند!': '1', 'او': '1', 'ما': '1', 'کمک': '1'..."
1,"""'Nor",و به جای خدا چیزی را که سودی به تو نمی‌رساند و...,"{'چیزی': '1', 'خدا': '1', 'را': '1', 'جای': '1..."
2,"""'Seeing",در حالی که شما را مرحله به مرحله [خاک، نطفه، ع...,"{'در': '1', 'حالی': '1', 'مرحله': '2', 'را': '..."
3,"""(Come)",قومش با او به گفتگوی بی منطق و ستیز برخاستند، ...,"{'آنکه': '2', 'گفتگوی': '2', 'من': '2', 'او': ..."
4,"""(One",که از من و خاندان یعقوب ارث ببرد، و او را پرور...,"{'او': '1', 'یعقوب': '1', 'من': '1', 'را': '1'..."
5,"""(these",[مشرکان] گفتند: [نه، قرآن سحر نیست] بلکه خواب ...,"{'[نه]': '2', '[نه،': '1', 'هایی': '2', 'قرآن'..."
6,"""Aaron,",هارون، برادرم را,"{'برادرم': '1', 'هارون،': '1', 'را': '1'}"
7,"""Alas",[همسر ابراهیم] گفت: ای وای بر من! آیا فرزند آو...,"{'کسانی': '2', 'من': '2', '[همسر': '1', 'در': ..."
8,"""An","""یک روز معمولی در کیف.","{'کیف.': '1', 'در': '1', 'روز': '1', 'معمولی':..."
9,"""Art",گفتند: شگفتا! آیا تو خود یوسفی؟! گفت: من یوسفم...,"{'شگفتا!': '1', 'خدا': '2', 'من': '2', 'ما': '..."


# Step11: Calulate Persian Dictionary

In [28]:
df_words2 = df_combined.withColumn("PersianWord", explode(split(col("Persian"), " ")))
df_grouped2 = df_words2.groupBy("PersianWord").agg(collect_list("English").alias("Meanings"))
df_grouped2 = df_grouped2.withColumn("Meanings", concat_ws(" ", col("Meanings")))

def split_and_count(text):
    from collections import Counter
    words = text.split()
    counter = Counter(words)
    total_words = sum(counter.values())
    top_10_words = sorted(counter.items(), key=lambda item: item[1] / total_words, reverse=True)[:10]
    return dict(top_10_words)

split_and_count_udf = udf(split_and_count, MapType(StringType(), StringType()))

df_grouped2 = df_grouped2.withColumn("TopMeaningCounts", split_and_count_udf(col("Meanings")))

In [29]:
df_Persian_Dictionary = df_grouped2.limit(100000).toPandas()

In [30]:
df_Persian_Dictionary[:1000]

Unnamed: 0,PersianWord,Meanings,TopMeaningCounts
0,!!!يالا!,"come on! do it, do it, do it! come on!","{'it,': '2', 'come': '2', 'on!': '2', 'do': '3..."
1,!!البته,of course you will!,"{'will!': '1', 'course': '1', 'you': '1', 'of'..."
2,!!خون,blood!!,{'blood!!': '1'}
3,!،,red army! attack the white army's square in we...,"{'the': '1', 'red': '1', 'square': '1', 'white..."
4,!آزمون,exam ground for physician ladies only!,"{'exam': '1', 'for': '1', 'ladies': '1', 'only..."
5,!آنرا,i'll get the medicines to make his bathing wat...,"{'the': '1', 'bathing': '1', 'his': '1', 'medi..."
6,!آیا,aren't those astragalus sprouts?! did you... f...,"{'astragalus': '1', 'you...': '1', 'sprouts?!'..."
7,!اعلیحضرت!,your majesty! your majesty! your majesty! my l...,"{'lord!': '1', 'majesty!': '3', 'your': '3', '..."
8,!افسر!؟,"official?! yeah, right! it's an obvious trap!","{'yeah,': '1', 'trap!': '1', 'right!': '1', 'i..."
9,!الا.,ella. ella!,"{'ella!': '1', 'ella.': '1'}"
