# Step1: Install Librarys

In [1]:
!pip install pyspark



In [2]:
!pip install wikipedia



# Step2: Import Librarys

In [9]:
import pyspark
import wikipedia
import numpy as np
import re
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number
# import logging
# logging.basicConfig(level=logging.DEBUG)


# Step3: Download Dataset

In [11]:
dataset = wikipedia.page('Python (programming language)').content
dataset2 = ["to be or not to be this is the problem"]

# Step4: Preprocessing Dataset

In [12]:
dataset = re.sub(r'[\n\t]', ' ', dataset)
dataset = re.sub(r'\s+', ' ', dataset.strip())

dataset = dataset.split()
dataset = list(set(dataset))

# Step5: Config Spark

In [6]:
# sc = pyspark.SparkContext('local[*]')
spark = SparkSession.builder.appName("EditDistanceApp").getOrCreate()

# Step6: Edit Distance method

In [7]:
def edit_distance(word1, word2):
    num_rows = len(word1)
    num_cols = len(word2)

    matrix = np.empty((num_rows + 2, num_cols + 2), dtype=object)

    for i in range(num_rows + 2):
        for j in range(num_cols + 2):
            if i == 0 and j == 0:
                matrix[i, j] = '#'
            elif i == 0 and j == 1:
                matrix[i, j] = '_'
            elif i == 1 and j == 0:
                matrix[i, j] = '_'
            elif i == 0:
                matrix[i, j] = word2[j - 2] # row word
            elif j == 0:
                matrix[i, j] = word1[i - 2] # col word
            elif i == 1:
                matrix[i, j] = str(j - 1)
            elif i != 1 and j == 1:
                matrix[i, j] = str(i - 1)
            else:
                first = (int(matrix[i-1, j])+1)
                second = (int(matrix[i, j-1])+1)
                if matrix[i, 0] == matrix[0, j]:
                    third = (int(matrix[i-1, j-1]))
                else :
                    third = (int(matrix[i-1, j-1])+2)
                matrix[i, j] = str(min(first, second, third))
    return matrix[num_rows + 1, num_cols + 1]

print(edit_distance("INTENTION", "EXECUTION"))


8


# Step7: Calculate Edit distance for all words

In [14]:
dataset2 = ["Ahmad", "Asghar", "Akbar", "Ehsan", "Mohsen"]

rdd = spark.sparkContext.parallelize(dataset)

word_pairs = rdd.cartesian(rdd).filter(lambda x: x[0]!= x[1])\
                               .map(lambda x: (x[0], x[1], edit_distance(x[0], x[1])))

df = word_pairs.toDF(["Word1", "Word2", "EditDistance"])

# df = df.toPandas()

window_spec = Window.partitionBy("Word1").orderBy("EditDistance")

df_with_row_num = df.withColumn("row_num", row_number().over(window_spec))

result_df = df_with_row_num.filter(col("row_num") == 1).drop("row_num")

result_df.show()

+-----------------+----------+------------+
|            Word1|     Word2|EditDistance|
+-----------------+----------+------------+
|              "2"| "spam={0}|          10|
|        "2.7.18+"|         ?|          10|
|            "22".|     state|          10|
|      "@-quoting"|       ten|          10|
|              "AI|        AI|           1|
|"BDFL-emeritus").|      emit|          13|
|          "Hello,|   largely|          10|
|               "I|         I|           1|
|          "PyAIML|       'u'|          10|
|          "Python|    Python|           1|
|       "Pythonic"|        of|          10|
|   "Pythonistas".|  CPython,|          10|
|             "The|       The|           1|
|              "To| "spam={0}|          10|
|         "adding"|      e.g.|          10|
|              "as|        as|           1|
|      "backported|backported|           1|
|           "bar".|    blocks|          10|
|       "batteries|    Server|          10|
|      "benevolent|   violate|  