In [4]:
import pyspark
import os
import sys
from pyspark import SparkContext
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
from pyspark.sql import SparkSession

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, trim, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover

spark = SparkSession.builder \
    .appName("EntityResolutionPreprocessing").master("local[*]").getOrCreate()

df=spark.read.csv('sample_entities.csv',header=True,inferSchema=True)

print("Original DataFrame:")
df.show()

# Step 1: Data Cleaning
df_cleaned = df.withColumn("cleaned_name", lower(col("name"))) \
               .withColumn("cleaned_name", regexp_replace(col("cleaned_name"), "[^a-zA-Z0-9\\s]", "")) \
               .withColumn("cleaned_name", trim(col("cleaned_name")))

print("Cleaned DataFrame:")
df_cleaned.show()

# Step 2: Tokenization
tokenizer = Tokenizer(inputCol="cleaned_name", outputCol="tokens")
df_tokenized = tokenizer.transform(df_cleaned)

print("Tokenized DataFrame:")
df_tokenized.show(truncate=False)

# Step 3: Remove Stop Words
stopwords_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
df_filtered = stopwords_remover.transform(df_tokenized)

print("Filtered Tokens DataFrame:")
df_filtered.show(truncate=False)

# Step 4: Normalization
df_normalized = df_filtered.withColumn("normalized_name", concat_ws(" ", col("filtered_tokens")))

print("Normalized DataFrame:")
df_normalized.show(truncate=False)

# Stop the Spark session
spark.stop()




Original DataFrame:
+-------------+
|         name|
+-------------+
|     John Doe|
|     john doe|
|       J. Doe|
|Johnathan Doe|
|      Jon Doe|
|     Jhon Doe|
| Mr. John Doe|
|   Jane Smith|
|     J. Smith|
|Jane A. Smith|
|  Smith, Jane|
|   J.A. Smith|
|    Mr. Smith|
|  Janes Smyth|
|  Jane Smythe|
+-------------+

Cleaned DataFrame:
+-------------+-------------+
|         name| cleaned_name|
+-------------+-------------+
|     John Doe|     john doe|
|     john doe|     john doe|
|       J. Doe|        j doe|
|Johnathan Doe|johnathan doe|
|      Jon Doe|      jon doe|
|     Jhon Doe|     jhon doe|
| Mr. John Doe|  mr john doe|
|   Jane Smith|   jane smith|
|     J. Smith|      j smith|
|Jane A. Smith| jane a smith|
|  Smith, Jane|   smith jane|
|   J.A. Smith|     ja smith|
|    Mr. Smith|     mr smith|
|  Janes Smyth|  janes smyth|
|  Jane Smythe|  jane smythe|
+-------------+-------------+

Tokenized DataFrame:
+-------------+-------------+----------------+
|name         |cl