In [3]:
import findspark

# findspark has to be initialised before pyspark is imported below, so the
# import order in this cell is deliberate.
findspark.init()

from pyspark.sql import SparkSession

# Reuse the active session if one exists; otherwise create one named "MyApp"
# with a local master ("local[*]").
spark = (
    SparkSession.builder
    .appName("MyApp")
    .config("spark.master", "local[*]")
    .getOrCreate()
)


In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand, col, array, concat_ws, lit, monotonically_increasing_id

# Size of the synthetic client table and the pools the names are drawn from.
num_rows = 1000
first_names = ["John", "Jane", "Alice", "Bob", "Charlie", "David", "Emily", "Frank", "Grace", "Henry"]
last_names = ["Smith", "Johnson", "Williams", "Jones", "Brown", "Davis", "Miller", "Wilson", "Moore", "Taylor"]

# Fixed seed so that re-running the notebook regenerates the same names.
name_seed = 42


def _pick_random(values, seed):
    """Return a Column that selects one element of ``values`` at random per row.

    values: non-empty list of Python strings to sample from.
    seed:   integer seed passed to ``rand`` so the draw is repeatable.
    """
    pool = array(*[lit(value) for value in values])
    # rand() yields values in [0, 1), so scaling by len(values) and truncating
    # always gives a valid index, whatever the length of the list.
    index = (rand(seed) * len(values)).cast("int")
    # Indexing with [] rather than getItem(): passing a Column as the key to
    # getItem is deprecated in recent PySpark releases.
    return pool[index]


# spark.range() already provides a consecutive `id` column (0 .. num_rows - 1).
# It is used as client_id instead of monotonically_increasing_id(), whose
# values are unique but jump between partitions.
df = (
    spark.range(num_rows)
    .withColumnRenamed("id", "client_id")
    .withColumn("first_name", _pick_random(first_names, name_seed))
    # A different seed keeps the last-name draw independent of the first-name draw.
    .withColumn("last_name", _pick_random(last_names, name_seed + 1))
    .withColumn("full_name", concat_ws(" ", col("last_name"), col("first_name")))
    .select("client_id", "full_name")
)

df.show(10)

+---------+--------------+
|client_id|     full_name|
+---------+--------------+
|        0|     Smith Bob|
|        1|   Jones David|
|        2|   Brown Grace|
|        3|   Brown Frank|
|        4|  Miller David|
|        5|Miller Charlie|
|        6|Williams Henry|
|        7|   Brown Grace|
|        8|   Smith Emily|
|        9|   Smith Emily|
+---------+--------------+
only showing top 10 rows





In [13]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import Tokenizer, CountVectorizer
from pyspark.ml.feature import Normalizer, VectorAssembler


# Split every full name into a list of lower-cased tokens
# (e.g. "Smith Bob" -> [smith, bob]).
tokenizer = Tokenizer(
    inputCol="full_name",
    outputCol="fname_tokenized",
)
words_df = tokenizer.transform(df)

words_df.show(n=20)

+---------+--------------+-----------------+
|client_id|     full_name|  fname_tokenized|
+---------+--------------+-----------------+
|        0|     Smith Bob|     [smith, bob]|
|        1|   Jones David|   [jones, david]|
|        2|   Brown Grace|   [brown, grace]|
|        3|   Brown Frank|   [brown, frank]|
|        4|  Miller David|  [miller, david]|
|        5|Miller Charlie|[miller, charlie]|
|        6|Williams Henry|[williams, henry]|
|        7|   Brown Grace|   [brown, grace]|
|        8|   Smith Emily|   [smith, emily]|
|        9|   Smith Emily|   [smith, emily]|
|       10|   Brown Alice|   [brown, alice]|
|       11|  Wilson Grace|  [wilson, grace]|
|       12| Johnson Frank| [johnson, frank]|
|       13|  Miller Henry|  [miller, henry]|
|       14|   Moore Frank|   [moore, frank]|
|       15|Williams Grace|[williams, grace]|
|       16| Jones Charlie| [jones, charlie]|
|       17|   Davis Henry|   [davis, henry]|
|       18| Moore Charlie| [moore, charlie]|
|       19

In [15]:
# Encode each token list as a sparse term-frequency vector. The vocabulary is
# learned from the tokenized names themselves.
cv = CountVectorizer(
    inputCol="fname_tokenized",
    outputCol="fname_features",
)
cv_model = cv.fit(words_df)
features_df = cv_model.transform(words_df)

features_df.show(n=20)

+---------+--------------+-----------------+--------------------+
|client_id|     full_name|  fname_tokenized|      fname_features|
+---------+--------------+-----------------+--------------------+
|        0|     Smith Bob|     [smith, bob]|(20,[9,13],[1.0,1...|
|        1|   Jones David|   [jones, david]|(20,[0,8],[1.0,1.0])|
|        2|   Brown Grace|   [brown, grace]|(20,[6,17],[1.0,1...|
|        3|   Brown Frank|   [brown, frank]|(20,[1,6],[1.0,1.0])|
|        4|  Miller David|  [miller, david]|(20,[5,8],[1.0,1.0])|
|        5|Miller Charlie|[miller, charlie]|(20,[4,5],[1.0,1.0])|
|        6|Williams Henry|[williams, henry]|(20,[2,16],[1.0,1...|
|        7|   Brown Grace|   [brown, grace]|(20,[6,17],[1.0,1...|
|        8|   Smith Emily|   [smith, emily]|(20,[13,18],[1.0,...|
|        9|   Smith Emily|   [smith, emily]|(20,[13,18],[1.0,...|
|       10|   Brown Alice|   [brown, alice]|(20,[6,7],[1.0,1.0])|
|       11|  Wilson Grace|  [wilson, grace]|(20,[12,17],[1.0,...|
|       12

In [None]:
# Cluster the client names by token overlap.
#
# The previous draft of this cell called a `StringSimilarity` transformer that
# is neither imported nor part of pyspark.ml, and referred to columns ("text",
# "id") that do not exist in this notebook's DataFrames. It now works from the
# term-count vectors in `features_df` (column "fname_features").

# Scale every term-count vector to unit L2 length. For unit vectors the squared
# Euclidean distance equals 2 - 2 * cosine similarity, so k-means on these
# vectors groups names that share tokens.
normalizer = Normalizer(inputCol="fname_features", outputCol="features", p=2.0)
feature_df = normalizer.transform(features_df)

# Apply k-means clustering to the normalized vectors (seed fixed for repeatability).
kmeans = KMeans(k=2, seed=1, featuresCol="features")
model = kmeans.fit(feature_df)
predictions = model.transform(feature_df)

# Show the cluster number assigned to each client name.
predictions.select("client_id", "full_name", "prediction").show()