In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# End-to-end spam classifier: load the tab-separated SMS file, turn each
# message into a hashed term-frequency vector, train Naive Bayes on an 80/20
# split, then report accuracy and a sample of predictions.
spark = SparkSession.builder.appName("Spam Detection").getOrCreate()

try:
    # The file has no header row; name the two tab-separated columns explicitly.
    df = spark.read.csv("SmSSpamCollection", sep="\t", inferSchema=False).toDF("label", "message")

    # Discard rows with a null in any column, then rows whose message is empty.
    df = df.na.drop()
    df = df.filter(df.message != "")

    # Encode the string label as a numeric index column for the classifier.
    indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
    df = indexer.fit(df).transform(df)

    # Split each message into word tokens.
    tokenizer = Tokenizer(inputCol="message", outputCol="words")
    df = tokenizer.transform(df)

    # Hash the tokens into a fixed-width (10000) term-frequency vector.
    hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=10000)
    df = hashingTF.transform(df)

    # Fixed seed keeps the 80/20 split reproducible between runs.
    train, test = df.randomSplit([0.8, 0.2], seed=42)

    nb = NaiveBayes(featuresCol="features", labelCol="labelIndex")
    model = nb.fit(train)

    predictions = model.transform(test)

    # Accuracy is measured on the held-out split only.
    evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    predictions.select("label", "message", "prediction").show(10, truncate=False)
finally:
    # Always release the session, even when a stage above raises, so a failed
    # run does not leave a live Spark context behind.
    spark.stop()

Accuracy: 97.39%
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|label|message                                                                                                                                                   |prediction|
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|ham  | &lt;DECIMAL&gt; m but its not a common car here so its better to buy from china or asia. Or if i find it less expensive. I.ll holla                      |0.0       |
|ham  | said kiss, kiss, i can't do the sound effects! He is a gorgeous man isn't he! Kind of person who needs a smile to brighten his day!                      |0.0       |
|ham  | what number do u live at? Is it 11?                                                                      