In [80]:
!pip install pyspark

In [81]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("NLP")
    .getOrCreate()
)

In [82]:
# Read the tab-separated SMS corpus and let Spark infer the column types.
data = (
    spark.read
    .option("sep", "\t")
    .option("inferSchema", True)
    .csv("/kaggle/input/pyspark-ml-nlp/SMSSpamCollection")
)

# Peek at the first rows without truncating the message text.
data.show(n = 5, truncate = False)

In [83]:
# Replace the default column names (_c0, _c1) with descriptive ones.
data = (
    data
    .withColumnRenamed("_c0", "class")
    .withColumnRenamed("_c1", "text")
)

data.show(n = 3, truncate = False)

In [84]:
from pyspark.sql.functions import length

# Add the length of each message as an extra numeric column.
message_length = length(data["text"])
data = data.withColumn("length", message_length)

data.show()

In [85]:
# Compare the average of the numeric columns (here: message length) per class.
class_means = data.groupBy("class").mean()
class_means.show()

In [86]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.feature import StringIndexer

from pyspark.ml.feature import StandardScaler

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [87]:
# --- Text -> tokens ---------------------------------------------------------
# Split each message into tokens.
tokenizer = Tokenizer(inputCol = "text", outputCol = "token_text")

# Drop stop words from the token list.
remover = StopWordsRemover(inputCol = "token_text", outputCol = "stop_tokens")

# --- Tokens -> TF-IDF vector ------------------------------------------------
# Term counts per message.
cv = CountVectorizer(inputCol = "stop_tokens", outputCol = "count_vec")

# Re-weight the term counts by inverse document frequency.
idf = IDF(inputCol = "count_vec", outputCol = "tf_idf")

# --- Label ------------------------------------------------------------------
# Encode the string class as a numeric label column.
class_indexer = StringIndexer(inputCol = "class", outputCol = "label")

# --- Final feature vector ---------------------------------------------------
# Concatenate the TF-IDF vector with the message length.
assembler = VectorAssembler(
    inputCols = ["tf_idf", "length"],
    outputCol = "features",
)

# Scale by standard deviation only; the mean is not subtracted (withMean = False).
scaler = StandardScaler(
    inputCol = "features",
    outputCol = "scaled_features",
    withStd = True,
    withMean = False,
)

### NaiveBayes

In [88]:
from pyspark.ml.classification import NaiveBayes

# Naive Bayes classifier that reads the scaled feature vector and the indexed
# label, and writes its output to the "prediction" column.
nb = NaiveBayes(
    featuresCol = "scaled_features",
    labelCol = "label",
    predictionCol = "prediction",
)

### Pipeline

In [89]:
from pyspark.ml import Pipeline

# Chain the label indexer and all feature stages, in execution order.
data_pipeline = Pipeline(stages = [
    class_indexer,
    tokenizer,
    remover,
    cv,
    idf,
    assembler,
    scaler,
])

# Fit every stage on `data`, then apply the fitted stages to the same rows.
fitted_pipeline = data_pipeline.fit(data)
df = fitted_pipeline.transform(data)

df.show()

In [90]:
# Keep only the two columns the classifier needs.
df = df.select(["label", "scaled_features"])

df.show()

In [91]:
# Split the raw rows BEFORE fitting the feature pipeline. Fitting the pipeline
# on the whole dataset and splitting afterwards lets the vocabulary, the IDF
# weights and the scaling statistics see the test rows, which leaks test
# information into training and makes the reported score optimistic.
train_raw, test_raw = data.randomSplit(weights = [0.7, 0.3], seed = 42)

# Learn every feature stage from the training rows only ...
train_fitted_pipeline = data_pipeline.fit(train_raw)

# ... then apply those same fitted stages to both splits, keeping just the
# columns the classifier consumes.
model_columns = ["label", "scaled_features"]
train = train_fitted_pipeline.transform(train_raw).select(*model_columns)
test = train_fitted_pipeline.transform(test_raw).select(*model_columns)

In [92]:
# Train the classifier on the training split.
model = nb.fit(train)

# Score the held-out split and inspect the predictions.
y_hat = model.transform(test)
y_hat.show()

In [93]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate the predictions in `y_hat` with the accuracy metric.
evaluator = MulticlassClassificationEvaluator(
    metricName = "accuracy",
    labelCol = "label",
    predictionCol = "prediction",
)

accuracy = evaluator.evaluate(y_hat)

accuracy

In [None]:
################################################################################################################################