# Spam Detection

In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 46 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 57.3 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=b1015365910138fdfe726ed11a507ac1d53c2a8d4ca5de85bdb043fc3847ba00
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


## Libraries

In [None]:
# import library
from pyspark.sql import SparkSession
from pyspark.sql.functions import length
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

## Spark SQL and Load Dataset

In [None]:
# Start (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName('spam')
    .getOrCreate()
)

# Read the tab-separated SMS file and let Spark infer the column types
data = spark.read.csv(path='SMSSpamCollection', sep='\t', inferSchema=True)
data.show(n=5)

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
+----+--------------------+
only showing top 5 rows



In [None]:
# Replace the auto-generated column names with meaningful ones
for oldName, newName in (('_c0', 'class'), ('_c1', 'text')):
    data = data.withColumnRenamed(oldName, newName)
data.show(n=5)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
+-----+--------------------+
only showing top 5 rows



## Clean and Prepare the Data

In [None]:
# Add the character count of each message as a numeric column
data = data.withColumn('length', length('text'))
data.show(n=5)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
+-----+--------------------+------+
only showing top 5 rows



In [None]:
# Average message length for each class
perClass = data.groupBy('class')
perClass.avg().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



In [None]:
# Summary statistics (count, mean, stddev, min, max) for every column
summaryStats = data.describe()
summaryStats.show()

+-------+-----+--------------------+-----------------+
|summary|class|                text|           length|
+-------+-----+--------------------+-----------------+
|  count| 5574|                5574|             5574|
|   mean| null|               645.0|80.46232508073197|
| stddev| null|                null|59.84156603055758|
|    min|  ham| &lt;#&gt;  in mc...|                2|
|    max| spam|… we r stayin her...|              910|
+-------+-----+--------------------+-----------------+



In [None]:
# Train/test split (80/20). A fixed seed keeps the split -- and therefore every
# downstream metric -- the same across "Restart & Run All".
trainData, testData = data.randomSplit([0.8, 0.2], seed=42)

## Feature Transformations

In [None]:
# Feature-engineering stages; they are only declared here, not yet applied.

# label: encode the string class as a numeric index
classNum = StringIndexer(inputCol='class', outputCol='label')

# text -> tokens -> tokens without stop words
tokenizer = Tokenizer(inputCol='text', outputCol='tokenText')
wordRemove = StopWordsRemover(inputCol='tokenText', outputCol='stopTokens')

# tokens -> term counts -> TF-IDF weights
countVec = CountVectorizer(inputCol='stopTokens', outputCol='vecTokens')
idf = IDF(inputCol='vecTokens', outputCol='tfidf')

# final feature vector: TF-IDF weights plus the message length
cleanData = VectorAssembler(
    inputCols=['tfidf', 'length'],
    outputCol='features',
)

In [None]:
# Chain the stages, in execution order, into a single pipeline
featureStages = [tokenizer, wordRemove, countVec, idf, cleanData, classNum]
dataPipe = Pipeline(stages=featureStages)

In [None]:
# Fit the pipeline on the training split only, so the test split plays no
# part in what the stages learn
cleaner = dataPipe.fit(trainData)

# Apply the fitted pipeline to both splits
trainDataClean, testDataClean = (
    cleaner.transform(split) for split in (trainData, testData)
)

In [None]:
# Peek at the transformed test split, including every intermediate column
testDataClean.show(n=5)

+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|class|                text|length|           tokenText|          stopTokens|           vecTokens|               tfidf|            features|label|
+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|  ham| &lt;#&gt;  in mc...|    36|[, &lt;#&gt;, , i...|[, &lt;#&gt;, , m...|(11572,[3,7],[2.0...|(11572,[3,7],[6.1...|(11573,[3,7,11572...|  0.0|
|  ham|"Happy valentines...|   147|["happy, valentin...|["happy, valentin...|(11572,[11,167,22...|(11572,[11,167,22...|(11573,[11,167,22...|  0.0|
|  ham|"SYMPTOMS" when U...|   139|["symptoms", when...|["symptoms", u, l...|(11572,[0,5,12,27...|(11572,[0,5,12,27...|(11573,[0,5,12,27...|  0.0|
|  ham|"Wen u miss someo...|   143|["wen, u, miss, s...|["wen, u, miss, s...|(11572,[0,82,223,...|(11572,[0,82,223,...

In [None]:
# Keep only the label and feature columns for modelling
modelCols = ['label', 'features']
trainDataSelect = trainDataClean.select(modelCols)
testDataSelect = testDataClean.select(modelCols)

## Model

In [None]:
# Naive Bayes classifier; the column names are spelled out but match the defaults
nb = NaiveBayes(featuresCol='features', labelCol='label')

## Training and Evaluation

In [None]:
# Train the spam detector on the prepared training split
spamDetector = nb.fit(dataset=trainDataSelect)

In [None]:
# Score the held-out test split with the trained model
predictions = spamDetector.transform(dataset=testDataSelect)

In [None]:
# Inspect the first few scored rows
predictions.show(n=10)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(11573,[3,7,11572...|[-78.672177973340...|[1.0,1.0125833742...|       0.0|
|  0.0|(11573,[11,167,22...|[-504.04238801110...|[1.0,1.2235424464...|       0.0|
|  0.0|(11573,[0,5,12,27...|[-669.11443021709...|[1.0,1.1279746716...|       0.0|
|  0.0|(11573,[0,82,223,...|[-435.57899339983...|[1.0,9.4192432598...|       0.0|
|  0.0|(11573,[0,3,7,8,1...|[-2117.6540435794...|[1.0,7.2327030285...|       0.0|
|  0.0|(11573,[14,126,33...|[-400.10981320850...|[1.0,1.9221419640...|       0.0|
|  0.0|(11573,[68,178,11...|[-80.366992987145...|[1.0,5.4142737576...|       0.0|
|  0.0|(11573,[178,2563,...|[-126.25807646239...|[1.0,3.8473953914...|       0.0|
|  0.0|(11573,[0,2,3,8,2...|[-1306.6092306065...|[1.0,1.1718136659...|       0.0|
|  0.0|(11573,[3

In [None]:
# Evaluate model performance on the test split.
# MulticlassClassificationEvaluator defaults to metricName="f1", so accuracy
# must be requested explicitly for the value printed as "Accuracy" to be accuracy.
accEval = MulticlassClassificationEvaluator(labelCol="label",
                                            predictionCol="prediction",
                                            metricName="accuracy")
# AUC is computed from the model's class-probability column.
aucEval = BinaryClassificationEvaluator(labelCol="label",
                                        rawPredictionCol="probability",
                                        metricName="areaUnderROC")
accuracy = accEval.evaluate(predictions)
aucScore = aucEval.evaluate(predictions)
print("Accuracy of model at predicting spam was: {}".format(accuracy))
print("AUC of model at predicting spam was: {}".format(aucScore))

Accuracy of model at predicting spam was: 0.9806167400881057
AUC of model at predicting spam was: 0.9824356355255263
