# **SPAM CLASSIFIER**

In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
#create SparkSession instance
from pyspark import SparkConf
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[5]').config('spark.driver.memory','16g').appName('sentanaly').getOrCreate()

23/02/17 08:50:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


#### Loading Dataset

In [3]:
# Load data and rename column
df = spark.read.option("header", "false") \
    .option("delimiter", "\t") \
    .option("inferSchema", "true") \
    .csv("SMSSpamCollection.txt") \
    .withColumnRenamed("_c0", "label_string") \
    .withColumnRenamed("_c1", "sms")

df.limit(10).show()

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

+------------+--------------------+
|label_string|                 sms|
+------------+--------------------+
|         ham|Go until jurong p...|
|         ham|Ok lar... Joking ...|
|        spam|Free entry in 2 a...|
|         ham|U dun say so earl...|
|         ham|Nah I don't think...|
|        spam|FreeMsg Hey there...|
|         ham|Even my brother i...|
|         ham|As per your reque...|
|        spam|WINNER!! As a val...|
|        spam|Had your mobile 1...|
+------------+--------------------+



In [4]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import NaiveBayes

#### Pipeline Stages

In [5]:
stages = []

In [6]:
# 1. clean data and tokenize sentences using RegexTokenizer
regexTokenizer = RegexTokenizer(inputCol="sms", outputCol="tokens", pattern="\\W+")
stages += [regexTokenizer]

In [7]:
# 2. CountVectorize the data
cv = CountVectorizer(inputCol="tokens", outputCol="token_features", minDF=2.0)#, vocabSize=3, minDF=2.0
stages += [cv]

In [8]:
# 3. Convert the labels to numerical values using binariser
indexer = StringIndexer(inputCol="label_string", outputCol="label")
stages += [indexer]

In [9]:
# 4. Vectorise features using vectorassembler
vecAssembler = VectorAssembler(inputCols=['token_features'], outputCol="features")
stages += [vecAssembler]

In [10]:
[print('\n', stage) for stage in stages]


 RegexTokenizer_3cec82b0c80a

 CountVectorizer_5b09b438d5bb

 StringIndexer_a4246c6045f3

 VectorAssembler_9901d7d71431


[None, None, None, None]

#### Fit Pipeline

In [11]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=stages)
data = pipeline.fit(df).transform(df)

                                                                                

#### Split dataset into train and test

In [12]:
train, test = data.randomSplit([0.7, 0.3], seed = 2018)

In [13]:
train.show()

[Stage 9:>                                                          (0 + 1) / 1]

+------------+--------------------+--------------------+--------------------+-----+--------------------+
|label_string|                 sms|              tokens|      token_features|label|            features|
+------------+--------------------+--------------------+--------------------+-----+--------------------+
|         ham| &lt;#&gt;  mins ...|[lt, gt, mins, bu...|(4249,[0,1,26,43,...|  0.0|(4249,[0,1,26,43,...|
|         ham| &lt;DECIMAL&gt; ...|[lt, decimal, gt,...|(4249,[0,1,3,11,2...|  0.0|(4249,[0,1,3,11,2...|
|         ham| and  picking the...|[and, picking, th...|(4249,[6,46,50,17...|  0.0|(4249,[6,46,50,17...|
|         ham| came to look at ...|[came, to, look, ...|(4249,[1,4,7,8,32...|  0.0|(4249,[1,4,7,8,32...|
|         ham| gonna let me kno...|[gonna, let, me, ...|(4249,[8,9,16,50,...|  0.0|(4249,[8,9,16,50,...|
|         ham| said kiss, kiss,...|[said, kiss, kiss...|(4249,[0,1,3,4,8,...|  0.0|(4249,[0,1,3,4,8,...|
|         ham| says that he's q...|[says, that, he, ...

                                                                                

#### Random Forest Classifier

In [14]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train)

                                                                                

#### Prediction

In [15]:
predictionsrf = rfModel.transform(test)
# Select results to view
predictionsrf.limit(10).select("label", "prediction", "probability").show(truncate=False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|0.0  |0.0       |[0.916473728664833,0.08352627133516687] |
|0.0  |0.0       |[0.916473728664833,0.08352627133516687] |
|0.0  |0.0       |[0.8552216406698138,0.14477835933018618]|
|0.0  |0.0       |[0.8961477594764048,0.1038522405235951] |
|0.0  |0.0       |[0.916473728664833,0.08352627133516687] |
|0.0  |0.0       |[0.916473728664833,0.08352627133516687] |
|0.0  |0.0       |[0.916473728664833,0.08352627133516687] |
|0.0  |0.0       |[0.8806433524555836,0.11935664754441647]|
|0.0  |0.0       |[0.9218105736883985,0.0781894263116016] |
|0.0  |0.0       |[0.916473728664833,0.08352627133516687] |
+-----+----------+----------------------------------------+



#### Model Evaluation

In [16]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluatorrf = BinaryClassificationEvaluator(labelCol="label",rawPredictionCol="prediction")
accuracyrf = evaluatorrf.evaluate(predictionsrf)
print ("Test Area Under ROC: ", accuracyrf)

Test Area Under ROC:  0.5140845070422535


#### Naive Bayes Implementation

In [17]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
model = nb.fit(train)

                                                                                

#### Prediction

In [18]:
predictions = model.transform(test)
# Select results to view
predictions.limit(10).select("label", "prediction", "probability").show(truncate=False)

+-----+----------+------------------------------------------+
|label|prediction|probability                               |
+-----+----------+------------------------------------------+
|0.0  |0.0       |[0.9999996176179956,3.823820044882337E-7] |
|0.0  |0.0       |[0.9972054995602091,0.002794500439790882] |
|0.0  |0.0       |[0.9999999999978098,2.190326444063966E-12]|
|0.0  |0.0       |[0.9999999999999538,4.607804951342392E-14]|
|0.0  |0.0       |[0.999999999880886,1.1911406870203127E-10]|
|0.0  |0.0       |[0.999688852925206,3.1114707479388615E-4] |
|0.0  |0.0       |[0.9999999098737272,9.012627286140461E-8] |
|0.0  |0.0       |[0.9999950690131734,4.930986826665776E-6] |
|0.0  |0.0       |[0.9999795625725587,2.043742744135259E-5] |
|0.0  |0.0       |[0.9999063364041348,9.366359586510845E-5] |
+-----+----------+------------------------------------------+



#### Model Evaluation

In [19]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print ("Test Area Under ROC: ", accuracy)

Test Area Under ROC:  0.972052252090383


In [20]:
spark.stop()