# Spam or Ham message classifier
<img src="https://blog.codecentric.de/files/2016/06/ham-vs-spam.png">

In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
sc = SparkContext('local','example')
sqlContext = SQLContext(sc)

In [2]:
rdd_wheader = sc.textFile("data/spam.csv")

In [3]:
rdd_wheader.take(5)

['v1,v2,,,',
 'ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,',
 'ham,Ok lar... Joking wif u oni...,,,',
 "spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,,,",
 'ham,U dun say so early hor... U c already then say...,,,']

In [4]:
# removing the head
header = rdd_wheader.first()

In [5]:
rdd = rdd_wheader.filter(lambda row: row!=header)
rdd.take(5)

['ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,',
 'ham,Ok lar... Joking wif u oni...,,,',
 "spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,,,",
 'ham,U dun say so early hor... U c already then say...,,,',
 'ham,"Nah I don\'t think he goes to usf, he lives around here though",,,']

In [6]:
separated = rdd.map(lambda row: [row.split(",")[0]]+[''.join(row.split(",")[1:])])
separated.take(5)

[['ham',
  '"Go until jurong point crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."'],
 ['ham', 'Ok lar... Joking wif u oni...'],
 ['spam',
  "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"],
 ['ham', 'U dun say so early hor... U c already then say...'],
 ['ham', '"Nah I don\'t think he goes to usf he lives around here though"']]

In [7]:
rdd_with_ids = separated.zipWithIndex()
rdd_with_ids.take(5)

[(['ham',
   '"Go until jurong point crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."'],
  0),
 (['ham', 'Ok lar... Joking wif u oni...'], 1),
 (['spam',
   "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"],
  2),
 (['ham', 'U dun say so early hor... U c already then say...'], 3),
 (['ham', '"Nah I don\'t think he goes to usf he lives around here though"'],
  4)]

In [8]:
import re
labels = lambda label: 1 if "ham" in label else 0
cleaned_rdd = rdd_with_ids.map(lambda row: (row[1],labels(row[0][0]), " ".join(re.findall("[a-zA-Z]+", row[0][1])).lower()))
cleaned_rdd.take(5)

[(0,
  1,
  'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'),
 (1, 1, 'ok lar joking wif u oni'),
 (2,
  0,
  'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s'),
 (3, 1, 'u dun say so early hor u c already then say'),
 (4, 1, 'nah i don t think he goes to usf he lives around here though')]

In [9]:
df = cleaned_rdd.toDF(["id","label","message"])
df.show(5)

+---+-----+--------------------+
| id|label|             message|
+---+-----+--------------------+
|  0|    1|go until jurong p...|
|  1|    1|ok lar joking wif...|
|  2|    0|free entry in a w...|
|  3|    1|u dun say so earl...|
|  4|    1|nah i don t think...|
+---+-----+--------------------+
only showing top 5 rows



In [10]:
df.count()

5574

In [11]:
stopwords = open("data/stopwords_en.txt","r").read().split("\n")
stopwords[:10]

['a',
 "a's",
 'able',
 'about',
 'above',
 'according',
 'accordingly',
 'across',
 'actually',
 'after']

# Train - Validation - Test
<img src="https://blog.codecentric.de/files/2016/06/train-vs-test-768x363.png">

In [12]:
train, test = df.randomSplit([0.9, 0.1], seed=48)

# Cross Validation
<img src="https://memegenerator.net/img/instances/65014146.jpg" width=400>
<img src="https://i.stack.imgur.com/1fXzJ.png">
# Pipeline
<img src="http://www.bbc.co.uk/staticarchive/6a2edaa3e3a9107131e33af1a7a2707860c32933.jpg" width=500>
<img src="https://databricks.com/wp-content/uploads/2015/01/pipeline-1.png">

In [13]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, StopWordsRemover
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Defining the pipeline
tokenizer = Tokenizer(inputCol='message', outputCol='Words')
removeStopWords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='filtered_words', stopWords=stopwords)
hashingTF = HashingTF(inputCol=removeStopWords.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, removeStopWords, hashingTF, lr])
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()
    
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=5)

In [14]:
import time
start = time.time()
model = crossval.fit(train)
print("Took:", time.time()-start)

Took: 78.88309955596924


In [15]:
prediction = model.transform(test)

In [16]:
prediction.select(["label","prediction"]).show(5)

+-----+----------+
|label|prediction|
+-----+----------+
|    1|       1.0|
|    0|       0.0|
|    1|       1.0|
|    1|       1.0|
|    1|       1.0|
+-----+----------+
only showing top 5 rows



### Honorable mention:
https://blog.codecentric.de/en/2016/06/spam-classification-using-sparks-dataframes-ml-zeppelin-part-1/

# Evaluation
<img src="metrics.PNG" width=800>

In [17]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.evaluation import RegressionEvaluator

## Area under ROC curve
Receiver Operating Characteristic
<img src="roc.png" width=600>

# Area under PR curve
<img src="PR.png" width=600>

In [18]:
accuracy = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="label", metricName="accuracy")
f1 = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="label", metricName="f1")
rmse = RegressionEvaluator(predictionCol="prediction",labelCol="label",metricName="rmse")
roc = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="label", metricName="areaUnderROC")
prc = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="label", metricName="areaUnderPR")

In [19]:
print("accuracy:", accuracy.evaluate(prediction))
print("f1:", f1.evaluate(prediction))
print("RMSE:", rmse.evaluate(prediction))
print("Area Under ROC Curve:", roc.evaluate(prediction))
print("Area Under PR Curve:", prc.evaluate(prediction))

accuracy: 0.962432915921288
f1: 0.960079062261531
RMSE: 0.19382230026163652
Area Under ROC Curve: 0.8719512195121951
Area Under PR Curve: 0.9578313253012049


<img src="https://img00.deviantart.net/0248/i/2013/295/d/8/that_s_all_folks__by_surrimugge-d6rfav1.png">