In [0]:
import urllib
import math
from sklearn.metrics import accuracy_score
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import col, lower, regexp_replace, length, trim, size
from pyspark.sql.types import StringType, ArrayType


In [0]:

dataUrl = 'https://archive.org/download/spam-text-mesg-dataset/SPAM_Text_Mesg_Dataset.csv'
temp_file_path = "/SPAM_Text_Mesg_Dataset.csv"
urllib.request.urlretrieve(dataUrl, temp_file_path)
dbutils.fs.mv(f"file:{temp_file_path}", f"dbfs:{temp_file_path}")


Out[191]: True

In [0]:
display(dataset)


Category,Message
real,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
real,Ok lar... Joking wif u oni...
spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
real,U dun say so early hor... U c already then say...
real,"Nah I don't think he goes to usf, he lives around here though"
spam,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"
real,Even my brother is not like to speak with me. They treat me like aids patent.
real,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030


In [0]:
# removing null data 
dataset = dataset.filter("Category is NOT NULL and Message is NOT NULL")
display(dataset)

Category,Message
real,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
real,Ok lar... Joking wif u oni...
spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
real,U dun say so early hor... U c already then say...
real,"Nah I don't think he goes to usf, he lives around here though"
spam,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"
real,Even my brother is not like to speak with me. They treat me like aids patent.
real,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030


In [0]:
# pre-processing 

#removing punctuation
cleanText = dataset.select('Category' , (lower(regexp_replace('Message', "[\[\]#./\"'_—=`<>@&!-:;|?*“”\t’‘]", "")).alias('Message')))
display(cleanText)


Category,Message
real,go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
real,ok lar joking wif u oni
spam,free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry questionstd txt ratetcs apply overs
real,u dun say so early hor u c already then say
real,nah i dont think he goes to usf he lives around here though
spam,freemsg hey there darling its been weeks now and no word back id like some fun you up for it still tb ok xxx std chgs to send £ to rcv
real,even my brother is not like to speak with me they treat me like aids patent
real,as per your request melle melle oru minnaminunginte nurungu vettam has been set as your callertune for all callers press to copy your friends callertune
spam,winner as a valued network customer you have been selected to receivea £ prize reward to claim call claim code kl valid hours only
spam,had your mobile months or more u r entitled to update to the latest colour mobiles with camera for free call the mobile update co free on


In [0]:
#tokenizing 
tokenizer = Tokenizer(inputCol='Message' , outputCol='MessageToken')
descTokenDF = tokenizer.transform(cleanText).select('Category' , 'MessageToken')
display(descTokenDF)

Category,MessageToken
real,"List(go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, cine, there, got, amore, wat)"
real,"List(ok, lar, joking, wif, u, oni)"
spam,"List(free, entry, in, , a, wkly, comp, to, win, fa, cup, final, tkts, st, may, , text, fa, to, , to, receive, entry, questionstd, txt, ratetcs, apply, overs)"
real,"List(u, dun, say, so, early, hor, u, c, already, then, say)"
real,"List(nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though)"
spam,"List(freemsg, hey, there, darling, its, been, , weeks, now, and, no, word, back, id, like, some, fun, you, up, for, it, still, tb, ok, xxx, std, chgs, to, send, £, to, rcv)"
real,"List(even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent)"
real,"List(as, per, your, request, melle, melle, oru, minnaminunginte, nurungu, vettam, has, been, set, as, your, callertune, for, all, callers, press, , to, copy, your, friends, callertune)"
spam,"List(winner, as, a, valued, network, customer, you, have, been, selected, to, receivea, £, prize, reward, to, claim, call, , claim, code, kl, valid, , hours, only)"
spam,"List(had, your, mobile, , months, or, more, u, r, entitled, to, update, to, the, latest, colour, mobiles, with, camera, for, free, call, the, mobile, update, co, free, on)"


In [0]:

#removing stopwords
stopWordRemover = StopWordsRemover(inputCol='MessageToken' , outputCol='Message')
textNoStopWords = stopWordRemover.transform(descTokenDF).select('Category' , 'Message')
display(textNoStopWords)


Category,Message
real,"List(go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat)"
real,"List(ok, lar, joking, wif, u, oni)"
spam,"List(free, entry, , wkly, comp, win, fa, cup, final, tkts, st, may, , text, fa, , receive, entry, questionstd, txt, ratetcs, apply, overs)"
real,"List(u, dun, say, early, hor, u, c, already, say)"
real,"List(nah, dont, think, goes, usf, lives, around, though)"
spam,"List(freemsg, hey, darling, , weeks, word, back, id, like, fun, still, tb, ok, xxx, std, chgs, send, £, rcv)"
real,"List(even, brother, like, speak, treat, like, aids, patent)"
real,"List(per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, press, , copy, friends, callertune)"
spam,"List(winner, valued, network, customer, selected, receivea, £, prize, reward, claim, call, , claim, code, kl, valid, , hours)"
spam,"List(mobile, , months, u, r, entitled, update, latest, colour, mobiles, camera, free, call, mobile, update, co, free)"


In [0]:
# Only keeping words with length > 2.

cleanDescriptionUdf = udf(lambda row: [x for x in row if len(x) > 2], ArrayType(StringType()))
cleanDescData = textNoStopWords.select('Category' , cleanDescriptionUdf(col('Message')).alias('Message') )

preparedData = cleanDescData.filter(size(col("Message")) > 2)

display(preparedData)

Category,Message
real,"List(jurong, point, crazy, available, bugis, great, world, buffet, cine, got, amore, wat)"
real,"List(lar, joking, wif, oni)"
spam,"List(free, entry, wkly, comp, win, cup, final, tkts, may, text, receive, entry, questionstd, txt, ratetcs, apply, overs)"
real,"List(dun, say, early, hor, already, say)"
real,"List(nah, dont, think, goes, usf, lives, around, though)"
spam,"List(freemsg, hey, darling, weeks, word, back, like, fun, still, xxx, std, chgs, send, rcv)"
real,"List(even, brother, like, speak, treat, like, aids, patent)"
real,"List(per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, press, copy, friends, callertune)"
spam,"List(winner, valued, network, customer, selected, receivea, prize, reward, claim, call, claim, code, valid, hours)"
spam,"List(mobile, months, entitled, update, latest, colour, mobiles, camera, free, call, mobile, update, free)"


In [0]:
# splitting the dataset into training data and test data

train, test = preparedData.randomSplit([0.8, 0.2])
display(train)

Category,Message
real,"List(aah, bless, hows, arm)"
real,"List(aah, cuddle, lush, need, lots, tea, soup, kind, fumbling)"
real,"List(abeg, make, profit, start, using, get, sponsors, next, event)"
real,"List(abt, making, pics, bigger)"
real,"List(accidentally, brought, home, box)"
real,"List(accidentally, deleted, message, resend, please)"
real,"List(account, refilled, successfully, inr, ltdecimalgt, keralacircle, prepaid, account, balance, ltdecimalgt, transaction, ltgt)"
real,"List(actor, work, work, evening, sleep, late, since, unemployed, moment, always, sleep, late, youre, unemployed, every, day, saturday)"
real,"List(actual, exam, harder, nbme)"
real,"List(actually, decided, hungry, havent, left, yet)"


In [0]:
# counting the no. of words in training dataset
trainDataWordCount = train.count()
trainDataWordCount

Out[199]: 3945

In [0]:
# calculating classifier probability 
classProbabilities = train.groupBy('Category').count().rdd.mapValues(lambda x: (x, x / trainDataWordCount)).collectAsMap()
classProbabilities

Out[200]: {'spam': (585, 0.1482889733840304), 'real': (3360, 0.8517110266159695)}

In [0]:
#calculating unique words 
wordsFeature = train.select('Message').rdd.flatMap(lambda x: x).map(lambda x: x[0]).distinct().count()
wordsFeature

Out[201]: 1348

In [0]:
# word count in each class
trainDataWords = train.rdd.flatMapValues(lambda x: x).map(lambda x: ((x[0], x[1]), 1)).reduceByKey(lambda x,y : x+y).collectAsMap()

In [0]:
# no. of distinct classifier
distinctClass = train.rdd.map(lambda x: x['Category']).distinct().collect()
distinctClass

Out[203]: ['real', 'spam']

In [0]:
# computing probability 

def computeProbabilities(words):
    condProb = {}
    for k in distinctClass:
        condProb[k] = math.log(classProbabilities[k][1])
        classfiers = classProbabilities[k][0]
        for w in words:
            wordCount = trainDataWords.get((k, w), 0)
            probScore = (wordCount + 1) / float(classfiers + wordsFeature)
            
            likelihood = math.log(probScore)
            condProb[k] += likelihood
    return max(condProb, key=condProb.get)



In [0]:
prediction = test.rdd.map(lambda x: computeProbabilities(x['Message'])).collect()


In [0]:
accurate = test.rdd.map(lambda x: x['Category']).collect()


In [0]:
print(f"Model Accuracy: {100 * accuracy_score(prediction, accurate)} %")


Model Accuracy: 96.55172413793103 %


In [0]:
for c, (wordCount, classProb) in classProbabilities.items():
  print(f"Prior probability for '{c}': {classProb}")

Prior probability for 'spam': 0.1482889733840304
Prior probability for 'real': 0.8517110266159695
