### André Campos da Silva


### 08 de Janeiro, 2021

### Projeto -  Spam Classifier


Desenvolver um algoritmo em pyspark que seja capaz de determinar se uma mensagem é spam ou não, baseados em dados históricos. 

## Carregando pacotes

In [172]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import IDF
from pyspark.ml.feature import HashingTF, Tokenizer,StopWordsRemover
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import udf, col, lower, regexp_replace,trim
from nltk.stem.snowball import SnowballStemmer


## Carregando os Dados

In [39]:
RDDdataset = sc.textFile("Dados/sms_spam.csv")

In [40]:
type(dataset)

pyspark.rdd.PipelinedRDD

In [41]:
RDDdataset.take(10)

['type,text',
 'ham,Hope you are having a good week. Just checking in',
 'ham,K..give back my thanks.',
 'ham,Am also doing in cbe only. But have to pay.',
 'spam,"complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out! Box434SK38WP150PPM18+"',
 'spam,okmail: Dear Dave this is your final notice to collect your 4* Tenerife Holiday or #5000 CASH award! Call 09061743806 from landline. TCs SAE Box326 CW25WX 150ppm',
 'ham,Aiya we discuss later lar... Pick u up at 4 is it?',
 'ham,Are you this much buzy',
 'ham,Please ask mummy to call father',
 'spam,Marvel Mobile Play the official Ultimate Spider-man game (£4.50) on ur mobile right now. Text SPIDER to 83338 for the game & we ll send u a FREE 8Ball wallpaper']

## Tratamento dos dados

In [42]:
# Removo o cabeçalho 
cabecalho = RDDdataset.take(1)[0]
RDDdataset = RDDdataset.filter(lambda line: line !=cabecalho)

In [43]:
RDDdataset.take(10)

['ham,Hope you are having a good week. Just checking in',
 'ham,K..give back my thanks.',
 'ham,Am also doing in cbe only. But have to pay.',
 'spam,"complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out! Box434SK38WP150PPM18+"',
 'spam,okmail: Dear Dave this is your final notice to collect your 4* Tenerife Holiday or #5000 CASH award! Call 09061743806 from landline. TCs SAE Box326 CW25WX 150ppm',
 'ham,Aiya we discuss later lar... Pick u up at 4 is it?',
 'ham,Are you this much buzy',
 'ham,Please ask mummy to call father',
 'spam,Marvel Mobile Play the official Ultimate Spider-man game (£4.50) on ur mobile right now. Text SPIDER to 83338 for the game & we ll send u a FREE 8Ball wallpaper',
 'ham,"fyi I\'m at usf now, swing by the room whenever"']

In [62]:
# formula para dividir o RDD em colunas e ja converte a label de caracterer para int. 
def transformlabel(RDD):
    tolist = RDD.split(",")
    label = 0.0 if tolist[0] == "ham" else 1.0
    return [label, tolist[1]]

In [63]:
# Aplico a formula para o RDD 
RDDdataset2 = RDDdataset.map(transformlabel)
RDDdataset2.take(10)

[[0.0, 'Hope you are having a good week. Just checking in'],
 [0.0, 'K..give back my thanks.'],
 [0.0, 'Am also doing in cbe only. But have to pay.'],
 [1.0, '"complimentary 4 STAR Ibiza Holiday or £10'],
 [1.0,
  'okmail: Dear Dave this is your final notice to collect your 4* Tenerife Holiday or #5000 CASH award! Call 09061743806 from landline. TCs SAE Box326 CW25WX 150ppm'],
 [0.0, 'Aiya we discuss later lar... Pick u up at 4 is it?'],
 [0.0, 'Are you this much buzy'],
 [0.0, 'Please ask mummy to call father'],
 [1.0,
  'Marvel Mobile Play the official Ultimate Spider-man game (£4.50) on ur mobile right now. Text SPIDER to 83338 for the game & we ll send u a FREE 8Ball wallpaper'],
 [0.0, '"fyi I\'m at usf now']]

In [190]:
# Spark Session - Seção para usar a função de dataframe do spark
spSession = SparkSession.builder.master("local").appName("DSA-SparkMLLib").getOrCreate()

In [191]:
# Crio um dataframe do spark com os dados tratados acuma. 
df_spam = spSession.createDataFrame(RDDdataset2, ["label", "message"])


In [192]:
# Imprimo as primeiras 10 linhas do dataframe. 
df_spam.select("label", "message").show(20)

+-----+--------------------+
|label|             message|
+-----+--------------------+
|  0.0|Hope you are havi...|
|  0.0|K..give back my t...|
|  0.0|Am also doing in ...|
|  1.0|"complimentary 4 ...|
|  1.0|okmail: Dear Dave...|
|  0.0|Aiya we discuss l...|
|  0.0|Are you this much...|
|  0.0|Please ask mummy ...|
|  1.0|Marvel Mobile Pla...|
|  0.0| "fyi I'm at usf now|
|  0.0|"Sure thing big m...|
|  0.0|   I anything lor...|
|  0.0|    "By march ending|
|  0.0|           "Hmm well|
|  0.0|K I'll be sure to...|
|  0.0|Ha ha cool cool c...|
|  0.0|Darren was saying...|
|  0.0|He dint tell anyt...|
|  0.0|Up to u... u wan ...|
|  1.0|"U can WIN £100 o...|
+-----+--------------------+
only showing top 20 rows



In [193]:
# Converto todas as mesangem para minúsculo 
df_spam = df_spam.select('label', lower(col('message')).alias('message'))

# Tiro os espaços
df_spam = df_spam.select('label', trim(col('message')).alias('message'))

# Limpeza a mesangem, tirando pontuações, números e etc.
df_spam = df_spam.select('label',(regexp_replace('message','[^a-zA-Z\\s]', '')).alias('message'))

In [194]:
df_spam.select('*').show(20)

+-----+--------------------+
|label|             message|
+-----+--------------------+
|  0.0|hope you are havi...|
|  0.0|kgive back my thanks|
|  0.0|am also doing in ...|
|  1.0|complimentary  st...|
|  1.0|okmail dear dave ...|
|  0.0|aiya we discuss l...|
|  0.0|are you this much...|
|  0.0|please ask mummy ...|
|  1.0|marvel mobile pla...|
|  0.0|   fyi im at usf now|
|  0.0|sure thing big ma...|
|  0.0|      i anything lor|
|  0.0|     by march ending|
|  0.0|            hmm well|
|  0.0|k ill be sure to ...|
|  0.0|ha ha cool cool c...|
|  0.0|darren was saying...|
|  0.0|he dint tell anyt...|
|  0.0|up to u u wan com...|
|  1.0|u can win  of mus...|
+-----+--------------------+
only showing top 20 rows



In [185]:
# Tokenização da mesangem. 
tokenizer  = Tokenizer(inputCol='message', outputCol='messages_token')
df_spam_token = tokenizer.transform(df_spam).select('label', 'messages_token')

In [151]:
# Com a mensagem tokenizadas, realizamos a extração dos stopwords
stopwords_remove = StopWordsRemover(inputCol ='messages_token', outputCol = 'messages')
df_spam_none_stopwords = stopwords_remove.transform(df_spam_token).select('label', 'messages')

In [174]:
# Limpeza das Words stemmings
#stem = SnowballStemmer(language='english')
#stem_udf = udf(lambda tokens: [stem.stem(token) for token in tokens], ArrayType(StringType()))
#df_stem = stem_udf.transform(df_spam_none_stopwords)

NameError: name 'ArrayType' is not defined

In [175]:
df_spam = df_spam_none_stopwords

In [179]:
df_spam.select('messages').show(20)

+--------------------+
|            messages|
+--------------------+
|[hope, good, week...|
|[kgive, back, tha...|
|    [also, cbe, pay]|
|[complimentary, ,...|
|[okmail, dear, da...|
|[aiya, discuss, l...|
|        [much, buzy]|
|[please, ask, mum...|
|[marvel, mobile, ...|
|      [fyi, im, usf]|
|[sure, thing, big...|
|     [anything, lor]|
|     [march, ending]|
|         [hmm, well]|
|[k, ill, sure, ge...|
|[ha, ha, cool, co...|
|[darren, saying, ...|
|[dint, tell, anyt...|
|[u, u, wan, come,...|
|[u, win, , music,...|
+--------------------+
only showing top 20 rows



## Split dos dados treino e teste.

In [195]:
# Dados de Treino e de Teste
(data_train, data_test) = df_spam.randomSplit([0.7, 0.3])

In [196]:
data_train.select('*').show(5)

+-----+--------------------+
|label|             message|
+-----+--------------------+
|  0.0|                    |
|  0.0|                    |
|  0.0|                    |
|  0.0|                    |
|  0.0|  great loxahatch...|
+-----+--------------------+
only showing top 5 rows

