# Spark MLLib - Classificação - Naive Bayes

## Classificação de Spam

In [2]:
# Imports 
from pyspark.ml import Pipeline
from pyspark.ml.feature import IDF
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
# Spark Session - usado quando se trabalha com Dataframes no Spark
spSession = SparkSession.builder.master("local").appName("DSA-SparkMLLib").getOrCreate()

In [4]:
# Carregando os dados em um RDD
spamRDD = sc.textFile("data/SMSSpamCollection.csv", 2)

In [5]:
spamRDD.cache()

data/SMSSpamCollection.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [27]:
spamRDD.take(10)

['ham,Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...,,,,,,,,,',
 'ham,Ok lar... Joking wif u oni...,,,,,,,,,,',
 'ham,U dun say so early hor... U c already then say...,,,,,,,,,,',
 "ham,Nah I don't think he goes to usf, he lives around here though,,,,,,,,,",
 'ham,Even my brother is not like to speak with me. They treat me like aids patent.,,,,,,,,,,',
 "ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune,,,,,,,,,,",
 "ham,I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.,,,,,,,,,",
 "ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.,,,,,,,,,,",
 'ham,I HAVE A DATE ON SUNDAY WITH WILL!!,,,,,,,,,,',
 "ham

## Pré-Processamento dos Dados

In [8]:
def TransformToVector(inputStr):
    attList = inputStr.split(",")
    smsType = 0.0 if attList[0] == "ham" else 1.0
    return [smsType, attList[1]]

In [9]:
spamRDD2 = spamRDD.map(TransformToVector)
spamDF = spSession.createDataFrame(spamRDD2, ["label","message"])
spamDF.cache()
spamDF.select("label", "message").show()

+-----+--------------------+
|label|             message|
+-----+--------------------+
|  0.0|Go until jurong p...|
|  0.0|Ok lar... Joking ...|
|  0.0|U dun say so earl...|
|  0.0|Nah I don't think...|
|  0.0|Even my brother i...|
|  0.0|As per your reque...|
|  0.0|I'm gonna be home...|
|  0.0|I've been searchi...|
|  0.0|I HAVE A DATE ON ...|
|  0.0|Oh k...i'm watchi...|
|  0.0|Eh u remember how...|
|  0.0|Fine if thats th...|
|  0.0|Is that seriously...|
|  0.0|I‘m going to try ...|
|  0.0|So ü pay first la...|
|  0.0|Aft i finish my l...|
|  0.0|Ffffffffff. Alrig...|
|  0.0|Just forced mysel...|
|  0.0|Lol your always s...|
|  0.0|Did you catch the...|
+-----+--------------------+
only showing top 20 rows



## Machine Learning

In [10]:
# Dados de Treino e Teste
(dados_treino, dados_teste) = spamDF.randomSplit([0.7,0.3])

In [11]:
dados_treino.count()

700

In [12]:
dados_teste.count()

300

In [16]:
# Divisão em palavras e aplicação do TF-IDF
tokenizer = Tokenizer(inputCol="message", outputCol="words")
hashingTF = HashingTF(inputCol= tokenizer.getOutputCol(), outputCol="tempfeatures")
idf = IDF(inputCol= hashingTF.getOutputCol(), outputCol="features")
nbClassifier = NaiveBayes()

In [17]:
# Criand um Pipeline
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nbClassifier])

In [18]:
# Crinado e treinando o modelo como o Pipeline
modelo = pipeline.fit(dados_treino)

In [21]:
# Previsão nos dados de teste
previsoes = modelo.transform(dados_teste)
previsoes.select("prediction", "label").toPandas().head(10)

Unnamed: 0,prediction,label
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,1.0,0.0
4,1.0,0.0
5,0.0,0.0
6,0.0,0.0
7,0.0,0.0
8,0.0,0.0
9,0.0,0.0


In [22]:
# Avaliando a acurácia
avaliador = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="accuracy")
avaliador.evaluate(previsoes)

0.91

In [26]:
# Resumindo as previsões - Confusion Matrix
previsoes.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  149|
|  0.0|       1.0|   21|
|  1.0|       0.0|    6|
|  0.0|       0.0|  124|
+-----+----------+-----+

