In [4]:
# Load the libraries and start the Spark session
from pyspark.sql import SparkSession, functions as func
# NOTE(review): wildcard import kept as-is; none of the cells visible here
# use a name from pyspark.sql.types -- confirm before narrowing or removing it
from pyspark.sql.types import *
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import RFormula

# Session handle used by the data-loading cell below
spark = SparkSession.builder.getOrCreate()

In [12]:
# Read the churn CSV: ';'-separated, first row is the header, and column
# types are inferred from the data. Then preview the first 3 rows.
churn = (
    spark.read
    .options(inferSchema=True, header=True, sep=';')
    .csv('../arquivos/Churn.csv')
)
churn.show(3)

+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure|Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|      0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1|8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8|1596608|            3|        1|             0|       11393157|     1|
+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
only showing top 3 rows



In [27]:
# Build the model inputs: 'Exited' becomes the 'label' column and every other
# column is assembled into the 'features' vector. Rows with invalid values are
# skipped (handleInvalid="skip").
form = RFormula(
    formula="Exited ~ .",
    featuresCol="features",
    labelCol="label",
    handleInvalid="skip",
)
form_ajustada = form.fit(churn)
churn_t = form_ajustada.transform(churn)

# Preview the assembled vectors next to their labels, without truncation
colunas_modelo = churn_t.select('features', 'label')
colunas_modelo.show(3, truncate=False)

+--------------------------------------------------------------+-----+
|features                                                      |label|
+--------------------------------------------------------------+-----+
|[619.0,1.0,0.0,0.0,42.0,2.0,0.0,1.0,1.0,1.0,1.0134888E7]      |1.0  |
|[608.0,0.0,0.0,0.0,41.0,1.0,8380786.0,1.0,0.0,1.0,1.1254258E7]|0.0  |
|[502.0,1.0,0.0,0.0,42.0,8.0,1596608.0,3.0,1.0,0.0,1.1393157E7]|1.0  |
+--------------------------------------------------------------+-----+
only showing top 3 rows



In [30]:
# Split the transformed data into training (weight 0.7) and test (weight 0.3)
# sets. A fixed seed makes the random split repeatable, so the row counts
# printed here and the metric computed from the test set do not change from
# one run of the notebook to the next.
churn_treino, churn_teste = churn_t.randomSplit([0.7, 0.3], seed=42)
print(churn_treino.count())
print(churn_teste.count())

                                                                                

6955


[Stage 67:>                                                         (0 + 1) / 1]

3045


                                                                                

In [31]:
# Configure a decision-tree classifier on the 'features' / 'label' columns
# and fit it on the training split
dt = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='label',
)
modelo = dt.fit(churn_treino)

                                                                                

In [41]:
# Score the held-out test split with the fitted model
previsao = modelo.transform(churn_teste)

# Show the predicted class next to the true label for the first 5 rows
predito_vs_real = previsao.select('prediction', 'label')
predito_vs_real.show(5)

+----------+-----+
|prediction|label|
+----------+-----+
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  1.0|
|       1.0|  1.0|
|       0.0|  1.0|
+----------+-----+
only showing top 5 rows



In [46]:
# Evaluate the model with the area under the ROC curve.
# The ROC curve is traced by sweeping a threshold over a score, so the
# evaluator is given the model's 'rawPrediction' column. Passing the hard 0/1
# 'prediction' column instead leaves a single operating point, and the
# resulting "AUC" no longer measures how well the model ranks the rows.
avaliar = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)
print(avaliar.evaluate(previsao))