##### Fernando Amaral
##### LogisticRegression

In [None]:
# Logistic Regression Example

In [1]:
# findspark.init() locates the local Spark installation and adds pyspark to
# sys.path, so it must run BEFORE anything is imported from pyspark; calling it
# after the import (as before) has no effect on whether pyspark can be found.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new one
# registered under the application name "logistic".
spark = SparkSession.builder.appName("logistic").getOrCreate()

In [2]:
# Load the churn dataset: a ';'-separated CSV with a header row; column types
# are inferred by Spark rather than declared.
# NOTE(review): Balance and EstimatedSalary display as large integers in the
# output below - confirm whether the source file dropped a decimal separator.
opcoes_csv = {"header": True, "inferSchema": True, "sep": ";"}
churn = spark.read.csv("Churn.csv", **opcoes_csv)
churn.show(5)

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|
|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+-------

In [3]:
# Spark ML estimators expect all predictors packed into a single vector column.
# RFormula builds it from an R-style formula: "Exited ~ ." uses Exited as the
# label and every other column as a feature. The feature vector is written to
# "independente" and the label to "dependente".
from pyspark.ml.feature import RFormula
Rformula = RFormula(
    formula="Exited ~ .",
    featuresCol="independente",
    labelCol="dependente",
)
rformula_ajustada = Rformula.fit(churn)
churnrf = rformula_ajustada.transform(churn)
churnrf.select("independente", "dependente").show(5, truncate=False)

+--------------------------------------------------------------+----------+
|independente                                                  |dependente|
+--------------------------------------------------------------+----------+
|[619.0,1.0,0.0,0.0,42.0,2.0,0.0,1.0,1.0,1.0,1.0134888E7]      |1.0       |
|[608.0,0.0,0.0,0.0,41.0,1.0,8380786.0,1.0,0.0,1.0,1.1254258E7]|0.0       |
|[502.0,1.0,0.0,0.0,42.0,8.0,1596608.0,3.0,1.0,0.0,1.1393157E7]|1.0       |
|(11,[0,1,4,5,7,10],[699.0,1.0,39.0,1.0,2.0,9382663.0])        |0.0       |
|[850.0,0.0,0.0,0.0,43.0,2.0,1.2551082E7,1.0,1.0,1.0,790841.0] |0.0       |
+--------------------------------------------------------------+----------+
only showing top 5 rows



In [6]:
# Train/test split (80% / 20%).
# A fixed seed makes the random split - and therefore the row counts and every
# metric computed further down - reproducible across kernel restarts.
churnTreino, churnTeste = churnrf.randomSplit([0.8, 0.2], seed=42)
print(churnTreino.count())
print(churnTeste.count())

7907
2093


In [7]:
# Fit a logistic regression on the training split, reading features from
# "independente" and the label from "dependente", with at most 100 iterations
# and a regularization parameter of 0.08.
from pyspark.ml.classification import LogisticRegression
logistic = LogisticRegression(
    featuresCol="independente",
    labelCol="dependente",
    maxIter=100,
    regParam=0.08,
)
modelo = logistic.fit(churnTreino)

In [8]:
# Model summary.
# Per the PySpark documentation, `summary` reports metrics computed on the data
# the model was trained on, so these figures are training metrics and not a
# held-out evaluation.
resumo = modelo.summary

acuracia, precisao, recall, auc = (
    resumo.accuracy,
    resumo.weightedPrecision,
    resumo.weightedRecall,
    resumo.areaUnderROC,
)

print("Acurácia: ", acuracia, "\nPrecisão: ", precisao, "\nRecall: ", recall, "\nAUC ", auc)

Acurácia:  0.8070064499810294 
Precisão:  0.785845409667151 
Recall:  0.8070064499810294 
AUC  0.774166228267313


In [9]:
# Score the held-out split with the fitted model and show the true label next
# to the predicted class, the class probabilities and the raw scores.
previsao = modelo.transform(churnTeste)
colunas_previsao = ["dependente", "prediction", "probability", "rawPrediction"]
previsao.select(*colunas_previsao).show(5, truncate=False)

+----------+----------+----------------------------------------+----------------------------------------+
|dependente|prediction|probability                             |rawPrediction                           |
+----------+----------+----------------------------------------+----------------------------------------+
|1.0       |0.0       |[0.8146833794235251,0.1853166205764749] |[1.4807337214829444,-1.4807337214829444]|
|1.0       |0.0       |[0.8217900424608159,0.17820995753918412]|[1.5285225477109936,-1.5285225477109936]|
|0.0       |0.0       |[0.8563058906626052,0.1436941093373948] |[1.7849408617488183,-1.7849408617488183]|
|0.0       |0.0       |[0.7439320018515045,0.25606799814849546]|[1.0665066083184311,-1.0665066083184311]|
|0.0       |0.0       |[0.9027794076014881,0.09722059239851188]|[2.228495690223417,-2.228495690223417]  |
+----------+----------+----------------------------------------+----------------------------------------+
only showing top 5 rows



In [10]:
# Evaluate the test-set predictions: area under the ROC curve, computed from
# the "rawPrediction" column against the "dependente" label.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
avaliar = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="dependente",
    metricName="areaUnderROC",
)
areaUnderRoc = avaliar.evaluate(previsao)
print(areaUnderRoc)


0.7397899106330232
