# Regressão Logística

- output de previsão binário: classificação de 0 ou 1 (podendo mostrar a probabilidade associada)
- n variáveis independentes (preditoras) contínuas ou discretas

# Hiperparâmetros:

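- link: função de ligação utilizada (identity, log, inverse, logit, probit, cloglog e sqrt). Obs.: este parâmetro pertence a `GeneralizedLinearRegression`; `LogisticRegression` usa sempre a função logit.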
- maxIter: número máximo de iterações no treinamento do modelo. (default= 100)
- regParam: valor de regularização (default= 0)

In [5]:
# findspark.init() has to run before pyspark is imported: it locates the Spark
# installation and puts its Python bindings on sys.path. Importing pyspark
# first fails on machines where pyspark is only reachable through findspark.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session if there is one, so
# re-running this cell does not start a second Spark application.
spark = SparkSession.builder.appName("logistic").getOrCreate()

In [8]:
# Load the churn dataset: the file is semicolon-delimited, its first line holds
# the column names, and column types are inferred from the data.
churn = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ";")
    .csv("Churn.csv")
)
churn.show(5)

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|
|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+

In [11]:
from pyspark.ml.feature import RFormula

# "Exited ~ ." takes Exited as the label and every other column as a feature.
# The assembled feature vector is written to "independente" and the label to
# "dependente".
Rformula = RFormula(
    formula="Exited ~ . ",
    featuresCol="independente",
    labelCol="dependente",
)

# Fit the formula on the data, then apply it to append the two new columns.
rformula_model = Rformula.fit(churn)
churnrf = rformula_model.transform(churn)

# truncate=False so the full feature vectors are printed.
churnrf.select("independente", "dependente").show(5, truncate=False)

+--------------------------------------------------------------+----------+
|independente                                                  |dependente|
+--------------------------------------------------------------+----------+
|[619.0,1.0,0.0,0.0,42.0,2.0,0.0,1.0,1.0,1.0,1.0134888E7]      |1.0       |
|[608.0,0.0,0.0,0.0,41.0,1.0,8380786.0,1.0,0.0,1.0,1.1254258E7]|0.0       |
|[502.0,1.0,0.0,0.0,42.0,8.0,1596608.0,3.0,1.0,0.0,1.1393157E7]|1.0       |
|(11,[0,1,4,5,7,10],[699.0,1.0,39.0,1.0,2.0,9382663.0])        |0.0       |
|[850.0,0.0,0.0,0.0,43.0,2.0,1.2551082E7,1.0,1.0,1.0,790841.0] |0.0       |
+--------------------------------------------------------------+----------+
only showing top 5 rows



In [13]:
# 80/20 train/test split. The seed is fixed so that re-running the notebook
# yields the same partitions; without it the split, and every metric computed
# afterwards, changes on each execution.
churnTreino, churnTeste = churnrf.randomSplit([0.8, 0.2], seed=42)

# Row counts of each partition (randomSplit weights are approximate, so these
# will be close to, not exactly, 80% and 20%).
print(churnTreino.count())
print(churnTeste.count())

7973
2027


In [17]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression estimator reading the feature vector / label columns
# produced by RFormula.
logistic = LogisticRegression(
    featuresCol="independente",
    labelCol="dependente",
    maxIter=100,    # upper bound on optimizer iterations
    regParam=0.08,  # regularization strength
)

# Train on the training partition only.
modelo = logistic.fit(churnTreino)

In [20]:
# Model performance as reported by the fitted model's summary. These metrics
# are computed on the data the model was trained on, not on the held-out set.
resumo = modelo.summary

acuracia, precisao, recall, auc = (
    resumo.accuracy,
    resumo.weightedPrecision,  # precision averaged over labels, weighted
    resumo.weightedRecall,     # recall averaged over labels, weighted
    resumo.areaUnderROC,
)

print(
    "Acurácia: ", acuracia,
    "\nPrecisão: ", precisao,
    "\nRecall: ", recall,
    "\nAUC: ", auc,
)


Acurácia:  0.8039633763953342 
Precisão:  0.7809395481971331 
Recall:  0.8039633763953342 
AUC:  0.7680852537190221


In [21]:
# Score the held-out partition; transform() appends the prediction,
# probability and rawPrediction columns.
previsao = modelo.transform(churnTeste)

# rawPrediction: the model's confidence in each class before it is converted
# to a probability (for each row the two entries are equal with opposite signs).
colunas = ["dependente", "prediction", "probability", "rawPrediction"]
previsao.select(*colunas).show(5, truncate=False)

+----------+----------+----------------------------------------+------------------------------------------+
|dependente|prediction|probability                             |rawPrediction                             |
+----------+----------+----------------------------------------+------------------------------------------+
|1.0       |0.0       |[0.8116905268216125,0.18830947317838753]|[1.4610323995471814,-1.4610323995471814]  |
|1.0       |0.0       |[0.7029842805360632,0.2970157194639368] |[0.8615494661244314,-0.8615494661244314]  |
|1.0       |0.0       |[0.6044330395877536,0.39556696041224637]|[0.42397081540959736,-0.42397081540959736]|
|1.0       |0.0       |[0.8182261450260968,0.1817738549739032] |[1.5043753999298666,-1.5043753999298666]  |
|1.0       |0.0       |[0.8813287607247073,0.11867123927529266]|[2.0050737493445867,-2.0050737493445867]  |
+----------+----------+----------------------------------------+------------------------------------------+
only showing top 5 rows



In [26]:
# Binary classification evaluator: area under the ROC curve on the test-set
# predictions.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

avaliar = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="dependente",
    metricName="areaUnderROC",
)

areaUnderRoc = avaliar.evaluate(previsao)
print(areaUnderRoc)

0.7578062413926573
