In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.appName("LogisticRegression").getOrCreate()

In [19]:
# IMPORTAR FUNCIONES PROPIAS
import sys
sys.path.append('../../06.Big_data_PySpark')

In [20]:
# Metricas
from funciones import obtener_metricas_clasificacion

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Logistic Regression

In [3]:
# Load training data
data = spark.read.format("libsvm").load("../02.data/sample_libsvm_data.txt")

**TRAIN Y TEST**

In [15]:
# .randomSplit()

train, test = data.randomSplit(weights = [0.7, 0.3], seed = 42)

In [16]:
# INICIO MODELO VACIO
log_reg = LogisticRegression()

# Fit the model
model = log_reg.fit(train)

# Predicciones
y_hat = model.evaluate(test)

In [17]:
y_hat.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[100,101,102...|[3.65011628362891...|[0.97467016776878...|       0.0|
|  0.0|(692,[123,124,125...|[45.9467283021876...|           [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|[33.3618723288205...|[0.99999999999999...|       0.0|
|  0.0|(692,[124,125,126...|[60.9473198030384...|           [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|[33.8179782963031...|[0.99999999999999...|       0.0|
|  0.0|(692,[124,125,126...|[37.5584938791617...|           [1.0,0.0]|       0.0|
|  0.0|(692,[125,126,127...|[45.8622454395488...|           [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|[21.7144853715542...|[0.99999999962887...|       0.0|
|  0.0|(692,[126,127,128...|[35.9974214525556...|[0.99999999999999...|       0.0|
|  0.0|(692,[126

In [18]:
y_hat = y_hat.predictions.select("label", "prediction")

In [21]:
obtener_metricas_clasificacion(y_hat)



{'Confusion Matrix': array([[19.,  1.],
        [ 0., 15.]]),
 'Precision label 0': 1.0,
 'Precision label 1': 0.9375,
 'Recall label 0': 0.95,
 'Recall label 1': 1.0,
 'F1-Score label 0': 0.9743589743589743,
 'F1-Score label 1': 0.967741935483871,
 'Accuracy': 0.9714285714285714,
 'Falsos positivos label 0': 0.0,
 'Falsos positivos label 1': 0.05}

_**Documentacion:** https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.mllib.evaluation.MulticlassMetrics.html_

### Evaluators == PREDICT

**Documentacion:** Binary <br> https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.BinaryClassificationEvaluator

**Documentacion:** MultiClass <br>
https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.MulticlassClassificationEvaluator

In [33]:
# Inicio objeto evaluador para problemas binarios
evaluator = BinaryClassificationEvaluator(rawPredictionCol = "prediction", labelCol = "label")

# No tiene accuracy

In [34]:
# Sin parametros de metricas, ROC como defecto
evaluator.evaluate(dataset = y_hat)

0.96875

In [35]:
# Con parametros
evaluator.evaluate(dataset = y_hat, params = {evaluator.metricName: "areaUnderROC"})

0.96875

In [36]:
# Area Under Precision-Recall, esta metrica puede resultar util cuando las clases estan desbalanceadas
evaluator.evaluate(dataset = y_hat, params = {evaluator.metricName: "areaUnderPR"})

0.9830357142857142

In [37]:
# MultiClass
evaluator_m = MulticlassClassificationEvaluator(predictionCol = "prediction", labelCol = "label", metricName = "accuracy")

accuracy = evaluator.evaluate(y_hat)

accuracy

0.96875

In [None]:
################################################################################################################################