In [10]:
!pip install pyspark

In [11]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("LogisticRegression")
    .getOrCreate()
)

## Logistic Regression

In [12]:
# Read the sample data set, stored in LIBSVM format, into a DataFrame.
data = spark.read.load(
    "/kaggle/input/pyspark-ml-logistic-regression/sample_libsvm_data.txt",
    format="libsvm",
)

# Logistic regression estimator with its default parameters.
log_reg = LogisticRegression()

# Train on the loaded data.
model = log_reg.fit(data)

# Summary object attached to the fitted model.
summary = model.summary

In [13]:
# Preview the predictions DataFrame held by the model summary.
training_predictions = summary.predictions
training_predictions.show()

In [14]:
# Evaluate the fitted model on the data set; as the cell's last expression,
# the returned summary object is displayed.
model.evaluate(dataset=data)

In [15]:
# Keep the evaluation summary so its predictions can be inspected.
y_hat = model.evaluate(dataset=data)

# Preview the predictions DataFrame exposed by the summary.
y_hat.predictions.show()

In [16]:
# Keep only the columns the evaluators need. The frame is rebuilt from the
# model rather than from the previous value of `y_hat`, so re-running this
# cell does not fail once `y_hat` already holds a DataFrame (which has no
# `.predictions` attribute).
y_hat = model.evaluate(data).predictions.select("label", "prediction")
y_hat.show()

_**Documentación:**_ https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.mllib.evaluation.MulticlassMetrics.html

In [17]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [18]:
# MulticlassMetrics reads each record positionally as (prediction, label).
# `y_hat` is ordered (label, prediction), so the columns are reordered here;
# otherwise the confusion matrix is transposed and precision/recall are swapped.
metrics = MulticlassMetrics(y_hat.select("prediction", "label").rdd)

# Confusion matrix
print("Confusion Matrix:")
print(metrics.confusionMatrix().toArray())

# Precision per label
print("Precision label 0")
print(metrics.precision(label=0.0))

print("Precision label 1")
print(metrics.precision(label=1.0))

# Recall per label
print("Recall label 0")
print(metrics.recall(label=0.0))

print("Recall label 1")
print(metrics.recall(label=1.0))

# F1 score per label
print("F1-Score label 0")
print(metrics.fMeasure(label=0.0))

print("F1-Score label 1")
print(metrics.fMeasure(label=1.0))

# Overall accuracy
print("Accuracy")
print(metrics.accuracy)

# False positive rate per label
print("Falsos positivos label 0")
print(metrics.falsePositiveRate(label=0.0))

print("Falsos positivos label 1")
print(metrics.falsePositiveRate(label=1.0))

### Evaluators

**Documentación:** Binary <br> https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.BinaryClassificationEvaluator

**Documentación:** MultiClass <br>
https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.MulticlassClassificationEvaluator

In [19]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [20]:
# NOTE(review): the hard 0/1 "prediction" column is used as the raw score, so
# the area-under-curve values are presumably computed from a very coarse curve.
# The model's "rawPrediction" or "probability" column would give finer-grained
# scores, but the evaluated DataFrame only keeps "label" and "prediction" —
# confirm this is intended.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="label")

# Display the configured metric name. `evaluator.metricName` would only show
# the Param descriptor; by default the metric is the area under the ROC curve.
evaluator.getMetricName()

In [21]:
# No metric override: the evaluator's configured metric is used.
evaluator.evaluate(y_hat)

In [22]:
# With a per-call parameter override that selects the metric explicitly.
evaluator.evaluate(
    y_hat,
    params={evaluator.metricName: "areaUnderROC"},
)

In [23]:
# Area under the precision-recall curve; this metric can be useful when the
# classes are imbalanced.
evaluator.evaluate(
    y_hat,
    params={evaluator.metricName: "areaUnderPR"},
)

In [24]:
# Multiclass evaluator configured to report accuracy.
evaluator_m = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="accuracy")

# Score with the multiclass evaluator created above. The binary `evaluator`
# would return an area-under-curve value here, not accuracy.
accuracy = evaluator_m.evaluate(y_hat)

accuracy

In [None]:
################################################################################################################################