<div class="alert alert-info">

# <font color = #0000> <i> <b>  Challenge 2. Comments

</div>


In [1]:
# Análisis y Modelo de Regresión Logística para Predecir la Calidad del Vino
# =========================================================================

# ## Introducción
# Este notebook está diseñado para cargar, explorar y preprocesar el dataset de calidad de vinos, y entrenar un modelo de regresión logística utilizando PySpark en un entorno de Databricks.

# ## 1. Configuración de Databricks y Bibliotecas
# Importamos las bibliotecas necesarias y configuramos la conexión con Databricks.

# Cleaning and sorting
import sys
import findspark
findspark.init()

import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

### <font color = #ff4fa4> <i> <b> Seteamos la sesión de Spark con su cluster que prevuiamente fue hallada con findspark.init() </b>:

In [2]:

spark = SparkSession.builder\
        .master("local[*]")\
        .appName('WineQualityPrediction')\
        .getOrCreate()

In [6]:
# ## 2. Carga de Datos
# Cargamos el archivo CSV con los datos del vino y revisamos las primeras filas.

# Cargar el archivo CSV

df = spark.read.csv('data/winequality-red.csv', 
                       inferSchema = True,
                       sep=";", 
                       header = True)


In [7]:
type(df)

pyspark.sql.dataframe.DataFrame

In [10]:
df.count()

1599

In [11]:
len(df.columns)

12

In [20]:
to_show_in_Pandas = df.toPandas()
to_show_in_Pandas

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,label
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,0
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,0
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,0
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,0


In [14]:
df.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [17]:
#describe_pd = df.describe().toPandas()
describe_pd

Unnamed: 0,summary,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
1,mean,8.319637273295838,0.5278205128205131,0.2709756097560964,2.538805503439652,0.0874665415884925,15.87492182614134,46.46779237023139,0.9967466791744832,3.311113195747343,0.6581488430268921,10.422983114446502,5.636022514071295
2,stddev,1.7410963181276948,0.1790597041535352,0.1948011374053182,1.40992805950728,0.04706530201009,10.46015696980971,32.89532447829907,0.0018873339538427,0.1543864649035427,0.1695069795901101,1.0656675818473935,0.8075694397347051
3,min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
4,max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


### <font color = #ff4fa4> <i> <b> Añade una columna 'label' cuyos valores: <br> Si la columna 'quality' >= 6: TRUE: <br> TRUE: 1 ; FALSE: 0 </b>:

In [22]:
# Convertir la columna "quality" a una variable binaria en una columna llamada "label"
df = df.withColumn("label", (col("quality") >= 6).cast("int"))
df.show()

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-----+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|label|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-----+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|    0|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|    0|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8|      5|    0|
|         11.2|            0.28|       0.56|           1.9|    0

### <font color = #ff4fa4> <i> <b> This is typically used in machine learning tasks to define feature columns (input variables for a model) while excluding columns that represent the target variable ("quality") or irrelevant data ("label"). </b>

### <font color = #ff4fa4> <i> <b> set() eficientiza operaciones como si se trabajara con tuplas. se remueve en {} las columnas no deseadas con '-' y se vuelve a transformar en listas   </b>

In [24]:
# Selección de características
feature_columns = list(set(df.columns) - {"quality", "label"})
##feature_columns = [c for c in df.columns if c not in ["quality", "label"]]

<div class="alert alert-info">

### <font color = #0000> <i> <b> new_column.transform(df) = <br> 'plug (new_column) to this 'df' through .transform()) <br> seguirá siendo usado
</div>


In [48]:
# Ensamblaje de las características en un solo vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
## add the assembler column to df through .transform()
df = assembler.transform(df)

Unnamed: 0,pH,sulphates,alcohol,quality,label,features
0,3.51,0.56,9.4,5,0,"[1.9, 0.56, 34.0, 9.4, 0.0, 0.076, 11.0, 3.51,..."
1,3.20,0.68,9.8,5,0,"[2.6, 0.68, 67.0, 9.8, 0.0, 0.098, 25.0, 3.2, ..."
2,3.26,0.65,9.8,5,0,"[2.3, 0.65, 54.0, 9.8, 0.04, 0.092, 15.0, 3.26..."
3,3.16,0.58,9.8,6,1,"[1.9, 0.58, 60.0, 9.8, 0.56, 0.075, 17.0, 3.16..."
4,3.51,0.56,9.4,5,0,"[1.9, 0.56, 34.0, 9.4, 0.0, 0.076, 11.0, 3.51,..."
...,...,...,...,...,...,...
1594,3.45,0.58,10.5,5,0,"[2.0, 0.58, 44.0, 10.5, 0.08, 0.09, 32.0, 3.45..."
1595,3.52,0.76,11.2,6,1,"[2.2, 0.76, 51.0, 11.2, 0.1, 0.062, 39.0, 3.52..."
1596,3.42,0.75,11.0,6,1,"[2.3, 0.75, 40.0, 11.0, 0.13, 0.076, 29.0, 3.4..."
1597,3.57,0.71,10.2,5,0,"[2.0, 0.71, 44.0, 10.2, 0.12, 0.075, 32.0, 3.5..."


In [53]:
##How does it look in Pandas
model_to_see_in_pandas = df.toPandas()
model_to_see_in_pandas.iloc[:, 8:15]

Unnamed: 0,pH,sulphates,alcohol,quality,label,features
0,3.51,0.56,9.4,5,0,"[1.9, 0.56, 34.0, 9.4, 0.0, 0.076, 11.0, 3.51,..."
1,3.20,0.68,9.8,5,0,"[2.6, 0.68, 67.0, 9.8, 0.0, 0.098, 25.0, 3.2, ..."
2,3.26,0.65,9.8,5,0,"[2.3, 0.65, 54.0, 9.8, 0.04, 0.092, 15.0, 3.26..."
3,3.16,0.58,9.8,6,1,"[1.9, 0.58, 60.0, 9.8, 0.56, 0.075, 17.0, 3.16..."
4,3.51,0.56,9.4,5,0,"[1.9, 0.56, 34.0, 9.4, 0.0, 0.076, 11.0, 3.51,..."
...,...,...,...,...,...,...
1594,3.45,0.58,10.5,5,0,"[2.0, 0.58, 44.0, 10.5, 0.08, 0.09, 32.0, 3.45..."
1595,3.52,0.76,11.2,6,1,"[2.2, 0.76, 51.0, 11.2, 0.1, 0.062, 39.0, 3.52..."
1596,3.42,0.75,11.0,6,1,"[2.3, 0.75, 40.0, 11.0, 0.13, 0.076, 29.0, 3.4..."
1597,3.57,0.71,10.2,5,0,"[2.0, 0.71, 44.0, 10.2, 0.12, 0.075, 32.0, 3.5..."


### <font color = #ff4fa4> <i> <b> The StandardScaler is used to scale or normalize the feature vectors, typically to ensure that all features have a mean of 0 and a standard deviation of 1. </b> </b>

In [54]:
# Estandarización de las características
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(df)
df = scaler_model.transform(df)

In [56]:
model_to_see_in_pandas = df.toPandas()
model_to_see_in_pandas.iloc[:, 10:16]

Unnamed: 0,alcohol,quality,label,features,scaledFeatures
0,9.4,5,0,"[1.9, 0.56, 34.0, 9.4, 0.0, 0.076, 11.0, 3.51,...","[1.3475864865502305, 3.303698770128244, 1.0335..."
1,9.8,5,0,"[2.6, 0.68, 67.0, 9.8, 0.0, 0.098, 25.0, 3.2, ...","[1.8440657184371576, 4.01163422087001, 2.03676..."
2,9.8,5,0,"[2.3, 0.65, 54.0, 9.8, 0.04, 0.092, 15.0, 3.26...","[1.6312889047713315, 3.8346503581845686, 1.641..."
3,9.8,6,1,"[1.9, 0.58, 60.0, 9.8, 0.56, 0.075, 17.0, 3.16...","[1.3475864865502305, 3.4216880119185378, 1.823..."
4,9.4,5,0,"[1.9, 0.56, 34.0, 9.4, 0.0, 0.076, 11.0, 3.51,...","[1.3475864865502305, 3.303698770128244, 1.0335..."
...,...,...,...,...,...
1594,10.5,5,0,"[2.0, 0.58, 44.0, 10.5, 0.08, 0.09, 32.0, 3.45...","[1.4185120911055058, 3.4216880119185378, 1.337..."
1595,11.2,6,1,"[2.2, 0.76, 51.0, 11.2, 0.1, 0.062, 39.0, 3.52...","[1.5603633002160564, 4.483591188031188, 1.5503..."
1596,11.0,6,1,"[2.3, 0.75, 40.0, 11.0, 0.13, 0.076, 29.0, 3.4...","[1.6312889047713315, 4.4245965671360405, 1.215..."
1597,10.2,5,0,"[2.0, 0.71, 44.0, 10.2, 0.12, 0.075, 32.0, 3.5...","[1.4185120911055058, 4.188618083555451, 1.3375..."


### <font color = #ff4fa4> <i> <b> Separa aleatoriamente (seed = 42) el df en dos. Uno de prueba (.80) y el otro (.20) donde se probarán los datos </b>

In [57]:
# Dividir los datos en conjuntos de entrenamiento y prueba
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

### <font color = #ff4fa4> <i> <b> Ya separado el df. se le aplica la regresión logistica (variables binomiales). Featurescol: variables ecplicativas labelcol: Variable dependiente y se le aplica fit para entrenar el modelo  </b>

In [58]:
# Entrenar el modelo de regresión logística
lr = LogisticRegression(featuresCol="scaledFeatures", labelCol="label")
lr_model = lr.fit(train_data)

### <font color = #ff4fa4> <i> <b> Applies the trained logistic regression model (lr_model) to the test data (test_data) to make predictions</b>

In [66]:
# ## 6. Evaluación del Modelo
# Realizar predicciones en el conjunto de prueba
predictions = lr_model.transform(test_data)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+----------------+-------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|         alcohol|quality|label|            features|      scaledFeatures|       rawPrediction|         probability|prediction|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+----------------+-------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|          4.9|            0.42|        0.0|           2.1|    0.048|               16.0|                42.0|0.99154|3.71|     0.74|            14.0|      7|    1|[2.1,0.74,42.0,14...|[1.48943769566078...|[-3.9923945913

In [72]:
model_to_see_in_pandas = predictions.toPandas()
model_to_see_in_pandas.loc[:, 'label':'prediction']

Unnamed: 0,label,features,scaledFeatures,rawPrediction,probability,prediction
0,1,"[2.1, 0.74, 42.0, 14.0, 0.0, 0.048, 16.0, 3.71...","[1.489437695660781, 4.3656019462408935, 1.2767...","[-3.9923945913747687, 3.9923945913747687]","[0.01812103566954337, 0.9818789643304566]",1.0
1,1,"[1.2, 0.59, 46.0, 12.5, 0.0, 0.041, 16.0, 4.01...","[0.8511072546633035, 3.480682632813685, 1.3983...","[-1.0720692805092469, 1.0720692805092469]","[0.25500976345337223, 0.7449902365466278]",1.0
2,0,"[1.6, 0.62, 96.0, 11.5, 0.24, 0.05, 32.0, 3.74...","[1.1348096728844046, 3.657666495499127, 2.9183...","[1.355881368681736, -1.355881368681736]","[0.7950894967723428, 0.20491050322765725]",0.0
3,0,"[1.8, 0.55, 50.0, 9.2, 0.25, 0.103, 13.0, 3.38...","[1.2766608819949552, 3.2447041492330966, 1.519...","[1.1746862038269654, -1.1746862038269654]","[0.7639910242923175, 0.23600897570768253]",0.0
4,1,"[2.2, 0.88, 89.0, 13.5666666666667, 0.11, 0.04...","[1.5603633002160564, 5.191526638772954, 2.7055...","[-2.9841912751806596, 2.9841912751806596]","[0.048145190017380136, 0.9518548099826198]",1.0
...,...,...,...,...,...,...
270,1,"[2.9, 0.8, 43.0, 10.0, 0.68, 0.085, 17.0, 3.06...","[2.0568425321029835, 4.719569671611777, 1.3071...","[-0.9762066016468509, 0.9762066016468509]","[0.27364512491761456, 0.7263548750823854]",1.0
271,1,"[3.0, 0.93, 15.0, 12.0, 0.67, 0.093, 6.0, 3.02...","[2.1277681366582586, 5.48649974324869, 0.45599...","[-3.1732868600782744, 3.1732868600782744]","[0.04018345358914593, 0.9598165464108541]",1.0
272,1,"[2.2, 0.84, 24.0, 9.2, 0.44, 0.075, 10.0, 3.07...","[1.5603633002160564, 4.955548155192365, 0.7295...","[-1.6966999412606327, 1.6966999412606327]","[0.1548967633596015, 0.8451032366403985]",1.0
273,0,"[4.2, 0.74, 23.0, 11.1, 0.49, 0.095, 10.0, 2.9...","[2.978875391321562, 4.3656019462408935, 0.6991...","[-1.7694245283244001, 1.7694245283244001]","[0.14561390905506064, 0.8543860909449393]",1.0


### <font color = #ff4fa4> <i> <b> The ROC Curve It measures the classifier's ability to distinguish between the positive and negative classes.</b>:

In [62]:
# Evaluador para medir el área bajo la curva ROC
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
roc_auc = evaluator.evaluate(predictions)
print(f"Área bajo la curva ROC: {roc_auc:.2f}")

Área bajo la curva ROC: 0.84


### <font color = #ff4fa4> <i> <b> Accuracy=Total Predictions/Correct Predictions

In [73]:
# Calcular la precisión y mostrar el informe de clasificación
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction", 
    metricName="accuracy"
)
accuracy = evaluator_accuracy.evaluate(predictions)
print(f"Exactitud del modelo: {accuracy:.2f}")

Exactitud del modelo: 0.77


In [65]:
# Mostrar una muestra de las predicciones
predictions.select("label", "prediction", "probability").show(50)

+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|    1|       1.0|[0.01812103566954...|
|    1|       1.0|[0.25500976345337...|
|    0|       0.0|[0.79508949677234...|
|    0|       0.0|[0.76399102429231...|
|    1|       1.0|[0.04814519001738...|
|    1|       1.0|[0.15700770900180...|
|    0|       0.0|[0.86473870704574...|
|    0|       1.0|[0.08927601562779...|
|    1|       1.0|[0.02939243749370...|
|    1|       1.0|[0.36096475865873...|
|    0|       0.0|[0.85208381471275...|
|    0|       0.0|[0.64187788888959...|
|    1|       1.0|[0.06993108821720...|
|    1|       1.0|[0.24315454319205...|
|    0|       0.0|[0.66147912450403...|
|    1|       1.0|[0.16739006022887...|
|    1|       1.0|[0.36213247512127...|
|    0|       0.0|[0.55351464607783...|
|    1|       1.0|[0.20606096782966...|
|    0|       1.0|[0.19979776370801...|
|    1|       1.0|[0.30680127788709...|
|    1|       1.0|[0.35211164406358...|
