### Implementación de modelos ML

Librerias

In [48]:
import numpy as np
import pandas as pd 
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.ml.feature import MinMaxScaler
from pyspark.sql.functions import round
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

We will use a local Spark cluster using all available cores, which will be accessible via a SparkSession object.

In [2]:
spark = SparkSession.builder \
                    .master('local[*]') \
                    .appName('model_ML') \
                    .getOrCreate()
# What version of Spark?
print(spark.version)

3.4.1


#### Exploración de datos

In [3]:
df = spark.read.csv('./data/diabetes.csv',sep=',',header=True,inferSchema=True,nullValue='NA')

In [4]:
# Get number of records
print("The data contain %d records." % df.count())
# View the first five records
df.show(5)

The data contain 768 records.
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 5 rows



In [5]:
# Check column data types
print(df.dtypes)

[('Pregnancies', 'int'), ('Glucose', 'int'), ('BloodPressure', 'int'), ('SkinThickness', 'int'), ('Insulin', 'int'), ('BMI', 'double'), ('DiabetesPedigreeFunction', 'double'), ('Age', 'int'), ('Outcome', 'int')]


#### Preparación de datos

In [7]:
print("The number of records: ", df.count())

The number of records:  768


In [6]:
df=df.dropDuplicates()
print("The number of records after removing the duplicate : ", df.count())

The number of records after removing the duplicate :  768


In [8]:
# Remove records with missing values 
df = df.dropna()
print(df.count())

768


In [10]:
assembler = VectorAssembler(
    inputCols=["Pregnancies", "Glucose", "BloodPressure","SkinThickness","Insulin","BMI","DiabetesPedigreeFunction","Age"],
    outputCol="features"
)

assembled_df = assembler.transform(df)
assembled_df.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|            features|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|          4|    129|           60|           12|    231|27.5|                   0.527| 31|      0|[4.0,129.0,60.0,1...|
|          2|    105|           58|           40|     94|34.9|                   0.225| 25|      0|[2.0,105.0,58.0,4...|
|          3|    129|           64|           29|    115|26.4|                   0.219| 28|      1|[3.0,129.0,64.0,2...|
|          7|    136|           90|            0|      0|29.9|                    0.21| 50|      0|[7.0,136.0,90.0,0...|
|          6|      0|           68|           41|      0|39.0|                   0.727| 41|      1|[6.0,0.0,68.0,41....|
|         12|     84|           

In [11]:
# Aplicar MinMaxScaler
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# Ajustar el scaler en el DataFrame
scalerModel = scaler.fit(assembled_df)

# Transformar los datos
scaledData = scalerModel.transform(assembled_df)

In [12]:
scaledData.select("scaledFeatures","Outcome").show()

+--------------------+-------+
|      scaledFeatures|Outcome|
+--------------------+-------+
|[0.23529411764705...|      0|
|[0.11764705882352...|      0|
|[0.17647058823529...|      1|
|[0.41176470588235...|      0|
|[0.35294117647058...|      1|
|[0.70588235294117...|      1|
|[0.17647058823529...|      0|
|[0.35294117647058...|      0|
|[0.58823529411764...|      1|
|[0.17647058823529...|      1|
|[0.29411764705882...|      1|
|[0.70588235294117...|      0|
|[0.05882352941176...|      0|
|[0.11764705882352...|      0|
|[0.29411764705882...|      1|
|[0.58823529411764...|      0|
|[0.29411764705882...|      0|
|[0.0,0.4874371859...|      0|
|[0.0,0.7085427135...|      0|
|[0.11764705882352...|      0|
+--------------------+-------+
only showing top 20 rows



#### Models

In [13]:
# Split into training and testing sets in a 80:20 ratio
df_train, df_test = scaledData.randomSplit([0.8, 0.2], seed=42)

# Check that training set has around 80% of records
training_ratio = df_train.count() / scaledData.count()
print(training_ratio)

0.83984375


##### Decision Trees

In [21]:
# Create a DT classifier object and fit to the training data
tree = DecisionTreeClassifier(featuresCol="features", labelCol="Outcome")

In [24]:
tree_model = tree.fit(df_train)

In [26]:
# Create predictions on test data
prediction = tree_model.transform(df_test)
prediction.select('Outcome', 'prediction', 'probability').show(10,False)

+-------+----------+-----------------------------------------+
|Outcome|prediction|probability                              |
+-------+----------+-----------------------------------------+
|0      |0.0       |[0.9915966386554622,0.008403361344537815]|
|0      |0.0       |[0.8392857142857143,0.16071428571428573] |
|0      |0.0       |[0.8392857142857143,0.16071428571428573] |
|0      |0.0       |[0.9915966386554622,0.008403361344537815]|
|0      |0.0       |[0.9915966386554622,0.008403361344537815]|
|0      |0.0       |[0.9915966386554622,0.008403361344537815]|
|0      |0.0       |[0.9915966386554622,0.008403361344537815]|
|0      |0.0       |[0.8392857142857143,0.16071428571428573] |
|0      |0.0       |[0.9714285714285714,0.02857142857142857] |
|1      |0.0       |[0.8392857142857143,0.16071428571428573] |
+-------+----------+-----------------------------------------+
only showing top 10 rows



Evaluate the model 

A confusion matrix gives a useful breakdown of predictions versus known values. It has four cells which represent the counts of: True Negatives (TN) — prediction is negative & label is negative

True Positives (TP) — prediction is positive & label is positive

False Negatives (FN) — prediction is negative & label is positive

False Positives (FP) — prediction is positive & label is negative

In [27]:
# Create a confusion matrix
prediction.groupBy('Outcome', 'prediction').count().show()

# Calculate the elements of the confusion matrix
TN = prediction.filter('prediction = 0 AND Outcome = prediction').count()
TP = prediction.filter('prediction = 1 AND Outcome = prediction').count()
FN = prediction.filter('prediction = 0 AND Outcome != prediction').count()
FP = prediction.filter('prediction = 1 AND Outcome != prediction').count()

+-------+----------+-----+
|Outcome|prediction|count|
+-------+----------+-----+
|      1|       0.0|   18|
|      0|       0.0|   70|
|      1|       1.0|   25|
|      0|       1.0|   10|
+-------+----------+-----+



In [40]:
eval_accuracy = MulticlassClassificationEvaluator(labelCol="Outcome", predictionCol="prediction", metricName="accuracy")
eval_precision = MulticlassClassificationEvaluator(labelCol="Outcome", predictionCol="prediction", metricName="precisionByLabel")
eval_recall = MulticlassClassificationEvaluator(labelCol="Outcome", predictionCol="prediction", metricName="recallByLabel")
eval_f1 = MulticlassClassificationEvaluator(labelCol="Outcome", predictionCol="prediction", metricName="f1")

In [42]:
accuracy = eval_accuracy.evaluate(prediction)
precision = eval_precision.evaluate(prediction)
recall = eval_recall.evaluate(prediction)
f1score = eval_f1.evaluate(prediction)
auc = eval_accuracy.evaluate(prediction)

print('Accuracy:',accuracy)
print('Precision:',precision)
print('Recall:',recall)
print('F1 Score:',f1score)
print('AUC:',auc)


Accuracy: 0.7723577235772358
Precision: 0.7954545454545454
Recall: 0.875
F1 Score: 0.7661038148843028
AUC: 0.7723577235772358


##### Logistic Regression

In [43]:
# Create a classifier object and train on training data
logistic = LogisticRegression(featuresCol="features", labelCol="Outcome").fit(df_train)
# Create predictions for the testing data and show confusion matrix
prediction = logistic.transform(df_test)
prediction.groupBy('Outcome', 'prediction').count().show()

+-------+----------+-----+
|Outcome|prediction|count|
+-------+----------+-----+
|      1|       0.0|   18|
|      0|       0.0|   71|
|      1|       1.0|   25|
|      0|       1.0|    9|
+-------+----------+-----+



In [44]:
accuracy = eval_accuracy.evaluate(prediction)
precision = eval_precision.evaluate(prediction)
recall = eval_recall.evaluate(prediction)
f1score = eval_f1.evaluate(prediction)
auc = eval_accuracy.evaluate(prediction)

print('Accuracy:',accuracy)
print('Precision:',precision)
print('Recall:',recall)
print('F1 Score:',f1score)
print('AUC:',auc)

Accuracy: 0.7804878048780488
Precision: 0.797752808988764
Recall: 0.8875
F1 Score: 0.7735041693765896
AUC: 0.7804878048780488


##### GBT

In [46]:
# Create a classifier object and train on training data
gbt = GBTClassifier(featuresCol="features", labelCol="Outcome", maxIter=10).fit(df_train)
# Make predictions.
prediction = gbt.transform(df_test)
# Select example rows to display.
prediction.groupBy('Outcome', 'prediction').count().show()

+-------+----------+-----+
|Outcome|prediction|count|
+-------+----------+-----+
|      1|       0.0|   19|
|      0|       0.0|   68|
|      1|       1.0|   24|
|      0|       1.0|   12|
+-------+----------+-----+



In [47]:
accuracy = eval_accuracy.evaluate(prediction)
precision = eval_precision.evaluate(prediction)
recall = eval_recall.evaluate(prediction)
f1score = eval_f1.evaluate(prediction)
auc = eval_accuracy.evaluate(prediction)

print('Accuracy:',accuracy)
print('Precision:',precision)
print('Recall:',recall)
print('F1 Score:',f1score)
print('AUC:',auc)

Accuracy: 0.7479674796747967
Precision: 0.7816091954022989
Recall: 0.85
F1 Score: 0.7420836006283205
AUC: 0.7479674796747967


##### Naive Bayes

In [49]:
# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial",featuresCol="features", labelCol="Outcome").fit(df_train)

# select example rows to display.
prediction = nb.transform(df_test)
# Select example rows to display.
prediction.groupBy('Outcome', 'prediction').count().show()

+-------+----------+-----+
|Outcome|prediction|count|
+-------+----------+-----+
|      1|       0.0|   24|
|      0|       0.0|   55|
|      1|       1.0|   19|
|      0|       1.0|   25|
+-------+----------+-----+



In [50]:
accuracy = eval_accuracy.evaluate(prediction)
precision = eval_precision.evaluate(prediction)
recall = eval_recall.evaluate(prediction)
f1score = eval_f1.evaluate(prediction)
auc = eval_accuracy.evaluate(prediction)

print('Accuracy:',accuracy)
print('Precision:',precision)
print('Recall:',recall)
print('F1 Score:',f1score)
print('AUC:',auc)

Accuracy: 0.6016260162601627
Precision: 0.6962025316455697
Recall: 0.6875
F1 Score: 0.6026627735372994
AUC: 0.6016260162601627
