# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Code Lab 10: Heart attack prediction with Logistic Regression** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

In [None]:
import findspark
findspark.init()

#### Creacion de la conexión con el cluster de spark


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MLSpark-Logistic-Regression") \
    .master("spark://80d04dce9402:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext
spark.conf.set("spark.sql.shuffle.partitions", "5")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/25 15:09:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Preparación de Datos

In [None]:
from team_name.spark_utils import SparkUtils

# Define schema for the DataFrame
schema = SparkUtils.generate_schema([("male", "integer"),
                                    ("age", "integer"),
                                    ("education", "integer"),
                                    ("currentSmoker", "integer"),
                                    ("cigsPerDay", "integer"),
                                    ("BPMeds", "integer"),
                                    ("prevalentStroke", "integer"),
                                    ("prevalentHyp", "integer"),
                                    ("diabetes", "integer"),
                                    ("totChol", "float"),
                                    ("sysBP", "float"),
                                    ("diaBP", "float"),
                                    ("BMI", "float"),
                                    ("heartRate", "float"),
                                    ("glucose", "float"),
                                    ("TenYearCHD", "integer")
                                    ])
# Create DataFrame
heart_df = spark \
                .read \
                .schema(schema) \
                .option("header", "true") \
                .csv("/home/jovyan/notebooks/data/heart_disease/framingham.csv")

In [41]:
heart_df.count()

4238

### Assemble the features into a single vector column

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, avg

assembler_input_cols = ['male', 'age', 'education', 'currentSmoker', 'cigsPerDay',
                        'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes',
                        'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']


# Llenar nulos con media
mean_values = heart_df.select([
    avg(col(c)).alias(c) for c in assembler_input_cols
]).collect()[0].asDict()

# Reemplazar valores nulos
heart_df_filled = heart_df.fillna(mean_values)

# Aplicar el ensamblador
assembler = VectorAssembler(inputCols=assembler_input_cols, outputCol="features")
data_with_features = assembler.transform(heart_df_filled)

# Renombrar la columna objetivo como "label" para usar en modelos MLlib
data_with_features = data_with_features.withColumnRenamed("TenYearCHD", "label")


# Seleccionar solo las columnas relevantes para el modelo
final_data = data_with_features.select("label", "features")
final_data.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[1.0,39.0,4.0,0.0...|
|    0|(15,[1,2,9,10,11,...|
|    0|[1.0,48.0,1.0,1.0...|
|    1|[0.0,61.0,3.0,1.0...|
|    0|[0.0,46.0,3.0,1.0...|
+-----+--------------------+
only showing top 5 rows



### Split the data into training and test sets 80% training data and 20% testing data

In [46]:
train_df, test_df = final_data.randomSplit([0.8, 0.2], seed=57)

### Show the whole dataset

In [47]:
print("Original Dataset")
final_data.show(5)

# Print train dataset
print("train set")
train_df.show(5)

Original Dataset
+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[1.0,39.0,4.0,0.0...|
|    0|(15,[1,2,9,10,11,...|
|    0|[1.0,48.0,1.0,1.0...|
|    1|[0.0,61.0,3.0,1.0...|
|    0|[0.0,46.0,3.0,1.0...|
+-----+--------------------+
only showing top 5 rows

train set
+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
+-----+--------------------+
only showing top 5 rows



In [48]:
print(f"Full dataset: {final_data.count()}")
print(f"Train dataset: {train_df.count()}")
print(f"Test dataset: {test_df.count()}")


Full dataset: 4238
Train dataset: 3376
Test dataset: 862


### Create a logistic regression model

In [49]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=10, regParam=0.01)


# TRAINNING

In [50]:
lr_model = lr.fit(train_df)

# Print coefficients
print("Coefficients: " + str(lr_model.coefficients))

# Display model summary
training_summary = lr_model.summary

Coefficients: [0.6198313640500984,0.05573756860715636,-0.003201845606875305,0.056835717915893015,0.01468454629101763,0.16046390935074994,1.0040710593385345,0.3186421380151852,0.40293190655759714,0.002000858085421182,0.013388209920605687,-0.002957611579838103,-0.0029546291518847785,0.002803604745808852,0.004031326668904733]


### Predictions

In [53]:
# Use the trained model to make predictions on the test data
predictions = lr_model.transform(test_df)

# Show predictions
predictions.select("features", "prediction", "probability").show(20)

+--------------------+----------+--------------------+
|            features|prediction|         probability|
+--------------------+----------+--------------------+
|(15,[1,2,9,10,11,...|       0.0|[0.97126652084688...|
|(15,[1,2,9,10,11,...|       0.0|[0.97444119781175...|
|(15,[1,2,9,10,11,...|       0.0|[0.96545091302543...|
|(15,[1,2,9,10,11,...|       0.0|[0.97192974539298...|
|(15,[1,2,9,10,11,...|       0.0|[0.96419101941342...|
|(15,[1,2,9,10,11,...|       0.0|[0.97392092282790...|
|(15,[1,2,9,10,11,...|       0.0|[0.96310016663650...|
|(15,[1,2,9,10,11,...|       0.0|[0.96309095781961...|
|(15,[1,2,9,10,11,...|       0.0|[0.97327111342332...|
|(15,[1,2,9,10,11,...|       0.0|[0.97202575157040...|
|(15,[1,2,9,10,11,...|       0.0|[0.97310192617033...|
|(15,[1,2,9,10,11,...|       0.0|[0.97070979011296...|
|(15,[1,2,9,10,11,...|       0.0|[0.95119787638376...|
|(15,[1,2,9,10,11,...|       0.0|[0.97275068879996...|
|(15,[1,2,9,10,11,...|       0.0|[0.96234955565376...|
|(15,[1,2,

### Evaluation

In [54]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print(f"Accuracy: {accuracy:.2f} - Precision: {precision:.2f} - Recall: {recall:.2f} - F1-Score: {f1:.2f}")


Accuracy: 0.86 - Precision: 0.86 - Recall: 0.86 - F1-Score: 0.80


In [55]:
sc.stop()