# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Carrera: Ing. en Sistemas Computacionales** </center>
---
### <center> **Primavera 2025** </center>
---

**Lab 10**: Heart attack prediction with Logistic Regression

**Fecha**: 11 de mayo del 2025

**Nombre del Estudiante**: Marco Albanese, Vicente Siloe

**Profesor**: Pablo Camarillo Ramirez

In [None]:
import findspark
findspark.init()

#### Creacion de la conexión con el cluster de spark


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MLSpark-Logistic-Regression") \
    .master("spark://2da3617855ce:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext
spark.conf.set("spark.sql.shuffle.partitions", "5")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/07 07:47:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Preparación de datos

In [3]:
from equipo_mcqueen.spark_utils import SparkUtils

heart_attack_data = [
    ("male", "IntegerType"), 
    ("age", "IntegerType"), 
    ("education", "IntegerType"), 
    ("currentSmoker", "IntegerType"), 
    ("cigsPerDay", "IntegerType"), 
    ("BPMeds", "IntegerType"), 
    ("prevalentStroke", "IntegerType"), 
    ("prevalentHyp", "IntegerType"), 
    ("diabetes", "IntegerType"), 
    ("totChol", "IntegerType"), 
    ("sysBP", "FloatType"), 
    ("diaBP", "FloatType"), 
    ("BMI", "FloatType"), 
    ("heartRate", "IntegerType"), 
    ("glucose", "IntegerType"), 
    ("TenYearCHD", "IntegerType")
]

# Define schema for the DataFrame
schema = SparkUtils.generate_schema(heart_attack_data)

# Convert list to a DataFrame
df = spark.read.schema(schema).option("header", "true").csv("/home/jovyan/notebooks/data/framingham.csv")

# Drop rows with null values
df = df.na.drop()

#### Assemble the features into a single vector column

In [4]:
from pyspark.ml.feature import VectorAssembler

heart_attack_cols = [
    'male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 
    'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 
    'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose'
]

assembler = VectorAssembler(inputCols=heart_attack_cols, outputCol="features")
data_with_features = assembler.transform(df).select("TenYearCHD", "features")

#### Split the data into training and test sets 80% training data and 20% testing data

In [5]:
train_df, test_df = data_with_features.randomSplit([0.8, 0.2], seed=42)

#### Show the whole dataset

In [6]:
print("Original Dataset")
data_with_features.show()

# Print train dataset
print("train set")
train_df.show()

Original Dataset


25/05/07 07:48:06 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+----------+--------------------+
|TenYearCHD|            features|
+----------+--------------------+
|         0|[1.0,39.0,4.0,0.0...|
|         0|(15,[1,2,9,10,11,...|
|         0|[1.0,48.0,1.0,1.0...|
|         1|[0.0,61.0,3.0,1.0...|
|         0|[0.0,46.0,3.0,1.0...|
|         0|[0.0,43.0,2.0,0.0...|
|         1|(15,[1,2,9,10,11,...|
|         0|[0.0,45.0,2.0,1.0...|
|         0|[1.0,52.0,1.0,0.0...|
|         0|[1.0,43.0,1.0,1.0...|
|         0|(15,[1,2,9,10,11,...|
|         0|(15,[1,2,9,10,11,...|
|         0|[1.0,46.0,1.0,1.0...|
|         0|[0.0,41.0,3.0,0.0...|
|         1|[0.0,38.0,2.0,1.0...|
|         0|[1.0,48.0,3.0,1.0...|
|         1|[0.0,46.0,2.0,1.0...|
|         0|[0.0,38.0,2.0,1.0...|
|         0|[1.0,41.0,2.0,0.0...|
|         0|[0.0,42.0,2.0,1.0...|
+----------+--------------------+
only showing top 20 rows

train set
+----------+--------------------+
|TenYearCHD|            features|
+----------+--------------------+
|         0|(15,[1,2,9,10,11,...|
|         0|

#### Create a logistic regression model

In [9]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="TenYearCHD", maxIter=10, regParam=0.01)

#### Training

In [10]:
lr_model = lr.fit(train_df)

# Print coefficients
print("Coefficients: " + str(lr_model.coefficients))

# Display model summary
training_summary = lr_model.summary

25/05/07 07:49:07 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/05/07 07:49:07 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


Coefficients: [0.5082844047138703,0.059286285190248735,-0.06811641200944737,0.20379021742850734,0.012573716113124526,0.4988565787897998,0.867662655614685,0.22752355015488057,-0.05674527489077539,0.0015813256796855003,0.011987921346163563,0.0008730866066968816,0.0035229175459439246,-0.0012276670059417456,0.007146752809847647]


#### Predictions

In [11]:
# Use the trained model to make predictions on the test data
predictions = lr_model.transform(test_df)

# Show predictions
predictions.select("features", "prediction", "probability").show()

+--------------------+----------+--------------------+
|            features|prediction|         probability|
+--------------------+----------+--------------------+
|(15,[1,2,9,10,11,...|       0.0|[0.97777779601108...|
|(15,[1,2,9,10,11,...|       0.0|[0.97546664709744...|
|(15,[1,2,9,10,11,...|       0.0|[0.97702545674191...|
|(15,[1,2,9,10,11,...|       0.0|[0.97610701231714...|
|(15,[1,2,9,10,11,...|       0.0|[0.97624162471626...|
|(15,[1,2,9,10,11,...|       0.0|[0.97680162999968...|
|(15,[1,2,9,10,11,...|       0.0|[0.97099846501360...|
|(15,[1,2,9,10,11,...|       0.0|[0.96790675336005...|
|(15,[1,2,9,10,11,...|       0.0|[0.97452447483122...|
|(15,[1,2,9,10,11,...|       0.0|[0.97023384348738...|
|(15,[1,2,9,10,11,...|       0.0|[0.96037668903675...|
|(15,[1,2,9,10,11,...|       0.0|[0.97908644346531...|
|(15,[1,2,9,10,11,...|       0.0|[0.97328591682390...|
|(15,[1,2,9,10,11,...|       0.0|[0.95937065727014...|
|(15,[1,2,9,10,11,...|       0.0|[0.97817335360252...|
|(15,[1,2,

In [None]:
sc.stop()