# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Ejemplos de Spark: Structured Streaming (Kafka + Watermarking)** </center>

---

**Lab 10**

**Fecha**: 25 abril 2025

**Nombre del Equipo**: Arriba Linux

**Integrantes del Equipo**: Tirzah Peniche Barba / Ana Cristina Luna Arellano / Juan Pedro Bihouet

**Profesor**: Dr. Pablo Camarillo Ramirez

In [1]:
import findspark
findspark.init()

#### Creacion de la conexión con el cluster de spark


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Arriba-Linux-MLSpark-Logistic-Regression") \
    .master("spark://ac7f0d7e8e91:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext
spark.conf.set("spark.sql.shuffle.partitions", "5")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/27 00:18:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Preparación de Datos

In [7]:
from ArribaLinux.spark_utils import SparkUtils

# Define schema for the DataFrame
schema = SparkUtils.generate_schema([
    ("male", "integer"), 
    ("age", "integer"), 
    ("education", "integer"), 
    ("currentSmoker", "integer"), 
    ("cigsPerDay", "integer"), 
    ("BPMeds", "integer"), 
    ("prevalentStroke", "integer"), 
    ("prevalentHyp", "integer"), 
    ("diabetes", "integer"), 
    ("totChol", "integer"), 
    ("sysBP", "float"), 
    ("diaBP", "float"), 
    ("BMI", "float"), 
    ("heartRate", "integer"), 
    ("glucose", "integer"), 
    ("TenYearCHD", "integer")])

data = spark \
            .read \
            .schema(schema) \
            .option("header", "true") \
            .csv("/home/jovyan/notebooks/data/framingham.csv")

data.show(40)

+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+
|male|age|education|currentSmoker|cigsPerDay|BPMeds|prevalentStroke|prevalentHyp|diabetes|totChol|sysBP|diaBP|  BMI|heartRate|glucose|TenYearCHD|
+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+
|   1| 39|        4|            0|         0|     0|              0|           0|       0|    195|106.0| 70.0|26.97|       80|     77|         0|
|   0| 46|        2|            0|         0|     0|              0|           0|       0|    250|121.0| 81.0|28.73|       95|     76|         0|
|   1| 48|        1|            1|        20|     0|              0|           0|       0|    245|127.5| 80.0|25.34|       75|     70|         0|
|   0| 61|        3|            1|        30|     0|              0|           1|       0|    225|150.0| 95.0|28.58|       6

In [10]:
df = data.dropna()
df.show(40)

+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+
|male|age|education|currentSmoker|cigsPerDay|BPMeds|prevalentStroke|prevalentHyp|diabetes|totChol|sysBP|diaBP|  BMI|heartRate|glucose|TenYearCHD|
+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+
|   1| 39|        4|            0|         0|     0|              0|           0|       0|    195|106.0| 70.0|26.97|       80|     77|         0|
|   0| 46|        2|            0|         0|     0|              0|           0|       0|    250|121.0| 81.0|28.73|       95|     76|         0|
|   1| 48|        1|            1|        20|     0|              0|           0|       0|    245|127.5| 80.0|25.34|       75|     70|         0|
|   0| 61|        3|            1|        30|     0|              0|           1|       0|    225|150.0| 95.0|28.58|       6

In [11]:
df = df.withColumnRenamed("TenYearCHD", "label")
df.show(5)

+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+-----+
|male|age|education|currentSmoker|cigsPerDay|BPMeds|prevalentStroke|prevalentHyp|diabetes|totChol|sysBP|diaBP|  BMI|heartRate|glucose|label|
+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+-----+
|   1| 39|        4|            0|         0|     0|              0|           0|       0|    195|106.0| 70.0|26.97|       80|     77|    0|
|   0| 46|        2|            0|         0|     0|              0|           0|       0|    250|121.0| 81.0|28.73|       95|     76|    0|
|   1| 48|        1|            1|        20|     0|              0|           0|       0|    245|127.5| 80.0|25.34|       75|     70|    0|
|   0| 61|        3|            1|        30|     0|              0|           1|       0|    225|150.0| 95.0|28.58|       65|    103|    1|
|   0| 46|   

### Assemble the features into a single vector column

In [20]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=["male", "age", "education", "currentSmoker", "cigsPerDay", "BPMeds", "prevalentHyp", "diabetes", "totChol", "sysBP", "diaBP", "BMI", "heartRate", "glucose"], outputCol="features")
data_with_features = assembler.transform(df).select("label", "features")

### Split the data into training and test sets 80% training data and 20% testing data

In [22]:
train_df, test_df = data_with_features.randomSplit([0.8, 0.2], seed=57)

### Show the whole dataset

In [23]:
print("Original Dataset")
data_with_features.show()

# Print train dataset
print("train set")
train_df.show()

Original Dataset
+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[1.0,39.0,4.0,0.0...|
|    0|(14,[1,2,8,9,10,1...|
|    0|[1.0,48.0,1.0,1.0...|
|    1|[0.0,61.0,3.0,1.0...|
|    0|[0.0,46.0,3.0,1.0...|
|    0|[0.0,43.0,2.0,0.0...|
|    1|(14,[1,2,8,9,10,1...|
|    0|[0.0,45.0,2.0,1.0...|
|    0|[1.0,52.0,1.0,0.0...|
|    0|[1.0,43.0,1.0,1.0...|
|    0|(14,[1,2,8,9,10,1...|
|    0|(14,[1,2,8,9,10,1...|
|    0|[1.0,46.0,1.0,1.0...|
|    0|[0.0,41.0,3.0,0.0...|
|    1|[0.0,38.0,2.0,1.0...|
|    0|[1.0,48.0,3.0,1.0...|
|    1|[0.0,46.0,2.0,1.0...|
|    0|[0.0,38.0,2.0,1.0...|
|    0|[1.0,41.0,2.0,0.0...|
|    0|[0.0,42.0,2.0,1.0...|
+-----+--------------------+
only showing top 20 rows

train set
+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|(14,[1,2,8,9,10,1...|
|    0|(14,[1,2,8,9,10,1...|
|    0|(14,[1,2,8,9,10,1...|
|    0|(14,[1,2,8,9,10,1...|
|    0|(14,[1,2,8,9,10,1...|
|    0|(14,[1,2,8,9

### Create a logistic regression model

In [24]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=10, regParam=0.01)


# TRAINNING

In [25]:
lr_model = lr.fit(train_df)

# Print coefficients
print("Coefficients: " + str(lr_model.coefficients))

# Display model summary
training_summary = lr_model.summary

25/04/27 00:41:22 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/04/27 00:41:22 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


Coefficients: [0.56085619567335,0.0580024803047417,-0.037925907429923016,0.08648896022474624,0.013624766035835296,0.19337085407778754,0.2285286223807938,0.1940681366366223,0.0018397996505330062,0.013786143565660425,-1.1969227679968942e-05,0.0068020602545334884,-0.0019378486256763628,0.006861897807201568]


# PREDICTIONS

In [26]:
# Use the trained model to make predictions on the test data
predictions = lr_model.transform(test_df)

# Show predictions
predictions.select("features", "prediction", "probability").show()

+--------------------+----------+--------------------+
|            features|prediction|         probability|
+--------------------+----------+--------------------+
|(14,[1,2,8,9,10,1...|       0.0|[0.97672608102472...|
|(14,[1,2,8,9,10,1...|       0.0|[0.97127909326952...|
|(14,[1,2,8,9,10,1...|       0.0|[0.96801865150182...|
|(14,[1,2,8,9,10,1...|       0.0|[0.97989170367762...|
|(14,[1,2,8,9,10,1...|       0.0|[0.95371875673041...|
|(14,[1,2,8,9,10,1...|       0.0|[0.96800687277777...|
|(14,[1,2,8,9,10,1...|       0.0|[0.97371525468766...|
|(14,[1,2,8,9,10,1...|       0.0|[0.96970892143587...|
|(14,[1,2,8,9,10,1...|       0.0|[0.95466272893662...|
|(14,[1,2,8,9,10,1...|       0.0|[0.96395707683715...|
|(14,[1,2,8,9,10,1...|       0.0|[0.97570145626707...|
|(14,[1,2,8,9,10,1...|       0.0|[0.95019395775623...|
|(14,[1,2,8,9,10,1...|       0.0|[0.95941523860472...|
|(14,[1,2,8,9,10,1...|       0.0|[0.96490540659223...|
|(14,[1,2,8,9,10,1...|       0.0|[0.97346776413514...|
|(14,[1,2,

In [27]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})

precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})

recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})

f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print('f1: {}'.format(f1))

f1: 0.7697118793492835


In [None]:
sc.stop()