In [1]:
import sys
sys.executable

import findspark
findspark.init()

import pandas as pd
import numpy as np
#import pyspark.pandas as ps

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

import matplotlib as plt
import seaborn as sns

In [2]:
spark = SparkSession.builder\
        .master("local[*]")\
        .appName('HeartDiseasePrediction')\
        .getOrCreate()

In [3]:
# ## 2. Carga de Datos
# Cargamos el archivo CSV con los datos del vino y revisamos las primeras filas.
# Cargar el archivo CSV
gt = spark.read.csv('data/heart_disease_uci.csv', 
                       inferSchema = True,
                       header = True)

In [4]:
type(gt)

pyspark.sql.dataframe.DataFrame

In [5]:
df = gt.toPandas()

In [6]:
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [7]:
spark_df = spark.createDataFrame(df)

### <font color = #ff4fa4> <i> <b>  FILLING nulls WITH MEAN? 
> * This technique is commonly used in data preprocessing when the goal is to fill missing values with a reasonable estimate (like the mean) , which ensures that the dataset remains intact and avoids losing data points that might be useful for analysis or modeling. </b>:

In [8]:
# 3. Preprocesamiento: Imputar valores nulos
# Calcular las medias para las columnas con valores nulos y rellenarlos
mean_trestbps = spark_df.select("trestbps").agg({"trestbps": "mean"}).first()[0]
mean_chol = spark_df.select("chol").agg({"chol": "mean"}).first()[0]
mean_thalch = spark_df.select("thalch").agg({"thalch": "mean"}).first()[0]
mean_oldpeak = spark_df.select("oldpeak").agg({"oldpeak": "mean"}).first()[0]

In [9]:
# Aplicar imputación
spark_df = spark_df.na.fill({
    "trestbps": mean_trestbps,
    "chol": mean_chol,
    "thalch": mean_thalch,
    "oldpeak": mean_oldpeak
})

In [10]:
# 4. Convertir columnas booleanas a tipo string
# Aquí hacemos la conversión de "exang" y cualquier otra columna booleana
spark_df = spark_df.withColumn("exang", col("exang").cast("string"))
spark_df.show(5)

+---+---+------+---------+---------------+--------+-----+-----+--------------+------+-----+-------+-----------+---+-----------------+---+
| id|age|   sex|  dataset|             cp|trestbps| chol|  fbs|       restecg|thalch|exang|oldpeak|      slope| ca|             thal|num|
+---+---+------+---------+---------------+--------+-----+-----+--------------+------+-----+-------+-----------+---+-----------------+---+
|  1| 63|  Male|Cleveland| typical angina|   145.0|233.0| true|lv hypertrophy| 150.0|false|    2.3|downsloping|0.0|     fixed defect|  0|
|  2| 67|  Male|Cleveland|   asymptomatic|   160.0|286.0|false|lv hypertrophy| 108.0| true|    1.5|       flat|3.0|           normal|  2|
|  3| 67|  Male|Cleveland|   asymptomatic|   120.0|229.0|false|lv hypertrophy| 129.0| true|    2.6|       flat|2.0|reversable defect|  1|
|  4| 37|  Male|Cleveland|    non-anginal|   130.0|250.0|false|        normal| 187.0|false|    3.5|downsloping|0.0|           normal|  0|
|  5| 41|Female|Cleveland|atypical

### <font color = #ff4fa4> <i> <b>  It is used to convert categorical variables (string or category values) into numerical indices. Each unique category in the column is assigned a unique integer value. <br> Además colocamos 'handleInvalid' para los null values. </b>

| sex | sex_index |
|-----|-----------|
| male|    0.0    |
| male|    0.0    |
|female|   1.0    |
|female|   1.0    |
| male|    0.0    |

In [11]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler

# 5. Indexar columnas categóricas
categorical_cols = ["sex", "cp", "restecg", "exang", "slope", "thal"]
indexers = [StringIndexer(inputCol=col, outputCol=col + "_index",  handleInvalid="skip").fit(spark_df) for col in categorical_cols]

In [12]:
# Aplicar los indexers al DataFrame
for indexer in indexers:
    spark_df = indexer.transform(spark_df)

In [13]:
pandas_to_demonstrate = spark_df.toPandas()
pandas_to_demonstrate

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,...,slope,ca,thal,num,sex_index,cp_index,restecg_index,exang_index,slope_index,thal_index
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,...,downsloping,0.0,fixed defect,0,0.0,3.0,1.0,0.0,2.0,2.0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,...,flat,3.0,normal,2,0.0,0.0,1.0,1.0,0.0,0.0
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,...,flat,2.0,reversable defect,1,0.0,0.0,1.0,1.0,0.0,1.0
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,...,downsloping,0.0,normal,0,0.0,1.0,0.0,0.0,2.0,0.0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,...,upsloping,0.0,normal,0,1.0,2.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,891,53,Male,VA Long Beach,asymptomatic,124.0,243.0,False,normal,122.0,...,flat,,reversable defect,1,0.0,0.0,0.0,1.0,0.0,1.0
381,892,37,Male,VA Long Beach,non-anginal,118.0,240.0,False,lv hypertrophy,165.0,...,flat,,normal,0,0.0,1.0,1.0,0.0,0.0,0.0
382,893,67,Male,VA Long Beach,asymptomatic,140.0,219.0,False,st-t abnormality,122.0,...,flat,,reversable defect,3,0.0,0.0,2.0,1.0,0.0,1.0
383,903,55,Male,VA Long Beach,asymptomatic,120.0,226.0,False,lv hypertrophy,127.0,...,downsloping,,reversable defect,1,0.0,0.0,1.0,1.0,2.0,1.0


### <font color = #ff4fa4> <i> <b> handleInvalid  = skip if there are null or NaN in DB</b>:

In [14]:
# 6. Continuar con el ensamblado y procesamiento de características como estaba planeado
# Ensamblar características
feature_cols = ["age", "trestbps", "chol", "thalch", "oldpeak"] + [col + "_index" for col in categorical_cols]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="skip" )
spark_df = assembler.transform(spark_df)

In [15]:
# Verificar el tipo de la columna "features"
print("Tipo de 'features' después de ensamblar:", spark_df.schema["features"].dataType)

Tipo de 'features' después de ensamblar: VectorUDT()


In [16]:
# Verificar si hay valores nulos en las columnas de características
spark_df.select([col for col in feature_cols if spark_df.filter(spark_df[col].isNull()).count() > 0]).show()

++
||
++
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
++
only showing top 20 rows



### <font color = #ff4fa4> <i> <b> If any column in feature_cols contains a null value, the entire row will be dropped. </b>:

In [17]:
# Eliminar filas con valores nulos en las columnas de características
spark_df = spark_df.na.drop(subset=feature_cols)

In [18]:
# Mostrar algunos valores de la columna "features" para ver su contenido
spark_df.select("features").show(5, truncate=False)

+----------------------------------------------------+
|features                                            |
+----------------------------------------------------+
|[63.0,145.0,233.0,150.0,2.3,0.0,3.0,1.0,0.0,2.0,2.0]|
|[67.0,160.0,286.0,108.0,1.5,0.0,0.0,1.0,1.0,0.0,0.0]|
|[67.0,120.0,229.0,129.0,2.6,0.0,0.0,1.0,1.0,0.0,1.0]|
|[37.0,130.0,250.0,187.0,3.5,0.0,1.0,0.0,0.0,2.0,0.0]|
|[41.0,130.0,204.0,172.0,1.4,1.0,2.0,1.0,0.0,1.0,0.0]|
+----------------------------------------------------+
only showing top 5 rows



In [19]:
# Mostrar estadísticas descriptivas para las columnas de características
# Select feature columns and calculate summary statistics
summary_df = spark_df.select(feature_cols).describe()

# Convert the result to a Pandas DataFrame
summary_pandas = summary_df.toPandas()

# Display the Pandas DataFrame
print(summary_pandas)

  summary                 age            trestbps                chol  \
0   count                 384                 384                 384   
1    mean  54.669270833333336          132.234375  209.38802083333334   
2  stddev   8.992163312212886  18.055824823369857  103.31127626694759   
3     min                  29                94.0                 0.0   
4     max                  77               200.0               564.0   

               thalch             oldpeak            sex_index  \
0                 384                 384                  384   
1  142.75520833333334  1.0080729166666667   0.2682291666666667   
2   26.37156618080443  1.1243390561601676  0.44361556238353483   
3                60.0                -2.0                  0.0   
4               202.0                 6.2                  1.0   

             cp_index       restecg_index         exang_index  \
0                 384                 384                 384   
1  0.7135416666666666  0.520833333

### <font color = #ff4fa4> <i> <b> .na.drop() addded since there was an error when running as it was</b>:

In [20]:
# Escalar las características
# Drop rows with NaN in the features column 
spark_df = spark_df.na.drop(subset=["features"])
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
spark_df = scaler.fit(spark_df).transform(spark_df)