### Pontificia Universidad Javeriana

**Autor**: Cesar Beltran

**fecha**: 5-nov-2024

**Cuaderno**: Primer Laboratorio Spark

**Tema**: Limpieza de Datos y prediccion usando **PySpark**
    

Se instalab librerias necesarias para:
-spark
-numpy
-seaborn
-matplotlib
-scikt-learn
-squarify


In [1]:
!pip install numpy
!pip install pyspark
!pip install seaborn
!pip install findspark
!pip install matplotlib



In [2]:
### se invocan bibliotecas
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pyspark

from pyspark.context import SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import input_file_name, mean, col, split, regexp_extract, when, lit, isnan, count
from pyspark import SparkFiles

from pyspark.conf import SparkConf
import os 


### Se configura Entorno PySpark

In [3]:
SPARK_MASTER_URL = os.getenv("SPARK_MASTER_URL","spark://dos01:7077")
configura=SparkConf()
configura.setMaster(SPARK_MASTER_URL)
configura.set('spark.local.dir','/almacen/TrabajosSpark/')
configura.setAppName("Proyecto00_Spark_Stroke_Beltran")

configura.set('spark.cores.max',6)
configura.set('spark.executor.cores',6)

spark = SparkSession.builder.config(conf=configura).getOrCreate()
SQLContext(sparkContext=spark.sparkContext, sparkSession=spark)

print("Sesion creada: HPC")
spark

/almacen/Spark/conf/spark-env.sh: línea 49: SPARK_MASTER_HOST: orden no encontrada
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/12 19:51:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/12 19:51:55 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
24/11/12 19:51:55 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/11/12 19:51:55 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/11/12 19:51:55 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
24/11/12 19:51:55 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
24/11/12 19:51:55 WARN Utils: Service 'SparkUI' cou

Sesion creada: HPC


In [8]:
###Dataset: https://github.com/corredor-john/ExploratoryDataAnalisys/blob/main/Varios/stroke_pyspark.csv


url ="stroke_pyspark.csv"
df00Stroke00 = spark.read.csv(url, header=True, inferSchema=True)

df00Stroke00.show(2)




                                                                                

+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
only showing top 2 rows



In [6]:
df00Stroke00.columns

['id',
 'gender',
 'age',
 'hypertension',
 'heart_disease',
 'ever_married',
 'work_type',
 'Residence_type',
 'avg_glucose_level',
 'bmi',
 'smoking_status',
 'stroke']

In [11]:
### Tipo de datos
df00Stroke00.printSchema()

root
 |-- id: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- ever_married: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- Residence_type: string (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: string (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: integer (nullable = true)



In [17]:
### Se requiere cambiar los nombres de las Columnas
nuevosNombres = ['ID','Genero', 'Edad', 'Hipertension', 'Enfermedad', 'Casado', 'Trabajo', 'Tipo_Residencia', 
                 'Nivel_Prom_GlucosA', 'IMC', 'Fumador', 'Paro_Cardiaco']
df00Stroke01= df00Stroke00
###Bucle para cambio de nombre

for antes, nuevo in zip(df00Stroke01.columns, nuevosNombres):
        df00Stroke01= df00Stroke01.withColumnRenamed(antes, nuevo)


df00Stroke01.columns

['ID',
 'Genero',
 'Edad',
 'Hipertension',
 'Enfermedad',
 'Casado',
 'Trabajo',
 'Tipo_Residencia',
 'Nivel_Prom_GlucosA',
 'IMC',
 'Fumador',
 'Paro_Cardiaco']

In [24]:
### Se requiere cambiar los nombres de las Columnas

df00Stroke01=df00Stroke01.withColumn('IMC', df00Stroke01.IMC.cast(DoubleType()))
df00Stroke01.printSchema()


root
 |-- ID: integer (nullable = true)
 |-- Genero: string (nullable = true)
 |-- Edad: double (nullable = true)
 |-- Hipertension: integer (nullable = true)
 |-- Enfermedad: integer (nullable = true)
 |-- Casado: string (nullable = true)
 |-- Trabajo: string (nullable = true)
 |-- Tipo_Residencia: string (nullable = true)
 |-- Nivel_Prom_GlucosA: double (nullable = true)
 |-- IMC: double (nullable = true)
 |-- Fumador: string (nullable = true)
 |-- Paro_Cardiaco: integer (nullable = true)



In [27]:
### pend foto

df00Stroke01.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df00Stroke01.columns]).show()

[Stage 8:>                                                          (0 + 1) / 1]

+---+------+----+------------+----------+------+-------+---------------+------------------+---+-------+-------------+
| ID|Genero|Edad|Hipertension|Enfermedad|Casado|Trabajo|Tipo_Residencia|Nivel_Prom_GlucosA|IMC|Fumador|Paro_Cardiaco|
+---+------+----+------------+----------+------+-------+---------------+------------------+---+-------+-------------+
|  0|     0|   0|           0|         0|     0|      0|              0|                 0|201|      0|            0|
+---+------+----+------------+----------+------+-------+---------------+------------------+---+-------+-------------+



                                                                                

In [26]:
df00Stroke01.count()

5110

In [28]:
nulos= (201/df00Stroke01.count())*100
print(f"Se tiene una porción de {round(nulos,2)}%")

Se tiene una porción de 3.93%


In [42]:
## Se observa las categorias Género
df00Stroke01.groupby(["Genero"]).count().show()




+------+-----+
|Genero|count|
+------+-----+
|Female| 2994|
| Other|    1|
|  Male| 2115|
+------+-----+



In [39]:

##Se elimina el Other en los Datos
df00Stroke02= df00Stroke01.where("Genero <> 'Other'")
df00Stroke02.groupby(['Genero']).count().show()


+------+-----+
|Genero|count|
+------+-----+
|Female| 2994|
|  Male| 2115|
+------+-----+



In [46]:
###Se estrarifica por edades y por genero, de manera que cada 10 años se saca el promedio  IMC y por genero.
AvIMC_0_10 = df00Stroke02.where((col("Genero") == lit("Female")) & (col("Edad")<10)).select(mean(col("IMC"))).collect()[0][0]
print(AvIMC_0_10)

[Row(avg(IMC)=18.687962962962963)]


In [54]:
### SE hace la funcion para el cambio   PENDIENTE!!!
def cambioPromedio(df, col01, catGenero, col02, minEdad, maxEdad, col03):
    prom = df.where((col(col01) == lit(cat(catGenero)) & (col(col02) > minEdad) & (col(col02) < maxEdad)).select(mean(col(col03))).collect()[0][0]
    
    print(prom)
    dfResul = df.withColumn(col03, when((df([col01] == catGenero) &df[col03].isNull()) & (df[col02]<maxEdad)), prom).otherwise(df[col03]))
    return dfResul


    

SyntaxError: invalid syntax. Perhaps you forgot a comma? (3849571717.py, line 3)

In [49]:
df00Stroke03=df00Stroke02
for promedio in range(0,100,10):
    df00Stroke03=cambioPromedio(df00Stroke2, "Genero", "Female", "Edad", promedio, promedio+10, "IMC")
        
    print(promedio, promedio+10)



NameError: name 'cambioPromedio' is not defined