In [2]:
// 1. Case class declaration

/**
 * One record of the ACS patient data set; each field corresponds to one CSV column.
 *
 * NOTE(review): all numeric fields are non-optional primitives, while the data
 * contains missing values in several numeric columns (height, weight, BMI, TC,
 * LDLC, HDLC, TG). Typed Dataset operations that build AcsPatient instances will
 * presumably fail on such rows; consider Option[Double] for them — TODO confirm.
 *
 * @param rownames         row identifier column of the source file
 * @param age              patient age (unit not stated in the source; presumably years)
 * @param sex              patient sex, e.g. Male / Female
 * @param cardiogenicShock cardiogenic shock flag, Yes / No
 * @param entry            access site: Radial / Femoral
 * @param Dx               diagnosis: STEMI / NSTEMI / UA
 * @param EF               ejection fraction
 * @param height           height in cm
 * @param weight           weight in kg
 * @param BMI              body-mass index
 * @param obesity          obesity flag, Yes / No
 * @param TC               total cholesterol
 * @param LDLC             presumably LDL cholesterol — TODO confirm
 * @param HDLC             presumably HDL cholesterol — TODO confirm
 * @param TG               triglycerides
 * @param DM               diabetes mellitus
 * @param HBP              high blood pressure
 * @param smoking          smoking status: Smoker / Ex-smoker / Never
 */
case class AcsPatient(
  rownames: Int,
  age: Int,
  sex: String,
  cardiogenicShock: String,
  entry: String,
  Dx: String,
  EF: Double,
  height: Double,
  weight: Double,
  BMI: Double,
  obesity: String,
  TC: Double,
  LDLC: Double,
  HDLC: Double,
  TG: Double,
  DM: String,
  HBP: String,
  smoking: String
)

defined class AcsPatient


In [3]:
// 2. Loading in Databricks (or Codespaces + spark-standalone)
// Location of the patient CSV file passed to the Spark reader.
val rawPath: String = "acs_patients_df.csv"

rawPath = acs_patients_df.csv


acs_patients_df.csv

In [4]:
// Read the CSV directly as a DataFrame, then map it onto Dataset[AcsPatient].
import org.apache.spark.sql.Encoders

// Column names and types are derived from the AcsPatient case class.
val schema = Encoders.product[AcsPatient].schema

val patientsDS = {
  val csvOptions = Map(
    "header"    -> "true", // first line of the file holds the column names
    "nullValue" -> "NA"    // the marker NA is read as a missing value (null)
  )

  spark.read
    .schema(schema) // impose the types up front to avoid wrong type inference
    .options(csvOptions)
    .csv(rawPath)
    .as[AcsPatient]
}

schema = StructType(StructField(rownames,IntegerType,false),StructField(age,IntegerType,false),StructField(sex,StringType,true),StructField(cardiogenicShock,StringType,true),StructField(entry,StringType,true),StructField(Dx,StringType,true),StructField(EF,DoubleType,false),StructField(height,DoubleType,false),StructField(weight,DoubleType,false),StructField(BMI,DoubleType,false),StructField(obesity,StringType,true),StructField(TC,DoubleType,false),StructField(LDLC,DoubleType,false),StructField(HDLC,DoubleType,false),StructField(TG,DoubleType,false),StructField(DM,StringType,true),StructField(HBP,StringType,true),StructField(smoking,StringType,true))
patientsDS = [rowname...


[rowname...

In [5]:
// 3. First look at the data (counterpart of sc.textFile(...).take(10)):
//    print the first 10 rows without truncating the cell contents.
patientsDS.show(10, truncate = false)

// 4. Example: STEMI rate per sex, expressed as a percentage.
import org.apache.spark.sql.functions._

// Columns are referenced with col(...) instead of symbol literals ('sex, 'Dx):
// symbol literals are deprecated since Scala 2.13 and no longer exist in Scala 3.
//
// Each row contributes 1 when its diagnosis is STEMI and 0 otherwise; the mean of
// that flag per group, multiplied by 100 and rounded to one decimal, is the rate.
val stemiRateBySex = patientsDS.groupBy(col("sex"))
  .agg(round(avg(when(col("Dx") === "STEMI", 1).otherwise(0)) * 100, 1).alias("STEMI_%"))

// Display the result ordered by sex.
stemiRateBySex.orderBy(col("sex")).show()

+--------+---+------+----------------+-------+---------------+----+------+------+-----------+-------+-----+-----+----+-----+---+---+---------+
|rownames|age|sex   |cardiogenicShock|entry  |Dx             |EF  |height|weight|BMI        |obesity|TC   |LDLC |HDLC|TG   |DM |HBP|smoking  |
+--------+---+------+----------------+-------+---------------+----+------+------+-----------+-------+-----+-----+----+-----+---+---+---------+
|1       |62 |Male  |No              |Femoral|STEMI          |18.0|168.0 |72.0  |25.51020408|Yes    |215.0|154.0|35.0|155.0|Yes|No |Smoker   |
|2       |78 |Female|No              |Femoral|STEMI          |18.4|148.0 |48.0  |21.9138057 |No     |NULL |NULL |NULL|166.0|No |Yes|Never    |
|3       |76 |Female|Yes             |Femoral|STEMI          |20.0|NULL  |NULL  |NULL       |No     |NULL |NULL |NULL|NULL |No |Yes|Never    |
|4       |89 |Female|No              |Femoral|STEMI          |21.8|165.0 |50.0  |18.36547291|No     |121.0|73.0 |20.0|89.0 |No |No |Never    |

stemiRateBySex = [sex: string, STEMI_%: double]


[sex: string, STEMI_%: double]