In [27]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, floor, rand, col, round

# Build the notebook's Spark session; getOrCreate() hands back an existing
# session when one is already available instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Notebook Spark Session")
    .getOrCreate()
)

# Set the Spark log level to WARN (INFO, ERROR, etc. are also accepted).
spark.sparkContext.setLogLevel("WARN")
    


In [8]:

# Load the appointment CSV. The first row is treated as the header, and
# inferSchema asks Spark to derive each column's type from the data.
appointments_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('sample_data/appointment_data.csv')
)
appointments_df.show()

+----------+----------------+-----------------+
|patient_id|appointment_date|           doctor|
+----------+----------------+-----------------+
|       236|      2024-05-17|     Morgan Baker|
|       225|      2024-08-03|   Vincent Wright|
|       831|      2024-07-19|      Joshua Ford|
|       116|      2024-03-02|    Michelle Hill|
|       433|      2024-01-17|       Kari Morse|
|       287|      2024-08-19|      Oscar Baker|
|       729|      2024-07-15|   George Estrada|
|       554|      2024-05-09|      James Baker|
|        90|      2024-05-07|    Vickie Obrien|
|       277|      2024-02-14|   Jennifer Silva|
|       261|      2024-08-23|     Andrew Brown|
|       510|      2024-03-06|      Trevor Hall|
|       898|      2024-05-23|   Dorothy Obrien|
|       534|      2024-06-17|   Nicole Terrell|
|       562|      2024-03-18|  Rebecca Ramirez|
|       343|      2024-05-02|     Denise Walsh|
|       566|      2024-03-19|       Nancy Ford|
|       447|      2024-07-08|       Josh

In [9]:
# Arrange the appointments by ascending patient_id. patient_id is the only
# sort key, so rows sharing an id are not ordered by any other column.
sorted_df = appointments_df.sort('patient_id')
sorted_df.show()

+----------+----------------+------------------+
|patient_id|appointment_date|            doctor|
+----------+----------------+------------------+
|         3|      2024-05-12|  Kristina Collins|
|         4|      2024-08-17|     Felicia Moore|
|         5|      2024-03-20|  Eugene Wilson MD|
|         5|      2024-05-08|      Jason Taylor|
|         6|      2024-08-23|    Jessica Martin|
|         8|      2024-01-18|       Erin Castro|
|        10|      2024-09-08|        Mary Ortiz|
|        11|      2024-04-17|Sherri Fitzpatrick|
|        12|      2024-01-03|  Richard Martinez|
|        12|      2024-07-04|         Joy Price|
|        16|      2024-04-26|        Stacy Leon|
|        17|      2024-08-24|  Cheyenne Watkins|
|        17|      2024-04-06|        Ryan Davis|
|        17|      2024-07-02|  Joshua Rodriguez|
|        18|      2024-03-29|  Kristine Robbins|
|        18|      2024-06-26|    Matthew Lawson|
|        19|      2024-01-08|      Julia Lawson|
|        20|      20

In [14]:
# Attach a constant 'hours' column: every row receives the literal value 20.
# The result is bound back to sorted_df because later cells read that name.
hours_literal = lit(20)
sorted_df = sorted_df.withColumn('hours', hours_literal)
sorted_df.show(n=10)

+----------+----------------+------------------+-----+
|patient_id|appointment_date|            doctor|hours|
+----------+----------------+------------------+-----+
|         3|      2024-05-12|  Kristina Collins|   20|
|         4|      2024-08-17|     Felicia Moore|   20|
|         5|      2024-03-20|  Eugene Wilson MD|   20|
|         5|      2024-05-08|      Jason Taylor|   20|
|         6|      2024-08-23|    Jessica Martin|   20|
|         8|      2024-01-18|       Erin Castro|   20|
|        10|      2024-09-08|        Mary Ortiz|   20|
|        11|      2024-04-17|Sherri Fitzpatrick|   20|
|        12|      2024-01-03|  Richard Martinez|   20|
|        12|      2024-07-04|         Joy Price|   20|
+----------+----------------+------------------+-----+
only showing top 10 rows



In [20]:
# Synthetic weight in pounds: a uniformly distributed integer in [120, 250].
# rand() produces values in [0, 1); multiplying by the number of integers in
# the range (250 - 120 + 1) and flooring makes both endpoints reachable.
#
# A fixed seed is passed so that re-running the notebook regenerates the same
# values instead of a fresh random column every time.
# NOTE(review): Spark documents rand() as non-deterministic in the general
# case; repeatability presumably also relies on the input being partitioned
# the same way on each run - confirm if exact reproducibility matters.
# Keep this seed different from the seed of any other random column in the
# notebook, otherwise the columns would draw from the same random stream.
df_with_weight = sorted_df.withColumn(
    'weight_pounds',
    floor(rand(seed=42) * (250 - 120 + 1) + 120),
)
df_with_weight.show(20)

+----------+----------------+------------------+-----+-------------+
|patient_id|appointment_date|            doctor|hours|weight_pounds|
+----------+----------------+------------------+-----+-------------+
|         3|      2024-05-12|  Kristina Collins|   20|          201|
|         4|      2024-08-17|     Felicia Moore|   20|          200|
|         5|      2024-03-20|  Eugene Wilson MD|   20|          165|
|         5|      2024-05-08|      Jason Taylor|   20|          123|
|         6|      2024-08-23|    Jessica Martin|   20|          246|
|         8|      2024-01-18|       Erin Castro|   20|          163|
|        10|      2024-09-08|        Mary Ortiz|   20|          211|
|        11|      2024-04-17|Sherri Fitzpatrick|   20|          229|
|        12|      2024-01-03|  Richard Martinez|   20|          149|
|        12|      2024-07-04|         Joy Price|   20|          140|
|        16|      2024-04-26|        Stacy Leon|   20|          177|
|        17|      2024-08-24|  Che

In [23]:
# Synthetic height in centimetres: a uniformly distributed integer in
# [170, 200]. rand() produces values in [0, 1); multiplying by the number of
# integers in the range (200 - 170 + 1) and flooring makes both endpoints
# reachable.
#
# A fixed seed is passed so that re-running the notebook regenerates the same
# values instead of a fresh random column every time.
# NOTE(review): Spark documents rand() as non-deterministic in the general
# case; repeatability presumably also relies on the input being partitioned
# the same way on each run - confirm if exact reproducibility matters.
# Keep this seed different from the seed of any other random column in the
# notebook, otherwise the columns would draw from the same random stream.
df_with_height = df_with_weight.withColumn(
    'height_cm',
    floor(rand(seed=7) * (200 - 170 + 1) + 170),
)
df_with_height.show(20)

+----------+----------------+------------------+-----+-------------+---------+
|patient_id|appointment_date|            doctor|hours|weight_pounds|height_cm|
+----------+----------------+------------------+-----+-------------+---------+
|         3|      2024-05-12|  Kristina Collins|   20|          201|      178|
|         4|      2024-08-17|     Felicia Moore|   20|          200|      184|
|         5|      2024-03-20|  Eugene Wilson MD|   20|          165|      178|
|         5|      2024-05-08|      Jason Taylor|   20|          123|      183|
|         6|      2024-08-23|    Jessica Martin|   20|          246|      173|
|         8|      2024-01-18|       Erin Castro|   20|          163|      171|
|        10|      2024-09-08|        Mary Ortiz|   20|          211|      183|
|        11|      2024-04-17|Sherri Fitzpatrick|   20|          229|      175|
|        12|      2024-01-03|  Richard Martinez|   20|          149|      172|
|        12|      2024-07-04|         Joy Price|   2

In [28]:
# Body-mass index = weight in kilograms / (height in metres) squared,
# rounded to two decimal places.
# `round` here is pyspark.sql.functions.round (imported at the top of the
# notebook), which operates on Column expressions, not Python's built-in.
weight_kg = col("weight_pounds") * 0.45359237  # pounds -> kilograms
height_m = col("height_cm") / 100  # centimetres -> metres
bmi_expr = round(weight_kg / (height_m ** 2), 2)

df_with_bmi = df_with_height.withColumn("bmi", bmi_expr)
df_with_bmi.show(20)

+----------+----------------+------------------+-----+-------------+---------+-----+
|patient_id|appointment_date|            doctor|hours|weight_pounds|height_cm|  bmi|
+----------+----------------+------------------+-----+-------------+---------+-----+
|         3|      2024-05-12|  Kristina Collins|   20|          201|      178|28.78|
|         4|      2024-08-17|     Felicia Moore|   20|          200|      184| 26.8|
|         5|      2024-03-20|  Eugene Wilson MD|   20|          165|      178|23.62|
|         5|      2024-05-08|      Jason Taylor|   20|          123|      183|16.66|
|         6|      2024-08-23|    Jessica Martin|   20|          246|      173|37.28|
|         8|      2024-01-18|       Erin Castro|   20|          163|      171|25.28|
|        10|      2024-09-08|        Mary Ortiz|   20|          211|      183|28.58|
|        11|      2024-04-17|Sherri Fitzpatrick|   20|          229|      175|33.92|
|        12|      2024-01-03|  Richard Martinez|   20|          1

In [31]:
# Persist the enriched appointments as CSV with a header row, replacing any
# output already present at the target path. coalesce(1) first reduces the
# DataFrame to a single partition before the write.
(
    df_with_bmi
    .coalesce(1)
    .write
    .mode('overwrite')
    .option('header', True)
    .csv('sample_data/transformed_appointment_data')
)
