In [170]:
import os
import numpy as np
import ast
import re

import pyspark.sql.functions as f
# f.lit
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, StructField, StructType, FloatType, DoubleType

In [171]:
# Location of the CDI 2023 release CSV, relative to the working directory.
DATA_DIR = "./data/chronic-disease-data"
path = os.path.join(DATA_DIR, "U.S._Chronic_Disease_Indicators__CDI___2023_Release.csv")

# Create (or reuse) the Spark session, requesting 16g of executor memory.
spark = (
    SparkSession.builder
    .appName('test')
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

# Read the CSV using its header row for column names and let Spark infer
# the column types from the data.
test_spark_df_00_10 = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(path)
)

test_spark_df_00_10.show()

+---------+-------+------------+--------------------+----------+------+--------------------+--------+-------------+-------------+---------+------------+-----------------------+-----------------+------------------+-------------------+-----------------------+-------------------+-----------------------+---------------+-----------------------+---------------+--------------------+----------+----------+-------+----------+---------------+-------------------------+-----------------+-------------------------+-----------------+-------------------------+-----------------+
|YearStart|YearEnd|LocationAbbr|        LocationDesc|DataSource| Topic|            Question|Response|DataValueUnit|DataValueType|DataValue|DataValueAlt|DataValueFootnoteSymbol|DatavalueFootnote|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|    Stratification1|StratificationCategory2|Stratification2|StratificationCategory3|Stratification3|         GeoLocation|ResponseID|LocationID|TopicID|QuestionID|DataValueType

In [172]:
# Columns removed before analysis: response and footnote fields, the second
# and third stratification levels, and every stratification ID code.
cols_to_drop = [
    # response and footnote fields
    "Response",
    "ResponseID",
    "DataValueFootnoteSymbol",
    "DatavalueFootnote",
    # second and third stratification levels
    "StratificationCategory2",
    "Stratification2",
    "StratificationCategory3",
    "Stratification3",
    # stratification identifier codes
    "StratificationCategoryID1",
    "StratificationID1",
    "StratificationCategoryID2",
    "StratificationID2",
    "StratificationCategoryID3",
    "StratificationID3",
]

test_spark_df_00_10 = test_spark_df_00_10.drop(*cols_to_drop)
test_spark_df_00_10.show()

+---------+-------+------------+--------------------+----------+------+--------------------+-------------+-------------+---------+------------+------------------+-------------------+-----------------------+-------------------+--------------------+----------+-------+----------+---------------+
|YearStart|YearEnd|LocationAbbr|        LocationDesc|DataSource| Topic|            Question|DataValueUnit|DataValueType|DataValue|DataValueAlt|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|    Stratification1|         GeoLocation|LocationID|TopicID|QuestionID|DataValueTypeID|
+---------+-------+------------+--------------------+----------+------+--------------------+-------------+-------------+---------+------------+------------------+-------------------+-----------------------+-------------------+--------------------+----------+-------+----------+---------------+
|     2014|   2014|          AR|            Arkansas| SEDD; SID|Asthma|Hospitalizations ...|         NULL|       Numbe

In [173]:
# List the (column name, Spark type) pairs of the columns that remain.
test_spark_df_00_10.dtypes

[('YearStart', 'int'),
 ('YearEnd', 'int'),
 ('LocationAbbr', 'string'),
 ('LocationDesc', 'string'),
 ('DataSource', 'string'),
 ('Topic', 'string'),
 ('Question', 'string'),
 ('DataValueUnit', 'string'),
 ('DataValueType', 'string'),
 ('DataValue', 'string'),
 ('DataValueAlt', 'double'),
 ('LowConfidenceLimit', 'double'),
 ('HighConfidenceLimit', 'double'),
 ('StratificationCategory1', 'string'),
 ('Stratification1', 'string'),
 ('GeoLocation', 'string'),
 ('LocationID', 'int'),
 ('TopicID', 'string'),
 ('QuestionID', 'string'),
 ('DataValueTypeID', 'string')]

In [174]:
# Count the rows in the DataFrame.
test_spark_df_00_10.count()

1185676

In [175]:
# Fetch the last five rows as a list of Row objects.
test_spark_df_00_10.tail(5)

[Row(YearStart=2020, YearEnd=2020, LocationAbbr='WY', LocationDesc='Wyoming', DataSource='BRFSS', Topic='Diabetes', Question='Dilated eye examination among adults aged >= 18 years with diagnosed diabetes', DataValueUnit='%', DataValueType='Age-adjusted Prevalence', DataValue=None, DataValueAlt=None, LowConfidenceLimit=None, HighConfidenceLimit=None, StratificationCategory1='Race/Ethnicity', Stratification1='White, non-Hispanic', GeoLocation='POINT (-108.10983035299967 43.23554134300048)', LocationID=56, TopicID='DIA', QuestionID='DIA7_0', DataValueTypeID='AGEADJPREV'),
 Row(YearStart=2020, YearEnd=2020, LocationAbbr='WY', LocationDesc='Wyoming', DataSource='BRFSS', Topic='Older Adults', Question='Proportion of older adults aged >= 65 years who are up to date on a core set of clinical preventive services', DataValueUnit='%', DataValueType='Crude Prevalence', DataValue='41.5', DataValueAlt=41.5, LowConfidenceLimit=38.5, HighConfidenceLimit=44.6, StratificationCategory1='Race/Ethnicity', 

In [176]:
# Fetch the first five rows as a list of Row objects.
test_spark_df_00_10.head(5)

[Row(YearStart=2014, YearEnd=2014, LocationAbbr='AR', LocationDesc='Arkansas', DataSource='SEDD; SID', Topic='Asthma', Question='Hospitalizations for asthma', DataValueUnit=None, DataValueType='Number', DataValue='916', DataValueAlt=916.0, LowConfidenceLimit=None, HighConfidenceLimit=None, StratificationCategory1='Gender', Stratification1='Male', GeoLocation='POINT (-92.27449074299966 34.74865012400045)', LocationID=5, TopicID='AST', QuestionID='AST3_1', DataValueTypeID='NMBR'),
 Row(YearStart=2018, YearEnd=2018, LocationAbbr='CO', LocationDesc='Colorado', DataSource='SEDD; SID', Topic='Asthma', Question='Hospitalizations for asthma', DataValueUnit=None, DataValueType='Number', DataValue='2227', DataValueAlt=2227.0, LowConfidenceLimit=None, HighConfidenceLimit=None, StratificationCategory1='Overall', Stratification1='Overall', GeoLocation='POINT (-106.13361092099967 38.843840757000464)', LocationID=8, TopicID='AST', QuestionID='AST3_1', DataValueTypeID='NMBR'),
 Row(YearStart=2018, Yea

In [177]:
# Print the first five rows as a formatted table.
test_spark_df_00_10.show(5)

+---------+-------+------------+--------------------+----------+------+--------------------+-------------+-------------+---------+------------+------------------+-------------------+-----------------------+---------------+--------------------+----------+-------+----------+---------------+
|YearStart|YearEnd|LocationAbbr|        LocationDesc|DataSource| Topic|            Question|DataValueUnit|DataValueType|DataValue|DataValueAlt|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|Stratification1|         GeoLocation|LocationID|TopicID|QuestionID|DataValueTypeID|
+---------+-------+------------+--------------------+----------+------+--------------------+-------------+-------------+---------+------------+------------------+-------------------+-----------------------+---------------+--------------------+----------+-------+----------+---------------+
|     2014|   2014|          AR|            Arkansas| SEDD; SID|Asthma|Hospitalizations ...|         NULL|       Number|      916|

In [178]:
# test_spark_df_00_10.select(f.regexp_extract_all(f.col("GeoLocation"), r"(-*\d+.\d+)", 2).alias("lat_long")).collect()

In [179]:
# Pull every signed number out of the WKT-style "POINT (x y)" text, turning
# GeoLocation from a string into an array of numeric strings.
# The decimal point is escaped so it matches only a literal "." (an unescaped
# "." matches any character, including the space between the two coordinates),
# at most one leading minus sign is accepted, and the fractional part is
# optional so whole-number coordinates are still captured individually.
coordinate_pattern = r"(-?\d+(?:\.\d+)?)"
test_spark_df_00_10 = test_spark_df_00_10.withColumn(
    "GeoLocation",
    f.regexp_extract_all(f.col("GeoLocation"), f.lit(coordinate_pattern), 1),
)
test_spark_df_00_10.show()

+---------+-------+------------+--------------------+----------+------+--------------------+-------------+-------------+---------+------------+------------------+-------------------+-----------------------+-------------------+--------------------+----------+-------+----------+---------------+
|YearStart|YearEnd|LocationAbbr|        LocationDesc|DataSource| Topic|            Question|DataValueUnit|DataValueType|DataValue|DataValueAlt|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|    Stratification1|         GeoLocation|LocationID|TopicID|QuestionID|DataValueTypeID|
+---------+-------+------------+--------------------+----------+------+--------------------+-------------+-------------+---------+------------+------------------+-------------------+-----------------------+-------------------+--------------------+----------+-------+----------+---------------+
|     2014|   2014|          AR|            Arkansas| SEDD; SID|Asthma|Hospitalizations ...|         NULL|       Numbe

In [180]:
# GeoLocation is an array of the numbers taken from a WKT "POINT (x y)" string.
# WKT lists x (longitude) first and y (latitude) second -- e.g. Arkansas is
# POINT (-92.27 34.75) and a latitude can never be below -90 -- so index 1 is
# the latitude and index 0 is the longitude.
# Latitude is still added first so the resulting column order is unchanged.
test_spark_df_00_10 = (
    test_spark_df_00_10
    .withColumn("Latitude", f.col("GeoLocation")[1].cast(DoubleType()))
    .withColumn("Longitude", f.col("GeoLocation")[0].cast(DoubleType()))
)

In [181]:
test_spark_df_00_10.show(5)

+---------+-------+------------+--------------------+----------+------+--------------------+-------------+-------------+---------+------------+------------------+-------------------+-----------------------+---------------+--------------------+----------+-------+----------+---------------+-------------------+------------------+
|YearStart|YearEnd|LocationAbbr|        LocationDesc|DataSource| Topic|            Question|DataValueUnit|DataValueType|DataValue|DataValueAlt|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|Stratification1|         GeoLocation|LocationID|TopicID|QuestionID|DataValueTypeID|           Latitude|         Longitude|
+---------+-------+------------+--------------------+----------+------+--------------------+-------------+-------------+---------+------------+------------------+-------------------+-----------------------+---------------+--------------------+----------+-------+----------+---------------+-------------------+------------------+
|     2014|  

# Delete GeoLocation column

In [182]:
# The GeoLocation column is dropped now that its values have been split into
# the separate Latitude and Longitude columns.
test_spark_df_00_10 = test_spark_df_00_10.drop("GeoLocation")
test_spark_df_00_10.show()

+---------+-------+------------+--------------------+----------+------+--------------------+-------------+-------------+---------+------------+------------------+-------------------+-----------------------+-------------------+----------+-------+----------+---------------+-------------------+------------------+
|YearStart|YearEnd|LocationAbbr|        LocationDesc|DataSource| Topic|            Question|DataValueUnit|DataValueType|DataValue|DataValueAlt|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|    Stratification1|LocationID|TopicID|QuestionID|DataValueTypeID|           Latitude|         Longitude|
+---------+-------+------------+--------------------+----------+------+--------------------+-------------+-------------+---------+------------+------------------+-------------------+-----------------------+-------------------+----------+-------+----------+---------------+-------------------+------------------+
|     2014|   2014|          AR|            Arkansas| SEDD; SID|

In [183]:
# Re-check the column types after replacing GeoLocation with the numeric
# Latitude and Longitude columns.
test_spark_df_00_10.dtypes

[('YearStart', 'int'),
 ('YearEnd', 'int'),
 ('LocationAbbr', 'string'),
 ('LocationDesc', 'string'),
 ('DataSource', 'string'),
 ('Topic', 'string'),
 ('Question', 'string'),
 ('DataValueUnit', 'string'),
 ('DataValueType', 'string'),
 ('DataValue', 'string'),
 ('DataValueAlt', 'double'),
 ('LowConfidenceLimit', 'double'),
 ('HighConfidenceLimit', 'double'),
 ('StratificationCategory1', 'string'),
 ('Stratification1', 'string'),
 ('LocationID', 'int'),
 ('TopicID', 'string'),
 ('QuestionID', 'string'),
 ('DataValueTypeID', 'string'),
 ('Latitude', 'double'),
 ('Longitude', 'double')]

# Errors
* Out of memory error: 
- https://stackoverflow.com/questions/73111729/pyspark-java-heap-out-of-memory-when-saving-5m-rows-dataframe
- https://medium.com/@rakeshchanda/spark-out-of-memory-issue-memory-tuning-and-management-in-pyspark-802b757b562f
- https://stackoverflow.com/questions/21138751/spark-java-lang-outofmemoryerror-java-heap-space
* EOF error: 