In [152]:
import os
import numpy as np
import ast
import re

import pyspark.sql.functions as f
# f.lit
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, StructField, StructType, FloatType, DoubleType, IntegerType

In [153]:
DATA_DIR = "./data/chronic-disease-data"

path = os.path.join(DATA_DIR, "U.S._Chronic_Disease_Indicators__CDI___2023_Release.csv")

spark = SparkSession.builder.appName('test')\
    .config("spark.executor.memory", "6g")\
    .getOrCreate()

test_spark_df_00_10 = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load(path)

test_spark_df_00_10.show()

+---------+-------+------------+--------------------+----------+------+--------------------+--------+-------------+-------------+---------+------------+-----------------------+-----------------+------------------+-------------------+-----------------------+-------------------+-----------------------+---------------+-----------------------+---------------+--------------------+----------+----------+-------+----------+---------------+-------------------------+-----------------+-------------------------+-----------------+-------------------------+-----------------+
|YearStart|YearEnd|LocationAbbr|        LocationDesc|DataSource| Topic|            Question|Response|DataValueUnit|DataValueType|DataValue|DataValueAlt|DataValueFootnoteSymbol|DatavalueFootnote|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|    Stratification1|StratificationCategory2|Stratification2|StratificationCategory3|Stratification3|         GeoLocation|ResponseID|LocationID|TopicID|QuestionID|DataValueType

# Drop uneccessary columns

In [154]:
cols_to_drop = ["Response",
    "ResponseID",
    "DataValueFootnoteSymbol",
    "DatavalueFootnote",
    
    "DataSource", 
    "DataValue",

    "StratificationCategory2",
    "Stratification2",
    "StratificationCategory3",
    "Stratification3",

    "StratificationCategoryID1",
    "StratificationID1",
    "StratificationCategoryID2",
    "StratificationID2",
    "StratificationCategoryID3",
    "StratificationID3"]
test_spark_df_00_10 = test_spark_df_00_10.drop(*cols_to_drop)
test_spark_df_00_10.show(5)

+---------+-------+------------+--------------------+------+--------------------+-------------+-------------+------------+------------------+-------------------+-----------------------+---------------+--------------------+----------+-------+----------+---------------+
|YearStart|YearEnd|LocationAbbr|        LocationDesc| Topic|            Question|DataValueUnit|DataValueType|DataValueAlt|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|Stratification1|         GeoLocation|LocationID|TopicID|QuestionID|DataValueTypeID|
+---------+-------+------------+--------------------+------+--------------------+-------------+-------------+------------+------------------+-------------------+-----------------------+---------------+--------------------+----------+-------+----------+---------------+
|     2014|   2014|          AR|            Arkansas|Asthma|Hospitalizations ...|         NULL|       Number|       916.0|              NULL|               NULL|                 Gender|        

In [155]:
test_spark_df_00_10.count()

1185676

In [156]:
test_spark_df_00_10 = test_spark_df_00_10.dropDuplicates()
test_spark_df_00_10.show(5)

+---------+-------+------------+------------+-------+--------------------+-----------------+----------------+------------+------------------+-------------------+-----------------------+-------------------+--------------------+----------+-------+----------+---------------+
|YearStart|YearEnd|LocationAbbr|LocationDesc|  Topic|            Question|    DataValueUnit|   DataValueType|DataValueAlt|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|    Stratification1|         GeoLocation|LocationID|TopicID|QuestionID|DataValueTypeID|
+---------+-------+------------+------------+-------+--------------------+-----------------+----------------+------------+------------------+-------------------+-----------------------+-------------------+--------------------+----------+-------+----------+---------------+
|     2012|   2012|          MO|    Missouri|Alcohol|Chronic liver dis...|cases per 100,000|      Crude Rate|         6.4|               5.5|                7.3|                 Gen

In [157]:
test_spark_df_00_10.count()

1185676

# Remove rows with null values either in datavalue, datavalueunit, and datavaluetype

In [158]:
# means thhat if datavalueunit or datavalue or 
# datavaluetype is null then return true and negate it
cond = ~(f.isnull("DataValueUnit") | f.isnull("DataValue") | f.isnull("DataValueType"))
test_spark_df_00_10 = test_spark_df_00_10.filter(cond)
test_spark_df_00_10.show()

+---------+-------+------------+--------------------+-------+--------------------+-------------+----------------+------------+------------------+-------------------+-----------------------+-------------------+--------------------+----------+-------+----------+---------------+
|YearStart|YearEnd|LocationAbbr|        LocationDesc|  Topic|            Question|DataValueUnit|   DataValueType|DataValueAlt|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|    Stratification1|         GeoLocation|LocationID|TopicID|QuestionID|DataValueTypeID|
+---------+-------+------------+--------------------+-------+--------------------+-------------+----------------+------------+------------------+-------------------+-----------------------+-------------------+--------------------+----------+-------+----------+---------------+
|     2015|   2015|          RI|        Rhode Island|Alcohol|Alcohol use among...|            %|Crude Prevalence|        12.6|               7.3|               21.0|    

In [159]:
test_spark_df_00_10.count()

699340

# Extract latitude and longitude from geolocation

In [160]:
test_spark_df_00_10 = test_spark_df_00_10.withColumn("GeoLocation", f.regexp_extract_all(f.col("GeoLocation"), f.lit(r"(-*\d+.\d+)"), 1))
test_spark_df_00_10.show()

+---------+-------+------------+--------------------+-------+--------------------+-------------+----------------+------------+------------------+-------------------+-----------------------+-------------------+--------------------+----------+-------+----------+---------------+
|YearStart|YearEnd|LocationAbbr|        LocationDesc|  Topic|            Question|DataValueUnit|   DataValueType|DataValueAlt|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|    Stratification1|         GeoLocation|LocationID|TopicID|QuestionID|DataValueTypeID|
+---------+-------+------------+--------------------+-------+--------------------+-------------+----------------+------------+------------------+-------------------+-----------------------+-------------------+--------------------+----------+-------+----------+---------------+
|     2015|   2015|          RI|        Rhode Island|Alcohol|Alcohol use among...|            %|Crude Prevalence|        12.6|               7.3|               21.0|    

# Cast latitude and longitude str columns to doubles

In [161]:
test_spark_df_00_10 = test_spark_df_00_10.withColumn("Latitude", test_spark_df_00_10.GeoLocation[0].cast(DoubleType()))
test_spark_df_00_10 = test_spark_df_00_10.withColumn("Longitude", test_spark_df_00_10.GeoLocation[1].cast(DoubleType()))

# Delete GeoLocation after extracting latitude and longitude as this is redundant

In [162]:
test_spark_df_00_10 = test_spark_df_00_10.drop(*["GeoLocation"])
test_spark_df_00_10.show(5)

+---------+-------+------------+-------------+-------+--------------------+-------------+----------------+------------+------------------+-------------------+-----------------------+-------------------+----------+-------+----------+---------------+-------------------+------------------+
|YearStart|YearEnd|LocationAbbr| LocationDesc|  Topic|            Question|DataValueUnit|   DataValueType|DataValueAlt|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|    Stratification1|LocationID|TopicID|QuestionID|DataValueTypeID|           Latitude|         Longitude|
+---------+-------+------------+-------------+-------+--------------------+-------------+----------------+------------+------------------+-------------------+-----------------------+-------------------+----------+-------+----------+---------------+-------------------+------------------+
|     2015|   2015|          RI| Rhode Island|Alcohol|Alcohol use among...|            %|Crude Prevalence|        12.6|               7.

# rename datavaluealt column (which is already a double) to just datavalue 

In [163]:
test_spark_df_00_10 = test_spark_df_00_10.withColumnRenamed("DataValueAlt", "DataValue")
test_spark_df_00_10.show(5)

+---------+-------+------------+-------------+-------+--------------------+-------------+----------------+---------+------------------+-------------------+-----------------------+-------------------+----------+-------+----------+---------------+-------------------+------------------+
|YearStart|YearEnd|LocationAbbr| LocationDesc|  Topic|            Question|DataValueUnit|   DataValueType|DataValue|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|    Stratification1|LocationID|TopicID|QuestionID|DataValueTypeID|           Latitude|         Longitude|
+---------+-------+------------+-------------+-------+--------------------+-------------+----------------+---------+------------------+-------------------+-----------------------+-------------------+----------+-------+----------+---------------+-------------------+------------------+
|     2015|   2015|          RI| Rhode Island|Alcohol|Alcohol use among...|            %|Crude Prevalence|     12.6|               7.3|          

# Replace `per 100,000` and `per 100,000 residents` with `cases per 100,000` instead to reduce redundancy

In [164]:
test_spark_df_00_10.select("DataValueUnit").distinct().collect()

[Row(DataValueUnit='Number'),
 Row(DataValueUnit='cases per 1,000,000'),
 Row(DataValueUnit='per 100,000'),
 Row(DataValueUnit='cases per 10,000'),
 Row(DataValueUnit='pack sales per capita'),
 Row(DataValueUnit='cases per 100,000'),
 Row(DataValueUnit='Years'),
 Row(DataValueUnit='%'),
 Row(DataValueUnit='gallons'),
 Row(DataValueUnit='per 100,000 residents'),
 Row(DataValueUnit='cases per 1,000'),
 Row(DataValueUnit='$')]

In [165]:
cases = f.when(f.col("DataValueUnit") == "per 100,000", "cases per 100,000")\
.when(f.col("DataValueUnit") == "per 100,000 residents", "cases per 100,000")\
.otherwise(f.col("DataValueUnit"))
test_spark_df_00_10 = test_spark_df_00_10.withColumn("DataValueUnit", cases)
test_spark_df_00_10.show()

+---------+-------+------------+--------------------+-------+--------------------+-------------+----------------+---------+------------------+-------------------+-----------------------+-------------------+----------+-------+----------+---------------+-------------------+------------------+
|YearStart|YearEnd|LocationAbbr|        LocationDesc|  Topic|            Question|DataValueUnit|   DataValueType|DataValue|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|    Stratification1|LocationID|TopicID|QuestionID|DataValueTypeID|           Latitude|         Longitude|
+---------+-------+------------+--------------------+-------+--------------------+-------------+----------------+---------+------------------+-------------------+-----------------------+-------------------+----------+-------+----------+---------------+-------------------+------------------+
|     2015|   2015|          RI|        Rhode Island|Alcohol|Alcohol use among...|            %|Crude Prevalence|     12.6| 

In [166]:
test_spark_df_00_10.select("DataValueUnit").distinct().collect()

[Row(DataValueUnit='Number'),
 Row(DataValueUnit='cases per 1,000,000'),
 Row(DataValueUnit='cases per 10,000'),
 Row(DataValueUnit='pack sales per capita'),
 Row(DataValueUnit='cases per 100,000'),
 Row(DataValueUnit='Years'),
 Row(DataValueUnit='%'),
 Row(DataValueUnit='gallons'),
 Row(DataValueUnit='cases per 1,000'),
 Row(DataValueUnit='$')]

# Extract out the age brackets in each question if there are any. 
in previous experimentation the unique age brackets found were the ff.
```
['aged >= 18 years',
 'aged >= 65 years',
 'aged 18-64 years',
 'aged >= 45 years',
 'aged 18-44 years',
 'aged 45-64 years',
 'aged 50-75 years',
 'aged 50-64 years',
 'aged 21-65 years',
 'aged 50-74 years',
 'aged 18-24 years',
 'aged 1-17 years',
 'aged 21-44 years',
 'aged >= 14 years']
```

however other strings that might imply an age bracket would be the youth which is defined by United Nations—without prejudice to any other definitions made by Member States, as the persons between the ages of 18 and 24 years. Others like high school students may according to data from US range from ages 14 to 18, middle school is 11 to 13, and elementary school is 5 to 10



In [167]:
test_spark_df_00_10.select("Question").distinct().collect()

[Row(Question='Cancer of the female cervix, mortality'),
 Row(Question='Soda consumption among high school students'),
 Row(Question='Chronic liver disease mortality'),
 Row(Question='Invasive cancer (all sites combined), incidence'),
 Row(Question='Prevalence of gestational diabetes'),
 Row(Question='Fecal occult blood test, sigmoidoscopy, or colonoscopy among adults aged 50-75 years'),
 Row(Question='Prevalence of activity limitation among adults >= 18 with diagnosed chronic obstructive pulmonary disease'),
 Row(Question='Per capita alcohol consumption among persons aged >= 14 years'),
 Row(Question='Cancer of the female breast, mortality'),
 Row(Question='Heavy drinking among adults aged >= 18 years'),
 Row(Question='Visits to dentist or dental clinic among adults aged >= 18 years with diagnosed diabetes'),
 Row(Question='Prevalence of current smoking among adults >= 45 years with diagnosed chronic obstructive pulmonary disease'),
 Row(Question='Farmers markets that accept Supplemen

In [168]:
pattern = r"(aged\s*[><=]*\s*\d*-*\d*\s*years|youth|high school student|middle school student|elementary student)"

In [169]:
test_spark_df_00_10.withColumn("Expr", f.regexp_substr(f.col("Question"), f.lit(pattern))).select("Expr").distinct().collect()

[Row(Expr='aged >= 45 years'),
 Row(Expr='aged 50-75 years'),
 Row(Expr='aged 18-24 years'),
 Row(Expr='aged >= 18 years'),
 Row(Expr='aged >= 65 years'),
 Row(Expr='aged 21-44 years'),
 Row(Expr='aged 50-64 years'),
 Row(Expr='youth'),
 Row(Expr='aged 18-44 years'),
 Row(Expr='high school student'),
 Row(Expr='aged 21-65 years'),
 Row(Expr='aged >= 14 years'),
 Row(Expr='aged 45-64 years'),
 Row(Expr='aged 18-64 years'),
 Row(Expr='aged 50-74 years'),
 Row(Expr='aged 1-17 years'),
 Row(Expr=None)]

In [170]:
cond = f.regexp(f.col("Question"), f.lit(pattern))
test_spark_df_w_ages_00_10 = test_spark_df_00_10.filter(cond)
test_spark_df_w_ages_00_10.show()

+---------+-------+------------+--------------------+-------+--------------------+-------------+----------------+---------+------------------+-------------------+-----------------------+-------------------+----------+-------+----------+---------------+-------------------+------------------+
|YearStart|YearEnd|LocationAbbr|        LocationDesc|  Topic|            Question|DataValueUnit|   DataValueType|DataValue|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|    Stratification1|LocationID|TopicID|QuestionID|DataValueTypeID|           Latitude|         Longitude|
+---------+-------+------------+--------------------+-------+--------------------+-------------+----------------+---------+------------------+-------------------+-----------------------+-------------------+----------+-------+----------+---------------+-------------------+------------------+
|     2015|   2015|          RI|        Rhode Island|Alcohol|Alcohol use among...|            %|Crude Prevalence|     12.6| 

In [171]:
test_spark_df_w_ages_00_10.count()

472089

In [172]:
test_spark_df_wo_ages_00_10 = test_spark_df_00_10.filter(~cond)
test_spark_df_wo_ages_00_10.show()

+---------+-------+------------+------------+-------+--------------------+-------------+----------------+---------+------------------+-------------------+-----------------------+--------------------+----------+-------+----------+---------------+-------------------+------------------+
|YearStart|YearEnd|LocationAbbr|LocationDesc|  Topic|            Question|DataValueUnit|   DataValueType|DataValue|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|     Stratification1|LocationID|TopicID|QuestionID|DataValueTypeID|           Latitude|         Longitude|
+---------+-------+------------+------------+-------+--------------------+-------------+----------------+---------+------------------+-------------------+-----------------------+--------------------+----------+-------+----------+---------------+-------------------+------------------+
|     2015|   2015|          PA|Pennsylvania|Alcohol|Alcohol use befor...|            %|Crude Prevalence|     26.1|              13.0|           

In [173]:
test_spark_df_wo_ages_00_10.count()

227251

In [174]:
test_spark_df_wo_ages_00_10.head()

Row(YearStart=2015, YearEnd=2015, LocationAbbr='PA', LocationDesc='Pennsylvania', Topic='Alcohol', Question='Alcohol use before pregnancy', DataValueUnit='%', DataValueType='Crude Prevalence', DataValue=26.1, LowConfidenceLimit=13.0, HighConfidenceLimit=45.5, StratificationCategory1='Race/Ethnicity', Stratification1='Asian or Pacific Islander', LocationID=42, TopicID='ALC', QuestionID='ALC1_2', DataValueTypeID='CRDPREV', Latitude=-77.86070029399963, Longitude=40.79373015200048)

# Clear the previous dataframe from memory

In [175]:
test_spark_df_00_10.unpersist()

DataFrame[YearStart: int, YearEnd: int, LocationAbbr: string, LocationDesc: string, Topic: string, Question: string, DataValueUnit: string, DataValueType: string, DataValue: double, LowConfidenceLimit: double, HighConfidenceLimit: double, StratificationCategory1: string, Stratification1: string, LocationID: int, TopicID: string, QuestionID: string, DataValueTypeID: string, Latitude: double, Longitude: double]

* variable length array in a column and we want to split these values to multiple columns: https://stackoverflow.com/questions/76995118/pyspark-split-a-column-of-variable-length-array-type-into-two-smaller-arrays


In [176]:
# test_df = spark.createDataFrame([("aged >= 45 years", 1), ("aged > 45 years", 1), ("aged 80-100", 1)], ("Question", "Age"))
# test_df.select("Question").collect()

In [None]:
# what I want to do is when we detect >= return 45 as the age start and float(inf) as the age end as arrays
# when we detect a - in between two numbers we return the two numerical values e.g. 50 and 75 as arrays
# when we detect youth we return 18 and 24 as arrays
# when we detect highschool we return 14 and 18 as arrays
# when our column extracted by f.regexp_extract() has already been built 
# where we have 
# Row(AgeBracket='aged >= 45 years'),
#  Row(AgeBracket='aged 50-75 years'),
#  Row(AgeBracket='aged 18-24 years'),
#  Row(AgeBracket='aged >= 18 years'),
#  Row(AgeBracket='aged >= 65 years'),
#  Row(AgeBracket='aged 21-44 years'),
#  Row(AgeBracket='aged 50-64 years'),
#  Row(AgeBracket='youth'),
#  Row(AgeBracket='aged 18-44 years'),
#  Row(AgeBracket='high school student'),
#  Row(AgeBracket='aged 21-65 years'),
#  Row(AgeBracket='aged >= 14 years'),
#  Row(AgeBracket='aged 45-64 years'),
#  Row(AgeBracket='aged 18-64 years'),
#  Row(AgeBracket='aged 50-74 years'),
#  Row(AgeBracket='aged 1-17 years')



age_info_col = f.regexp_extract(f.col("Question"), pattern, 1)
cases = f.when(
        f.regexp(
            age_info_col,
            # this is where we check if a column has >=, <=, >, <, 
            # then we return whatever number is in this as a list
            # and then cast this list of matched string numbers to
            # a list of int numbers  
            f.lit(r"([><=]+)")
        ),
        f.regexp_extract_all(
            age_info_col,
            f.lit(r"(\d+)"), 
            1
        ).cast(ArrayType(FloatType()))
    ).when(
        f.regexp(
            age_info_col,
            f.lit(r"((?<=\d+)-+(?=\d+))")
        ),
        f.regexp_extract_all(
            age_info_col, 
            f.lit(r"(\d+)"), 
            1
        ).cast(ArrayType(FloatType()))
    ).when(
        f.regexp(
            age_info_col,
            f.lit(r"(youth)")
        ),
        # when youth is detected in the age info column we
        # regexp will return true and when a row is true we return
        # in this case an array of literal/constant float values of 
        # 18 and 24 as these are the age ranges of this group 
        f.array(f.lit(float(18)), f.lit(float(24)))
    ).otherwise(
        # and lastly in a case where no age bracket numbers or youth
        # keywords are detected it is assumed that this group is highschool
        # which has an age range of 14 to 18 
        f.array(f.lit(float(14)), f.lit(float(18)))
    )

# # this is to return all numbers in a list if a
# # >=, <=, <, or > is detected
# expr = f.regexp_extract_all(
#     `f.regexp_extract(f.col("Question"), pattern, 1),`
#     f.lit(r"(\d+)"),
#     1
# )

# to detect whether a <=, >=, >, or < is present in column
arith = f.regexp_substr(
    f.regexp_extract(f.col("Question"), pattern, 1),
    # f.lit(r"([><=]+)")
    # f.lit(r"((?<=\d+)-+(?=\d+))")
    f.lit(r"(youth)")
)

# create a dataframe with column AgeBracket and 
# select only the AgeBracket column
test_spark_df_w_ages_00_10 = test_spark_df_w_ages_00_10.withColumn(
    "AgeStart",
    # expr
    cases[0]
    # arith
).withColumn("AgeEnd", cases[1])
test_spark_df_w_ages_00_10.show()

+---------+-------+------------+--------------------+-------+--------------------+-------------+----------------+---------+------------------+-------------------+-----------------------+-------------------+----------+-------+----------+---------------+-------------------+------------------+----------+--------+------+
|YearStart|YearEnd|LocationAbbr|        LocationDesc|  Topic|            Question|DataValueUnit|   DataValueType|DataValue|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|    Stratification1|LocationID|TopicID|QuestionID|DataValueTypeID|           Latitude|         Longitude|AgeBracket|AgeStart|AgeEnd|
+---------+-------+------------+--------------------+-------+--------------------+-------------+----------------+---------+------------------+-------------------+-----------------------+-------------------+----------+-------+----------+---------------+-------------------+------------------+----------+--------+------+
|     2015|   2015|          RI|        Rho

# Since there are null values in the `AgeEnd` column because the number extracted from the `AgeBracket` column was only a single value and not a range with two values we fill this null value with a `float("inf")` value as this indicates that the `AgeBracket` was **from a certain age and above**

In [200]:
test_spark_df_w_ages_00_10 = test_spark_df_w_ages_00_10.fillna(float("inf"), subset=["AgeEnd"])
test_spark_df_w_ages_00_10.show()

+---------+-------+------------+--------------------+-------+--------------------+-------------+----------------+---------+------------------+-------------------+-----------------------+-------------------+----------+-------+----------+---------------+-------------------+------------------+----------+--------+------+
|YearStart|YearEnd|LocationAbbr|        LocationDesc|  Topic|            Question|DataValueUnit|   DataValueType|DataValue|LowConfidenceLimit|HighConfidenceLimit|StratificationCategory1|    Stratification1|LocationID|TopicID|QuestionID|DataValueTypeID|           Latitude|         Longitude|AgeBracket|AgeStart|AgeEnd|
+---------+-------+------------+--------------------+-------+--------------------+-------------+----------------+---------+------------------+-------------------+-----------------------+-------------------+----------+-------+----------+---------------+-------------------+------------------+----------+--------+------+
|     2015|   2015|          RI|        Rho

In [201]:
test_spark_df_w_ages_00_10.select("AgeStart", "AgeEnd").distinct().collect()

[Row(AgeStart=1.0, AgeEnd=17.0),
 Row(AgeStart=21.0, AgeEnd=65.0),
 Row(AgeStart=65.0, AgeEnd=inf),
 Row(AgeStart=18.0, AgeEnd=inf),
 Row(AgeStart=18.0, AgeEnd=44.0),
 Row(AgeStart=18.0, AgeEnd=64.0),
 Row(AgeStart=50.0, AgeEnd=74.0),
 Row(AgeStart=45.0, AgeEnd=64.0),
 Row(AgeStart=14.0, AgeEnd=18.0),
 Row(AgeStart=14.0, AgeEnd=inf),
 Row(AgeStart=21.0, AgeEnd=44.0),
 Row(AgeStart=45.0, AgeEnd=inf),
 Row(AgeStart=50.0, AgeEnd=75.0),
 Row(AgeStart=50.0, AgeEnd=64.0),
 Row(AgeStart=18.0, AgeEnd=24.0)]

# lastly for the stratification column we will have the following unique values like `Male`, `Overall`, `Female`, `Asian or Pacific Islander`, `White, non-Hispanic`, `Hispanic`, `American Indian or Alaska Native`, `Black, non-Hispanic`, `Asian, non-Hispanic`, `Other, non-Hispanic`, `Multiracial, non-Hispanic` which we will need to separate further into sex and origin e.g. 
```
| Sex | Origin |
| Male | Hispanic |
| Male | not Hispanic | 
| Overall | 
| Female | Hispanic |
| Female | not Hispanic |
| Asian or Pacific Islander | Hispanic |
| Asian or Pacific Islander | not Hispanic |
| White | Hispanic |
| White | not Hispanic |
| American Indian or Alaska Native | Hispanic |
| American Indian or Alaska Native | not Hispanic |
| Black | Hispanic |
| Black | not Hispanic |
| Asian | Hispanic |
| Asian | not Hispanic |
| Other | Hispanic |
| Other | not Hispanic |
| Multiracial | Hispanic |
| Multiracial | non hispanic |
```

In [202]:
test_spark_df_w_ages_00_10.select("StratificationCategory1", "Stratification1").distinct().collect()

[Row(StratificationCategory1='Overall', Stratification1='Overall'),
 Row(StratificationCategory1='Race/Ethnicity', Stratification1='White, non-Hispanic'),
 Row(StratificationCategory1='Gender', Stratification1='Male'),
 Row(StratificationCategory1='Race/Ethnicity', Stratification1='Hispanic'),
 Row(StratificationCategory1='Race/Ethnicity', Stratification1='American Indian or Alaska Native'),
 Row(StratificationCategory1='Race/Ethnicity', Stratification1='Asian, non-Hispanic'),
 Row(StratificationCategory1='Race/Ethnicity', Stratification1='Asian or Pacific Islander'),
 Row(StratificationCategory1='Race/Ethnicity', Stratification1='Black, non-Hispanic'),
 Row(StratificationCategory1='Race/Ethnicity', Stratification1='Other, non-Hispanic'),
 Row(StratificationCategory1='Race/Ethnicity', Stratification1='Multiracial, non-Hispanic'),
 Row(StratificationCategory1='Gender', Stratification1='Female')]

In [198]:
test_spark_df_w_ages_00_10.dtypes

[('YearStart', 'int'),
 ('YearEnd', 'int'),
 ('LocationAbbr', 'string'),
 ('LocationDesc', 'string'),
 ('Topic', 'string'),
 ('Question', 'string'),
 ('DataValueUnit', 'string'),
 ('DataValueType', 'string'),
 ('DataValue', 'double'),
 ('LowConfidenceLimit', 'double'),
 ('HighConfidenceLimit', 'double'),
 ('StratificationCategory1', 'string'),
 ('Stratification1', 'string'),
 ('LocationID', 'int'),
 ('TopicID', 'string'),
 ('QuestionID', 'string'),
 ('DataValueTypeID', 'string'),
 ('Latitude', 'double'),
 ('Longitude', 'double'),
 ('AgeBracket', 'int'),
 ('AgeStart', 'double'),
 ('AgeEnd', 'double')]

In [199]:
test_spark_df_wo_ages_00_10.dtypes

[('YearStart', 'int'),
 ('YearEnd', 'int'),
 ('LocationAbbr', 'string'),
 ('LocationDesc', 'string'),
 ('Topic', 'string'),
 ('Question', 'string'),
 ('DataValueUnit', 'string'),
 ('DataValueType', 'string'),
 ('DataValue', 'double'),
 ('LowConfidenceLimit', 'double'),
 ('HighConfidenceLimit', 'double'),
 ('StratificationCategory1', 'string'),
 ('Stratification1', 'string'),
 ('LocationID', 'int'),
 ('TopicID', 'string'),
 ('QuestionID', 'string'),
 ('DataValueTypeID', 'string'),
 ('Latitude', 'double'),
 ('Longitude', 'double')]

# Errors
* Out of memory error: 
- https://stackoverflow.com/questions/73111729/pyspark-java-heap-out-of-memory-when-saving-5m-rows-dataframe
- https://medium.com/@rakeshchanda/spark-out-of-memory-issue-memory-tuning-and-management-in-pyspark-802b757b562f
- https://stackoverflow.com/questions/21138751/spark-java-lang-outofmemoryerror-java-heap-space
* EOF errror: 