In [0]:

from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import date, timedelta
     

In [0]:
# Location of the raw US county centroids CSV in the bronze volume.
input_path = '/Volumes/earth_data/bronze/operationaldata/US_Counties_Centroids.csv'

# Read the CSV into a Spark DataFrame, treating the first row as the header.
us_counties_df = (
    spark.read
    .option('header', True)
    .csv(input_path)
)

In [0]:

# Select the county columns used downstream and give them lower snake_case names.
# The state abbreviation is exposed as 'state_name_short' (all lowercase) so that
# the later reference col('state_name_short') matches by exact name and does not
# rely on Spark's case-insensitive column resolution (spark.sql.caseSensitive=false).
us_counties_selected = us_counties_df.select(
    col('name').alias('county_name'),
    col('STATE').alias('state_name_short'),
    # Coordinates are cast to double before being written out.
    col('LATITUDE').cast('double').alias('latitude'),
    col('LONGITUDE').cast('double').alias('longitude'),
)
    

In [0]:
# Lookup table: two-letter postal abbreviation -> upper-case full name.
# Covers the 50 states plus the District of Columbia (51 entries).
STATE_ABBREV_TO_FULL = {
    'AL': 'ALABAMA',
    'AK': 'ALASKA',
    'AZ': 'ARIZONA',
    'AR': 'ARKANSAS',
    'CA': 'CALIFORNIA',
    'CO': 'COLORADO',
    'CT': 'CONNECTICUT',
    'DE': 'DELAWARE',
    'FL': 'FLORIDA',
    'GA': 'GEORGIA',
    'HI': 'HAWAII',
    'ID': 'IDAHO',
    'IL': 'ILLINOIS',
    'IN': 'INDIANA',
    'IA': 'IOWA',
    'KS': 'KANSAS',
    'KY': 'KENTUCKY',
    'LA': 'LOUISIANA',
    'ME': 'MAINE',
    'MD': 'MARYLAND',
    'MA': 'MASSACHUSETTS',
    'MI': 'MICHIGAN',
    'MN': 'MINNESOTA',
    'MS': 'MISSISSIPPI',
    'MO': 'MISSOURI',
    'MT': 'MONTANA',
    'NE': 'NEBRASKA',
    'NV': 'NEVADA',
    'NH': 'NEW HAMPSHIRE',
    'NJ': 'NEW JERSEY',
    'NM': 'NEW MEXICO',
    'NY': 'NEW YORK',
    'NC': 'NORTH CAROLINA',
    'ND': 'NORTH DAKOTA',
    'OH': 'OHIO',
    'OK': 'OKLAHOMA',
    'OR': 'OREGON',
    'PA': 'PENNSYLVANIA',
    'RI': 'RHODE ISLAND',
    'SC': 'SOUTH CAROLINA',
    'SD': 'SOUTH DAKOTA',
    'TN': 'TENNESSEE',
    'TX': 'TEXAS',
    'UT': 'UTAH',
    'VT': 'VERMONT',
    'VA': 'VIRGINIA',
    'WA': 'WASHINGTON',
    'WV': 'WEST VIRGINIA',
    'WI': 'WISCONSIN',
    'WY': 'WYOMING',
    'DC': 'DISTRICT OF COLUMBIA',
}

In [0]:
@udf(returnType=StringType())
def get_state_long_name(state_abbrev):
    """Translate a state abbreviation to its full upper-case name.

    Args:
        state_abbrev: Value to look up in STATE_ABBREV_TO_FULL.

    Returns:
        The mapped full name when the abbreviation is a key of
        STATE_ABBREV_TO_FULL; otherwise the input value unchanged
        (so None stays None and unknown codes pass through as-is).
    """
    # NOTE(review): this runs as a Python UDF; a native Spark map lookup would
    # likely avoid the per-row Python round trip — consider if this gets slow.
    if state_abbrev in STATE_ABBREV_TO_FULL:
        return STATE_ABBREV_TO_FULL[state_abbrev]
    return state_abbrev

In [0]:
# Normalize and enrich the selected columns:
#   - county_name is upper-cased in place;
#   - state_name_long is added by mapping the state abbreviation to its full name.
# Note: no null / missing-value checks are performed in this cell.
county_name_upper = upper(col('county_name'))
state_long_name = get_state_long_name(col('state_name_short'))

us_counties_selected = (
    us_counties_selected
    .withColumn('county_name', county_name_upper)
    .withColumn('state_name_long', state_long_name)
)

In [0]:
# Fully qualified name of the target silver table.
silver_path = "earth_data.silver.us_counties"

# Persist the DataFrame as a Delta table, overwriting any existing contents.
(
    us_counties_selected.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(silver_path)
)