***************************************
# NASA NEO Data Pipeline
***************************************


### 1) Initialize Spark Session

In [1]:
import requests
import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, explode_outer, from_json, expr, to_date
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType, LongType, IntegerType
from pyspark.sql.types import  ArrayType, MapType

# Obtain the Spark session for this notebook (getOrCreate reuses an active one if present)
spark = (
    SparkSession.builder
    .appName("NASA NEO Data")
    .getOrCreate()
)

25/05/19 10:30:45 WARN Utils: Your hostname, newlife resolves to a loopback address: 127.0.1.1; using 192.168.0.196 instead (on interface wlp0s20f3)
25/05/19 10:30:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/19 10:30:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### 2) Create Schema and Import Data

In [2]:
# Relative velocity of an approach; every unit is declared as a nullable string
velocity_struct = StructType([
    StructField(unit, StringType(), True)
    for unit in ("kilometers_per_hour", "kilometers_per_second", "miles_per_hour")
])

# Miss distance of an approach; every unit is declared as a nullable string
miss_distance_struct = StructType([
    StructField(unit, StringType(), True)
    for unit in ("astronomical", "kilometers", "lunar", "miles")
])

# A single close-approach record, embedding the two structs defined above
close_approach_struct = (
    StructType()
    .add("close_approach_date", StringType(), True)
    .add("close_approach_date_full", StringType(), True)
    .add("epoch_date_close_approach", LongType(), True)
    .add("miss_distance", miss_distance_struct, True)
    .add("orbiting_body", StringType(), True)
    .add("relative_velocity", velocity_struct, True)
)

# Min/max diameter pair, shared by every unit of measure below
diameter_dimension_struct = (
    StructType()
    .add("estimated_diameter_min", DoubleType(), True)
    .add("estimated_diameter_max", DoubleType(), True)
)

# Estimated diameter: one min/max pair per unit
estimated_diameter_struct = StructType([
    StructField(unit, diameter_dimension_struct, True)
    for unit in ("feet", "kilometers", "meters", "miles")
])

# Link block: three nullable string fields
links_struct = StructType([
    StructField(rel, StringType(), True)
    for rel in ("next", "previous", "self")
])

# One asteroid entry: scalar attributes plus the nested approach, diameter and link structs
asteroid_struct = (
    StructType()
    .add("absolute_magnitude_h", DoubleType(), True)
    .add("close_approach_data", ArrayType(close_approach_struct), True)
    .add("estimated_diameter", estimated_diameter_struct, True)
    .add("id", StringType(), True)
    .add("is_potentially_hazardous_asteroid", BooleanType(), True)
    .add("is_sentry_object", BooleanType(), True)
    .add("links", links_struct, True)
    .add("name", StringType(), True)
    .add("nasa_jpl_url", StringType(), True)
    .add("neo_reference_id", StringType(), True)
    .add("sentry_data", StringType(), True)
)

# Fixed-date variant: one array-of-asteroids field per hard-coded date key.
# NOTE(review): not referenced by the cells visible here, which read with `schema` below.
neo_struct = StructType([
    StructField(day, ArrayType(asteroid_struct), True)
    for day in (
        "2025-05-03",
        "2025-05-04",
        "2025-05-05",
        "2025-05-06",
        "2025-05-07",
        "2025-05-08",
        "2025-05-09",
    )
])

# Top-level schema: near_earth_objects is a map from string key to an array of
# asteroid structs, so it does not depend on any particular set of dates.
schema = (
    StructType()
    .add('links', links_struct)
    .add('element_count', IntegerType())
    .add('near_earth_objects', MapType(StringType(), ArrayType(asteroid_struct)))
)

In [3]:
# Read the sample JSON file with the explicit schema (no schema inference);
# the multiline option lets one JSON document span several lines.
json_path = 'sample_data.json'
raw_df = (
    spark.read
    .schema(schema)
    .option("multiline", "true")
    .json(json_path)
)

### 3) Process Data with DataFrame API

In [26]:
def process_neo_data(df):
    """Flatten the nested NEO feed into one row per asteroid close approach.

    Args:
        df: DataFrame with a ``near_earth_objects`` map column whose keys are
            date strings and whose values are arrays of asteroid structs.

    Returns:
        DataFrame with these columns, in order: observation_date (date),
        asteroid_id, name, url, diameter_min_km, diameter_max_km,
        potentially_hazardous, is_sentry_object, close_approach_date (date),
        velocity_kph (double), miss_distance_km (double), orbiting_body.

    Notes:
        ``explode`` produces no rows for a null or empty array, so asteroids
        without any ``close_approach_data`` entries are absent from the result.
    """
    # Map -> one row per date key, carrying that date's array of asteroids
    per_date_df = df.select(
        explode(col("near_earth_objects")).alias("observation_date", "asteroids")
    )

    # One row per (date, asteroid)
    per_asteroid_df = per_date_df.select(
        col("observation_date"),
        explode(col("asteroids")).alias("asteroid")
    )

    # One row per (date, asteroid, close approach); only one generator is
    # allowed per select, hence the separate step
    per_approach_df = per_asteroid_df.select(
        col("observation_date"),
        col("asteroid"),
        explode(col("asteroid.close_approach_data")).alias("approach")
    )

    # Project the final columns and apply the type conversions in one pass
    final_df = per_approach_df.select(
        to_date(col("observation_date")).alias("observation_date"),
        col("asteroid.id").alias("asteroid_id"),
        col("asteroid.name").alias("name"),
        col("asteroid.nasa_jpl_url").alias("url"),
        col("asteroid.estimated_diameter.kilometers.estimated_diameter_min").alias("diameter_min_km"),
        col("asteroid.estimated_diameter.kilometers.estimated_diameter_max").alias("diameter_max_km"),
        col("asteroid.is_potentially_hazardous_asteroid").alias("potentially_hazardous"),
        col("asteroid.is_sentry_object").alias("is_sentry_object"),
        # Use the plain date field here. The *_full variant is expected to look
        # like "2025-May-03 12:34" (month abbreviation plus time — confirm
        # against the sample file), which to_date() without a pattern cannot
        # parse, leaving the column NULL.
        to_date(col("approach.close_approach_date")).alias("close_approach_date"),
        # Velocity and distance are declared as strings in the schema; cast to numeric
        col("approach.relative_velocity.kilometers_per_hour").cast(DoubleType()).alias("velocity_kph"),
        col("approach.miss_distance.kilometers").cast(DoubleType()).alias("miss_distance_km"),
        col("approach.orbiting_body").alias("orbiting_body")
    )

    return final_df

# Apply the transformation
result_df = process_neo_data(raw_df)

### 4) Query the Results

In [27]:
# Register the flattened result as a temp view so it can be queried with Spark SQL as `neo`
result_df.createOrReplaceTempView('neo')

In [37]:
# Row count per orbiting body. count(asteroid_id) counts non-null ids per row,
# so this is a count of rows in `neo`, not of distinct asteroids.
query = """
SELECT  
orbiting_body,
count(asteroid_id) AS `asteroid_count`
FROM neo
GROUP BY orbiting_body
"""

In [38]:
# Run the aggregation and print up to 120 result rows
spark.sql(query).show(120)

+-------------+--------------+
|orbiting_body|asteroid_count|
+-------------+--------------+
|        Earth|           120|
+-------------+--------------+



In [30]:
# Display the column names of the flattened result
result_df.columns

['observation_date',
 'asteroid_id',
 'name',
 'url',
 'diameter_min_km',
 'diameter_max_km',
 'potentially_hazardous',
 'is_sentry_object',
 'close_approach_date',
 'velocity_kph',
 'miss_distance_km',
 'orbiting_body']