## PROMPT
Generate a pipeline for Databricks:
* Get data from the following web address:
   * Space launches: https://api.spacexdata.com/v3/launches
* Aggregate the launches based on launch year
* Send the chosen information to the web address: https://httpbin.org/post
* The script must provide status updates on its progress, report any errors encountered, confirm the outcome of the final data sending step, and measure/report execution times.


In [0]:
import requests
import json
from time import time
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Capture the pipeline's start timestamp (seconds since the epoch); the
# logging helper below subtracts it to report elapsed time.
start_time = time()
print(f"Pipeline started at {start_time}")

def log_step(message):
    """Print *message* prefixed with the seconds elapsed since ``start_time``.

    The prefix has the form ``[12.34s]`` (two decimal places).
    """
    seconds_since_start = time() - start_time
    print("[{:.2f}s] {}".format(seconds_since_start, message))

def handle_error(step_name, e):
    """Log a failed pipeline step and re-raise it as a generic ``Exception``.

    Args:
        step_name: Human-readable name of the step that failed; it is
            embedded in the logged and raised message.
        e: The exception caught by the caller.

    Raises:
        Exception: Always. The message is ``"Error in <step_name>: <e>"``
            and the original exception is attached as ``__cause__``.
    """
    error_msg = f"Error in {step_name}: {e}"
    log_step(error_msg)
    # Chain explicitly so the traceback reports the original failure as the
    # direct cause, even if this helper is ever called outside an ``except``
    # block (where implicit exception context would not be set).
    raise Exception(error_msg) from e

# Upper bound (in seconds) applied to each HTTP call. requests has no default
# timeout, so without one an unresponsive server would hang the pipeline
# forever instead of surfacing an error.
REQUEST_TIMEOUT_SECONDS = 30

# Pipeline: fetch SpaceX launches -> build a Spark DataFrame -> aggregate per
# launch year -> POST the aggregate to httpbin. Each step logs its progress
# through log_step and routes failures through handle_error, which always
# raises; the outer handler logs the failure once more and re-raises.
try:
    # Step 1: Fetch data from SpaceX API
    log_step("Fetching data from SpaceX API...")
    try:
        launches_url = "https://api.spacexdata.com/v3/launches"
        response = requests.get(launches_url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Raises exception for 4XX/5XX errors
        launches_data = response.json()
        log_step(f"Successfully fetched {len(launches_data)} launch records")
    except Exception as e:
        handle_error("fetching SpaceX data", e)

    # Step 2: Create DataFrame from the data
    log_step("Creating Spark DataFrame...")
    try:
        # Define schema for the DataFrame
        schema = StructType([
            StructField("flight_number", IntegerType(), True),
            StructField("mission_name", StringType(), True),
            StructField("launch_year", StringType(), True),
            StructField("launch_success", StringType(), True),
            StructField("details", StringType(), True),
        ])

        # Extract only the fields named in the schema, in schema order.
        extracted_data = [
            (
                launch.get("flight_number"),
                launch.get("mission_name"),
                launch.get("launch_year"),
                # Stringified so the aggregation can compare against
                # "True"/"False"; a missing value becomes "None" and is
                # counted in neither bucket.
                str(launch.get("launch_success")),
                launch.get("details"),
            )
            for launch in launches_data
        ]

        # NOTE(review): `spark` is not defined in this file; presumably it is
        # the SparkSession injected by the Databricks notebook runtime.
        df = spark.createDataFrame(extracted_data, schema)
        log_step(f"DataFrame created with {df.count()} rows")
    except Exception as e:
        handle_error("creating DataFrame", e)

    # Step 3: Aggregate data by launch year
    log_step("Aggregating data by launch year...")
    try:
        aggregated_df = df.groupBy("launch_year") \
            .agg(
                F.count("*").alias("launch_count"),
                F.sum(F.when(F.col("launch_success") == "True", 1).otherwise(0)).alias("successful_launches"),
                F.sum(F.when(F.col("launch_success") == "False", 1).otherwise(0)).alias("failed_launches")
            ) \
            .orderBy("launch_year")

        log_step("Aggregation completed. Sample results:")
        aggregated_df.show(5)
    except Exception as e:
        handle_error("aggregating data", e)

    # Step 4: Prepare data for sending
    log_step("Preparing data for sending...")
    try:
        # Collect the aggregate to the driver as plain dicts so it can be
        # serialized to JSON.
        aggregated_json = [row.asDict() for row in aggregated_df.collect()]
        payload = {
            "source": "SpaceX API",
            "aggregated_launches": aggregated_json,
            "total_launches": len(launches_data),
        }
        log_step("Data preparation completed")
    except Exception as e:
        handle_error("preparing data", e)

    # Step 5: Send data to destination
    log_step("Sending data to https://httpbin.org/post...")
    try:
        destination_url = "https://httpbin.org/post"
        headers = {'Content-Type': 'application/json'}

        # Time only the HTTP round trip, separately from the overall elapsed
        # time that log_step reports.
        send_start = time()
        response = requests.post(
            destination_url,
            data=json.dumps(payload),
            headers=headers,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        send_time = time() - send_start

        response.raise_for_status()

        log_step(f"Data successfully sent in {send_time:.2f}s")
        log_step(f"Response status: {response.status_code}")
        log_step("Response content sample: " + response.text[:200] + "...")

        # Verify the response contains our data. The "json" field may be
        # present but null (or the body may not be an object at all); treat
        # those as an unexpected format and warn rather than crashing a send
        # that has already succeeded.
        response_data = response.json()
        echoed_payload = (
            response_data.get("json") if isinstance(response_data, dict) else None
        )
        if isinstance(echoed_payload, dict) and echoed_payload.get("source") == "SpaceX API":
            log_step("Destination confirmed receipt of our payload")
        else:
            log_step("Warning: Destination response format unexpected")
    except Exception as e:
        handle_error("sending data", e)

    # Final status
    total_time = time() - start_time
    log_step(f"Pipeline completed successfully in {total_time:.2f} seconds")

except Exception as e:
    log_step("Pipeline failed with error: " + str(e))
    # Bare raise re-raises the active exception with its traceback intact.
    raise