## PROMPT
Generate a pipeline for Databricks
* Get Data information from two web addresses:
   * Space launches: https://api.spacexdata.com/v3/launches
   * All rockets: https://api.spacexdata.com/v3/rockets
* Do a join operation between the launches and the rockets
* Send the chosen information to the web address: https://httpbin.org/post
* The script must provide status updates on its progress, report any errors encountered, confirm the outcome of the final data sending step, and measure/report execution times.

In [0]:
import requests
import json
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

def log_step(message):
    """Helper function to log pipeline steps with timestamps"""
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] {message}")

def measure_time(operation_name, func, *args, **kwargs):
    """Measure and log execution time of a function"""
    start_time = time.time()
    log_step(f"Starting: {operation_name}")
    
    try:
        result = func(*args, **kwargs)
        elapsed_time = time.time() - start_time
        log_step(f"Completed: {operation_name} in {elapsed_time:.2f} seconds")
        return result
    except Exception as e:
        elapsed_time = time.time() - start_time
        log_step(f"Failed: {operation_name} after {elapsed_time:.2f} seconds. Error: {str(e)}")
        raise

def fetch_data(url):
    """Fetch data from a URL and return as JSON"""
    log_step(f"Fetching data from {url}")
    response = requests.get(url)
    response.raise_for_status()  # Raises exception for 4XX/5XX errors
    return response.json()

def send_data(url, data):
    """Send data to a URL and return response"""
    log_step(f"Sending data to {url}")
    headers = {'Content-Type': 'application/json'}
    response = requests.post(url, data=json.dumps(data), headers=headers)
    response.raise_for_status()
    return response

def main():
    try:
        # Initialize Spark session
        spark = SparkSession.builder \
            .appName("SpaceXDataPipeline") \
            .getOrCreate()
        
        log_step("Pipeline started")
        
        # 1. Fetch data from APIs
        launches_data = measure_time(
            "Fetch SpaceX launches data",
            fetch_data,
            "https://api.spacexdata.com/v3/launches"
        )
        
        rockets_data = measure_time(
            "Fetch SpaceX rockets data",
            fetch_data,
            "https://api.spacexdata.com/v3/rockets"
        )
        
        # 2. Create DataFrames
        launches_df = measure_time(
            "Create launches DataFrame",
            spark.createDataFrame,
            launches_data
        )
        
        rockets_df = measure_time(
            "Create rockets DataFrame",
            spark.createDataFrame,
            rockets_data
        )
        
        # 3. Join DataFrames
        joined_df = measure_time(
            "Join launches and rockets DataFrames",
            lambda: launches_df.join(
                rockets_df,
                launches_df.rocket.rocket_id == rockets_df.rocket_id,
                "left"
            ).select(
                col("flight_number"),
                col("mission_name"),
                col("launch_date_utc"),
                col("rocket_name"),
                col("success"),
                col("details"),
                rockets_df["cost_per_launch"],
                rockets_df["success_rate_pct"]
            )
        )
        
        # 4. Prepare data for sending
        selected_data = measure_time(
            "Collect and prepare data for sending",
            lambda: [row.asDict() for row in joined_df.collect()]
        )
        
        # 5. Send data to endpoint
        response = measure_time(
            "Send data to https://httpbin.org/post",
            send_data,
            "https://httpbin.org/post",
            selected_data
        )
        
        log_step(f"Data successfully sent. Response status: {response.status_code}")
        log_step(f"Response content: {response.text[:200]}...")  # Print first 200 chars
        
    except Exception as e:
        log_step(f"Pipeline failed with error: {str(e)}")
        raise
    finally:
        spark.stop()
        log_step("Pipeline completed (successfully or with errors)")

if __name__ == "__main__":
    main()

Take the output and give to the LLM to resolve


In [0]:
import requests
import json
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, DoubleType, MapType, ArrayType, TimestampType

def log_step(message):
    """Helper function to log pipeline steps with timestamps"""
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] {message}")

def measure_time(operation_name, func, *args, **kwargs):
    """Measure and log execution time of a function"""
    start_time = time.time()
    log_step(f"Starting: {operation_name}")
    
    try:
        result = func(*args, **kwargs)
        elapsed_time = time.time() - start_time
        log_step(f"Completed: {operation_name} in {elapsed_time:.2f} seconds")
        return result
    except Exception as e:
        elapsed_time = time.time() - start_time
        log_step(f"Failed: {operation_name} after {elapsed_time:.2f} seconds. Error: {str(e)}")
        raise

def fetch_data(url):
    """Fetch data from a URL and return as JSON"""
    log_step(f"Fetching data from {url}")
    response = requests.get(url)
    response.raise_for_status()
    return response.json()

def send_data(url, data):
    """Send data to a URL and return response"""
    log_step(f"Sending data to {url}")
    headers = {'Content-Type': 'application/json'}
    response = requests.post(url, data=json.dumps(data), headers=headers)
    response.raise_for_status()
    return response

def get_launches_schema():
    """Define schema for launches data"""
    return StructType([
        StructField("flight_number", IntegerType()),
        StructField("mission_name", StringType()),
        StructField("launch_date_utc", StringType()),
        StructField("rocket", StructType([
            StructField("rocket_id", StringType()),
            StructField("rocket_name", StringType()),
            StructField("rocket_type", StringType())
        ])),
        StructField("launch_success", BooleanType()),
        StructField("details", StringType()),
        StructField("links", MapType(StringType(), StringType())),  # Handle complex links object
    ])

def get_rockets_schema():
    """Define schema for rockets data"""
    return StructType([
        StructField("rocket_id", StringType()),
        StructField("rocket_name", StringType()),
        StructField("cost_per_launch", IntegerType()),
        StructField("success_rate_pct", IntegerType()),
        StructField("first_flight", StringType()),
        StructField("country", StringType()),
        StructField("company", StringType()),
    ])

def main():
    try:
        # Initialize Spark session
        spark = SparkSession.builder \
            .appName("SpaceXDataPipeline") \
            .getOrCreate()
        
        log_step("Pipeline started")
        
        # 1. Fetch data from APIs
        launches_data = measure_time(
            "Fetch SpaceX launches data",
            fetch_data,
            "https://api.spacexdata.com/v3/launches"
        )
        
        rockets_data = measure_time(
            "Fetch SpaceX rockets data",
            fetch_data,
            "https://api.spacexdata.com/v3/rockets"
        )
        
        # 2. Create DataFrames with explicit schemas
        launches_df = measure_time(
            "Create launches DataFrame",
            lambda: spark.createDataFrame(
                data=launches_data,
                schema=get_launches_schema()
            )
        )
        
        rockets_df = measure_time(
            "Create rockets DataFrame",
            lambda: spark.createDataFrame(
                data=rockets_data,
                schema=get_rockets_schema()
            )
        )
        
        # 3. Join DataFrames
        joined_df = measure_time(
            "Join launches and rockets DataFrames",
            lambda: launches_df.join(
                rockets_df,
                launches_df.rocket.rocket_id == rockets_df.rocket_id,
                "left"
            ).select(
                col("flight_number"),
                col("mission_name"),
                col("launch_date_utc"),
                col("rocket_name"),
                col("launch_success").alias("success"),
                col("details"),
                col("cost_per_launch"),
                col("success_rate_pct")
            )
        )
        
        # 4. Prepare data for sending
        selected_data = measure_time(
            "Collect and prepare data for sending",
            lambda: [row.asDict() for row in joined_df.collect()]
        )
        
        # 5. Send data to endpoint
        response = measure_time(
            "Send data to https://httpbin.org/post",
            send_data,
            "https://httpbin.org/post",
            selected_data
        )
        
        log_step(f"Data successfully sent. Response status: {response.status_code}")
        log_step(f"Response content: {response.text[:200]}...")  # Print first 200 chars
        
    except Exception as e:
        log_step(f"Pipeline failed with error: {str(e)}")
        raise
    finally:
        spark.stop()
        log_step("Pipeline completed (successfully or with errors)")

if __name__ == "__main__":
    main()