## PROMPT
Generate a pipeline for Databricks
* Get data from two web addresses:
   * Space launches: https://api.spacexdata.com/v3/launches
   * All rockets: https://api.spacexdata.com/v3/rockets
* Do a join operation between the launches and the rockets
* Send the chosen information to the web address: https://httpbin.org/post
* The script must provide status updates on its progress, report any errors encountered, confirm the outcome of the final data sending step, and measure/report execution times.

In [0]:
# Databricks SpaceX Data Pipeline
# This pipeline fetches SpaceX launch and rocket data, joins them, and sends the result to an endpoint

import requests
import json
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, explode
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, BooleanType, TimestampType, DoubleType

# Obtain the Spark session used by the pipeline (an existing one is reused
# if available, otherwise a new one is created).
spark = (
    SparkSession.builder
    .appName("SpaceX Data Pipeline")
    .getOrCreate()
)

def log_message(message):
    """Print ``message`` prefixed with the current local time in brackets."""
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] {message}")

def measure_execution_time(func):
    """Decorator that logs how long each successful call to ``func`` takes.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, logs the elapsed time via ``log_message`` and returns
        ``func``'s result unchanged. If ``func`` raises, the exception
        propagates and no timing line is logged.
    """
    # Imported locally so this block does not depend on a file-level import.
    from functools import wraps

    @wraps(func)  # keep the wrapped function's __name__ and __doc__
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval is not affected
        # by system clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time
        log_message(f"{func.__name__} completed in {execution_time:.2f} seconds")
        return result
    return wrapper

@measure_execution_time
def fetch_data(url, endpoint_name, timeout=30):
    """Fetch the JSON document served at ``url`` with an HTTP GET.

    Args:
        url: Address to request.
        endpoint_name: Label used only in the log messages.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The decoded JSON body as returned by ``response.json()``.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts or an HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    log_message(f"Fetching {endpoint_name} data from {url}")
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        data = response.json()
        log_message(f"Successfully fetched {len(data)} {endpoint_name} records")
        return data
    # ValueError covers JSON decode failures on requests versions whose
    # decode error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        log_message(f"Error fetching {endpoint_name} data: {str(e)}")
        raise

@measure_execution_time
def create_dataframe(data, schema_fields, data_type):
    """Build a Spark DataFrame from ``data`` using the given schema fields.

    Args:
        data: Records handed to ``spark.createDataFrame``.
        schema_fields: List of ``StructField`` objects describing the columns.
        data_type: Label used only in the log messages.

    Returns:
        The created DataFrame. Any exception is logged and re-raised.
    """
    log_message(f"Creating {data_type} DataFrame")
    try:
        frame = spark.createDataFrame(data, StructType(schema_fields))
        row_total = frame.count()
        column_total = len(frame.columns)
        log_message(f"Successfully created DataFrame with {row_total} rows and {column_total} columns")
    except Exception as error:
        log_message(f"Error creating {data_type} DataFrame: {error}")
        raise
    return frame

@measure_execution_time
def join_dataframes(launches_df, rockets_df):
    """Inner-join launches with rockets on their ``rocket_id`` columns.

    Args:
        launches_df: DataFrame exposing a ``rocket_id`` column.
        rockets_df: DataFrame exposing a ``rocket_id`` column.

    Returns:
        The joined DataFrame. Any exception is logged and re-raised.
    """
    log_message("Joining launches and rockets DataFrames")
    try:
        same_rocket = launches_df.rocket_id == rockets_df.rocket_id
        combined = launches_df.join(rockets_df, on=same_rocket, how="inner")
        log_message(f"Successfully joined DataFrames, resulting in {combined.count()} rows")
    except Exception as error:
        log_message(f"Error joining DataFrames: {error}")
        raise
    return combined

@measure_execution_time
def process_data(joined_df):
    """Project the joined DataFrame down to the columns that get reported.

    ``details`` is exposed as ``mission_details`` and ``description`` as
    ``rocket_description``; the other columns keep their names.

    Args:
        joined_df: The launches/rockets join result.

    Returns:
        The projected DataFrame. Any exception is logged and re-raised.
    """
    log_message("Processing and transforming joined data")
    try:
        # Columns to keep, in output order.
        selected_columns = [
            col("flight_number"),
            col("mission_name"),
            col("launch_year"),
            col("launch_success"),
            col("details").alias("mission_details"),
            col("rocket_name"),
            col("rocket_type"),
            col("cost_per_launch"),
            col("success_rate_pct"),
            col("description").alias("rocket_description"),
        ]
        result = joined_df.select(*selected_columns)

        log_message(f"Data processing complete. Result has {result.count()} rows")
    except Exception as error:
        log_message(f"Error processing data: {error}")
        raise
    return result

@measure_execution_time
def send_data_to_endpoint(processed_df, url, max_records=100, timeout=30):
    """POST rows of ``processed_df`` to ``url`` as a JSON array.

    Args:
        processed_df: DataFrame whose rows are sent.
        url: Address that receives the POST request.
        max_records: Upper bound on the number of rows sent. The limit is
            applied without an ordering, so which rows are chosen is up to
            Spark.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The ``requests`` response object of the POST.

    Raises:
        Exception: Any failure while collecting the rows or sending them
            (including an HTTP error status) is logged and re-raised.
    """
    log_message(f"Sending processed data to {url}")
    try:
        # Row.asDict() keeps SQL NULLs as None (JSON null). A pandas round-trip
        # can turn NULLs in numeric columns into NaN, which is not valid JSON.
        data_to_send = [row.asDict() for row in processed_df.limit(max_records).collect()]
        log_message(f"Prepared {len(data_to_send)} records for sending (limit: {max_records})")

        # Send data to the endpoint
        response = requests.post(
            url,
            json=data_to_send,
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
        response.raise_for_status()

        log_message(f"Successfully sent data to endpoint. Response status: {response.status_code}")
        log_message(f"Response body: {response.text[:200]}...")
        return response
    except Exception as e:
        log_message(f"Error sending data to endpoint: {str(e)}")
        raise

def main():
    """Run the pipeline: fetch, flatten, join, project, display and send.

    Launch and rocket records are fetched, reduced to flat dictionaries whose
    keys match the schemas below, turned into DataFrames, joined, projected
    and finally posted to the output endpoint. The total run time is logged on
    success.

    Raises:
        Exception: Any failure in a step is logged as "Pipeline failed" and
            re-raised.
    """
    pipeline_start_time = time.time()
    log_message("Starting SpaceX data pipeline")

    try:
        # Define API endpoints
        launches_url = "https://api.spacexdata.com/v3/launches"
        rockets_url = "https://api.spacexdata.com/v3/rockets"
        output_url = "https://httpbin.org/post"

        # Launches are flattened so that rocket_id / rocket_name / rocket_type
        # are top-level columns, which is what the join and the final
        # projection reference.
        launches_schema = [
            StructField("flight_number", IntegerType(), True),
            StructField("mission_name", StringType(), True),
            StructField("launch_year", StringType(), True),
            StructField("launch_date_utc", StringType(), True),
            StructField("launch_success", BooleanType(), True),
            StructField("details", StringType(), True),
            StructField("rocket_id", StringType(), True),
            StructField("rocket_name", StringType(), True),
            StructField("rocket_type", StringType(), True),
        ]

        # Rockets contribute only the join key and rocket-level attributes.
        # Name and type already come from the launches side; repeating them
        # here would make those column names ambiguous after the join.
        rockets_schema = [
            StructField("rocket_id", StringType(), True),
            StructField("active", BooleanType(), True),
            StructField("cost_per_launch", IntegerType(), True),
            StructField("success_rate_pct", IntegerType(), True),
            StructField("description", StringType(), True),
        ]

        # Fetch data
        launches_data = fetch_data(launches_url, "launches")
        rockets_data = fetch_data(rockets_url, "rockets")

        # Flatten the nested rocket information of every launch. A missing or
        # null "rocket" entry yields null rocket columns instead of an error.
        flat_launches = []
        for launch in launches_data:
            rocket_info = launch.get("rocket") or {}
            flat_launches.append({
                "flight_number": launch.get("flight_number"),
                "mission_name": launch.get("mission_name"),
                "launch_year": launch.get("launch_year"),
                "launch_date_utc": launch.get("launch_date_utc"),
                "launch_success": launch.get("launch_success"),
                "details": launch.get("details"),
                "rocket_id": rocket_info.get("rocket_id"),
                "rocket_name": rocket_info.get("rocket_name"),
                "rocket_type": rocket_info.get("rocket_type"),
            })

        # NOTE(review): the v3 rockets payload appears to carry the slug that
        # launches reference under "rocket_id" (with "id" being a numeric
        # index) - confirm against the API docs. "id" is used only as a
        # fallback when "rocket_id" is absent.
        flat_rockets = [
            {
                "rocket_id": rocket.get("rocket_id", rocket.get("id")),
                "active": rocket.get("active"),
                "cost_per_launch": rocket.get("cost_per_launch"),
                "success_rate_pct": rocket.get("success_rate_pct"),
                "description": rocket.get("description"),
            }
            for rocket in rockets_data
        ]

        # Create DataFrames
        launches_df = create_dataframe(flat_launches, launches_schema, "launches")
        rockets_df = create_dataframe(flat_rockets, rockets_schema, "rockets")

        # Join DataFrames
        joined_df = join_dataframes(launches_df, rockets_df)

        # Process data
        processed_df = process_data(joined_df)

        # Display sample data
        log_message("Sample of processed data:")
        processed_df.show(5, truncate=False)

        # Send data to endpoint (the outcome is logged by the callee)
        send_data_to_endpoint(processed_df, output_url)

        pipeline_end_time = time.time()
        total_execution_time = pipeline_end_time - pipeline_start_time
        log_message(f"Pipeline completed successfully in {total_execution_time:.2f} seconds")

    except Exception as e:
        log_message(f"Pipeline failed: {str(e)}")
        raise

# Run the pipeline only when this file is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()

Take the output and give it to the LLM to resolve.


In [0]:
# Databricks SpaceX Data Pipeline
# This pipeline fetches SpaceX launch and rocket data, joins them, and sends the result to an endpoint

import requests
import json
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, explode
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, BooleanType, TimestampType, DoubleType

# Obtain the Spark session used by the pipeline (an existing one is reused
# if available, otherwise a new one is created).
spark = (
    SparkSession.builder
    .appName("SpaceX Data Pipeline")
    .getOrCreate()
)

def log_message(message):
    """Print ``message`` prefixed with the current local time in brackets."""
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] {message}")

def measure_execution_time(func):
    """Decorator that logs how long each successful call to ``func`` takes.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, logs the elapsed time via ``log_message`` and returns
        ``func``'s result unchanged. If ``func`` raises, the exception
        propagates and no timing line is logged.
    """
    # Imported locally so this block does not depend on a file-level import.
    from functools import wraps

    @wraps(func)  # keep the wrapped function's __name__ and __doc__
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval is not affected
        # by system clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time
        log_message(f"{func.__name__} completed in {execution_time:.2f} seconds")
        return result
    return wrapper

@measure_execution_time
def fetch_data(url, endpoint_name, timeout=30):
    """Fetch the JSON document served at ``url`` with an HTTP GET.

    Args:
        url: Address to request.
        endpoint_name: Label used only in the log messages.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The decoded JSON body as returned by ``response.json()``.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts or an HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    log_message(f"Fetching {endpoint_name} data from {url}")
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        data = response.json()
        log_message(f"Successfully fetched {len(data)} {endpoint_name} records")
        return data
    # ValueError covers JSON decode failures on requests versions whose
    # decode error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        log_message(f"Error fetching {endpoint_name} data: {str(e)}")
        raise

@measure_execution_time
def process_launches_data(launches_data):
    """Flatten each launch record into a dict with top-level rocket fields.

    Args:
        launches_data: Iterable of launch dictionaries; each may contain a
            nested ``"rocket"`` dictionary.

    Returns:
        A list with one flat dictionary per launch, holding the keys
        ``flight_number``, ``mission_name``, ``launch_year``,
        ``launch_date_utc``, ``launch_success``, ``details``, ``rocket_id``,
        ``rocket_name`` and ``rocket_type``. Absent values are ``None``.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    log_message("Processing launches data")
    try:
        processed_launches = []
        for launch in launches_data:
            # `or {}` also covers a "rocket" key that is present but null;
            # .get("rocket", {}) would return None there and break the
            # nested lookups.
            rocket = launch.get("rocket") or {}
            processed_launches.append({
                "flight_number": launch.get("flight_number"),
                "mission_name": launch.get("mission_name"),
                "launch_year": launch.get("launch_year"),
                "launch_date_utc": launch.get("launch_date_utc"),
                "launch_success": launch.get("launch_success"),
                "details": launch.get("details"),
                "rocket_id": rocket.get("rocket_id"),
                "rocket_name": rocket.get("rocket_name"),
                "rocket_type": rocket.get("rocket_type"),
            })

        log_message(f"Successfully processed {len(processed_launches)} launch records")
        return processed_launches
    except Exception as e:
        log_message(f"Error processing launches data: {str(e)}")
        raise

@measure_execution_time
def create_dataframe(data, schema_fields, data_type):
    """Build a Spark DataFrame from ``data`` using the given schema fields.

    Args:
        data: Records handed to ``spark.createDataFrame``.
        schema_fields: List of ``StructField`` objects describing the columns.
        data_type: Label used only in the log messages.

    Returns:
        The created DataFrame. Any exception is logged and re-raised.
    """
    log_message(f"Creating {data_type} DataFrame")
    try:
        frame = spark.createDataFrame(data, StructType(schema_fields))
        row_total = frame.count()
        column_total = len(frame.columns)
        log_message(f"Successfully created DataFrame with {row_total} rows and {column_total} columns")
    except Exception as error:
        log_message(f"Error creating {data_type} DataFrame: {error}")
        raise
    return frame

@measure_execution_time
def join_dataframes(launches_df, rockets_df):
    """Inner-join launches with rockets, matching ``rocket_id`` against ``id``.

    Args:
        launches_df: DataFrame exposing a ``rocket_id`` column.
        rockets_df: DataFrame exposing an ``id`` column.

    Returns:
        The joined DataFrame. Any exception is logged and re-raised.
    """
    log_message("Joining launches and rockets DataFrames")
    try:
        same_rocket = launches_df.rocket_id == rockets_df.id
        combined = launches_df.join(rockets_df, on=same_rocket, how="inner")
        log_message(f"Successfully joined DataFrames, resulting in {combined.count()} rows")
    except Exception as error:
        log_message(f"Error joining DataFrames: {error}")
        raise
    return combined

@measure_execution_time
def process_data(joined_df):
    """Project the joined DataFrame down to the columns that get reported.

    ``details`` is exposed as ``mission_details`` and ``description`` as
    ``rocket_description``; the other columns keep their names.

    Args:
        joined_df: The launches/rockets join result.

    Returns:
        The projected DataFrame. Any exception is logged and re-raised.
    """
    log_message("Processing and transforming joined data")
    try:
        # Columns to keep, in output order.
        selected_columns = [
            col("flight_number"),
            col("mission_name"),
            col("launch_year"),
            col("launch_success"),
            col("details").alias("mission_details"),
            col("rocket_name"),
            col("rocket_type"),
            col("cost_per_launch"),
            col("success_rate_pct"),
            col("description").alias("rocket_description"),
        ]
        result = joined_df.select(*selected_columns)

        log_message(f"Data processing complete. Result has {result.count()} rows")
    except Exception as error:
        log_message(f"Error processing data: {error}")
        raise
    return result

@measure_execution_time
def send_data_to_endpoint(processed_df, url, max_records=100, timeout=30):
    """POST rows of ``processed_df`` to ``url`` as a JSON array.

    Args:
        processed_df: DataFrame whose rows are sent.
        url: Address that receives the POST request.
        max_records: Upper bound on the number of rows sent. The limit is
            applied without an ordering, so which rows are chosen is up to
            Spark.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The ``requests`` response object of the POST.

    Raises:
        Exception: Any failure while collecting the rows or sending them
            (including an HTTP error status) is logged and re-raised.
    """
    log_message(f"Sending processed data to {url}")
    try:
        # Row.asDict() keeps SQL NULLs as None (JSON null). A pandas round-trip
        # can turn NULLs in numeric columns into NaN, which is not valid JSON.
        data_to_send = [row.asDict() for row in processed_df.limit(max_records).collect()]
        log_message(f"Prepared {len(data_to_send)} records for sending (limit: {max_records})")

        # Send data to the endpoint
        response = requests.post(
            url,
            json=data_to_send,
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
        response.raise_for_status()

        log_message(f"Successfully sent data to endpoint. Response status: {response.status_code}")
        log_message(f"Response body: {response.text[:200]}...")
        return response
    except Exception as e:
        log_message(f"Error sending data to endpoint: {str(e)}")
        raise

def main():
    """Run the pipeline: fetch, flatten, join, project, display and send.

    Launch records are flattened by ``process_launches_data``; rocket records
    are reduced to the keys declared in the rockets schema. Both are turned
    into DataFrames, joined, projected and finally posted to the output
    endpoint. The total run time is logged on success.

    Raises:
        Exception: Any failure in a step is logged as "Pipeline failed" and
            re-raised.
    """
    pipeline_start_time = time.time()
    log_message("Starting SpaceX data pipeline")

    try:
        # Define API endpoints
        launches_url = "https://api.spacexdata.com/v3/launches"
        rockets_url = "https://api.spacexdata.com/v3/rockets"
        output_url = "https://httpbin.org/post"

        # Define schemas for launches and rockets
        launches_schema = [
            StructField("flight_number", IntegerType(), True),
            StructField("mission_name", StringType(), True),
            StructField("launch_year", StringType(), True),
            StructField("launch_date_utc", StringType(), True),
            StructField("launch_success", BooleanType(), True),
            StructField("details", StringType(), True),
            StructField("rocket_id", StringType(), True),
            StructField("rocket_name", StringType(), True),
            StructField("rocket_type", StringType(), True),
        ]

        rockets_schema = [
            StructField("id", StringType(), True),
            StructField("name", StringType(), True),
            StructField("type", StringType(), True),
            StructField("active", BooleanType(), True),
            StructField("cost_per_launch", IntegerType(), True),
            StructField("success_rate_pct", IntegerType(), True),
            StructField("description", StringType(), True),
        ]

        # Fetch data
        launches_data = fetch_data(launches_url, "launches")
        rockets_data = fetch_data(rockets_url, "rockets")

        # Process launches data to flatten nested rocket information
        processed_launches = process_launches_data(launches_data)

        # Map each rocket record onto the schema's id / name / type keys.
        # NOTE(review): the v3 rockets payload appears to carry the slug that
        # launches reference under "rocket_id" (with "id" being a numeric
        # index) plus "rocket_name" / "rocket_type" - confirm against the API
        # docs. The plain keys are used as a fallback when the "rocket_*"
        # keys are absent, so either payload shape is handled.
        processed_rockets = [
            {
                "id": rocket.get("rocket_id", rocket.get("id")),
                "name": rocket.get("rocket_name", rocket.get("name")),
                "type": rocket.get("rocket_type", rocket.get("type")),
                "active": rocket.get("active"),
                "cost_per_launch": rocket.get("cost_per_launch"),
                "success_rate_pct": rocket.get("success_rate_pct"),
                "description": rocket.get("description"),
            }
            for rocket in rockets_data
        ]

        # Create DataFrames
        launches_df = create_dataframe(processed_launches, launches_schema, "launches")
        rockets_df = create_dataframe(processed_rockets, rockets_schema, "rockets")

        # Join DataFrames
        joined_df = join_dataframes(launches_df, rockets_df)

        # Process data
        processed_df = process_data(joined_df)

        # Display sample data
        log_message("Sample of processed data:")
        processed_df.show(5, truncate=False)

        # Send data to endpoint (the outcome is logged by the callee)
        send_data_to_endpoint(processed_df, output_url)

        pipeline_end_time = time.time()
        total_execution_time = pipeline_end_time - pipeline_start_time
        log_message(f"Pipeline completed successfully in {total_execution_time:.2f} seconds")

    except Exception as e:
        log_message(f"Pipeline failed: {str(e)}")
        raise

# Run the pipeline only when this file is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()