## PROMPT
Generate a pipeline for Databricks:
* Get data from two web addresses (only one is listed below):
   * Space launches: https://api.spacexdata.com/v3/launches
* Filter the list of launches based on launch year and launch success status.
* Send the chosen information to the web address: https://httpbin.org/post
* The script must provide status updates on its progress, report any errors encountered, confirm the outcome of the final data sending step, and measure/report execution times.

In [0]:
# SpaceX Data Pipeline for Databricks
# This pipeline fetches SpaceX launch data, filters it, and posts the results to httpbin

import requests
import json
import time
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
import logging

# Configure logging: INFO-level records formatted as "<timestamp> - <level> - <message>".
# NOTE(review): logging.basicConfig() does nothing when the root logger already has
# handlers — confirm the status messages actually show up in the target runtime.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Named logger used by the pipeline functions below for status and error reporting.
logger = logging.getLogger("SpaceXPipeline")

# Initialize Spark session: reuse the active session if there is one, otherwise
# create a session whose application name is "SpaceXDataPipeline".
spark = SparkSession.builder.appName("SpaceXDataPipeline").getOrCreate()

def get_spacex_data():
    """Download the launch list from the SpaceX v3 API.

    Logs the start of the download, the number of records received and the
    elapsed time.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: Logged and re-raised when the
            request fails, times out (30 s) or returns an HTTP error status.
    """
    began_at = time.time()
    logger.info("Starting data collection from SpaceX API")

    launches_url = "https://api.spacexdata.com/v3/launches"
    try:
        response = requests.get(launches_url, timeout=30)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from SpaceX API: {str(e)}")
        raise

    # Nothing below can raise a RequestException, so it lives outside the try.
    logger.info(f"Successfully retrieved {len(data)} launch records from SpaceX API")
    elapsed_time = time.time() - began_at
    logger.info(f"Data collection completed in {elapsed_time:.2f} seconds")
    return data

def filter_launch_data(data, year=None, success=None):
    """Filter launch records by launch year and launch success status.

    Only a flat projection of each record (its position, ``launch_year`` and
    ``launch_success``) is handed to Spark, with an explicit schema. The raw
    launch records are deeply nested and heterogeneous, so letting Spark infer
    a schema from them is fragile; filtering on the projection avoids schema
    inference entirely and lets the original dicts be returned untouched
    (nested fields intact and JSON-serializable).

    Args:
        data: Iterable of launch dicts as returned by the SpaceX API.
        year: Launch year to keep (compared as a string); ``None`` disables
            the year filter.
        success: Launch success flag to keep; ``None`` disables the filter.

    Returns:
        list: The original launch dicts that match, in their input order.
    """
    start_time = time.time()
    logger.info(f"Filtering data with parameters - year: {year}, success: {success}")

    records = list(data)

    # Build (position, launch_year, launch_success) tuples; values are coerced
    # so every row satisfies the explicit schema below.
    key_rows = []
    for record_index, launch in enumerate(records):
        launch_year = launch.get("launch_year")
        launch_success = launch.get("launch_success")
        key_rows.append((
            record_index,
            None if launch_year is None else str(launch_year),
            launch_success if isinstance(launch_success, bool) else None,
        ))

    # An explicit DDL schema also makes an empty input valid (no inference needed).
    df = spark.createDataFrame(
        key_rows,
        schema="record_index int, launch_year string, launch_success boolean",
    )

    # Apply filters if specified
    if year is not None:
        df = df.filter(col("launch_year") == str(year))

    if success is not None:
        df = df.filter(col("launch_success") == success)

    # Map the surviving positions back to the original records; sorting restores
    # input order, which collect() does not guarantee.
    kept_indexes = sorted(row["record_index"] for row in df.select("record_index").collect())
    filtered_data = [records[record_index] for record_index in kept_indexes]

    elapsed_time = time.time() - start_time
    logger.info(f"Filtering completed in {elapsed_time:.2f} seconds")
    logger.info(f"Filtered data contains {len(filtered_data)} records")

    return filtered_data

def send_data_to_endpoint(data):
    """POST the filtered launches to httpbin wrapped in a metadata envelope.

    The envelope carries the source name, the local processing timestamp, the
    record count and the records themselves.

    Args:
        data: Filtered launch records to send.

    Returns:
        The decoded JSON body of the endpoint's response.

    Raises:
        requests.exceptions.RequestException: Logged and re-raised when the
            request fails, times out (30 s) or returns an HTTP error status.
    """
    began_at = time.time()
    logger.info("Sending filtered data to external endpoint")

    # Assemble the envelope up front; nothing here can raise a RequestException.
    processed_at = datetime.now().isoformat()
    payload = {
        "source": "SpaceX API",
        "processed_date": processed_at,
        "record_count": len(data),
        "data": data,
    }
    request_options = {
        "json": payload,
        "headers": {"Content-Type": "application/json"},
        "timeout": 30,
    }

    try:
        response = requests.post("https://httpbin.org/post", **request_options)
        response.raise_for_status()

        # Report timing and status before decoding the response body.
        elapsed_time = time.time() - began_at
        logger.info(f"Data successfully sent to endpoint in {elapsed_time:.2f} seconds")
        logger.info(f"Response status code: {response.status_code}")

        return response.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error sending data to endpoint: {str(e)}")
        raise

def run_pipeline(filter_year=2020, filter_success=True):
    """Run the fetch -> filter -> send pipeline and report its outcome.

    Args:
        filter_year: Launch year to keep; forwarded to ``filter_launch_data``
            as ``year``.
        filter_success: Launch success flag to keep; forwarded as ``success``.

    Returns:
        dict: On success: ``status`` ("success"), ``execution_time`` (seconds),
        ``records_processed``, ``data_sent`` (``True`` only when the send step
        ran and completed without error) and ``filter_criteria``. On failure:
        ``status`` ("failed"), ``execution_time`` and ``error`` (the exception
        message). Exceptions are logged and reported in the result rather than
        propagated.
    """
    overall_start_time = time.time()
    logger.info("Starting SpaceX data pipeline execution")

    try:
        # Step 1: Get data
        spacex_data = get_spacex_data()

        # Step 2: Filter data
        filtered_data = filter_launch_data(spacex_data, year=filter_year, success=filter_success)

        # Step 3: Send data to endpoint
        data_sent = False
        if filtered_data:
            # send_data_to_endpoint raises on transport or HTTP errors, so
            # reaching the next statement means the send step succeeded.
            send_data_to_endpoint(filtered_data)
            data_sent = True
            logger.info(f"Final send step confirmed: {len(filtered_data)} records delivered to endpoint")
            logger.info("Pipeline execution completed successfully")
        else:
            logger.warning("No data to send after filtering")

        # Calculate total execution time
        total_execution_time = time.time() - overall_start_time
        logger.info(f"Total pipeline execution time: {total_execution_time:.2f} seconds")

        return {
            "status": "success",
            "execution_time": total_execution_time,
            "records_processed": len(filtered_data),
            "data_sent": data_sent,
            "filter_criteria": {
                "year": filter_year,
                "success": filter_success
            }
        }

    except Exception as e:
        # Top-level boundary: keep the traceback in the log, return a summary.
        logger.error(f"Pipeline execution failed: {str(e)}", exc_info=True)

        # Calculate execution time even for failed runs
        total_execution_time = time.time() - overall_start_time
        logger.info(f"Failed pipeline execution time: {total_execution_time:.2f} seconds")

        return {
            "status": "failed",
            "execution_time": total_execution_time,
            "error": str(e)
        }

# Execute the pipeline (can be parameterized as needed).
# Runs only when this file is executed as the main module, not when imported.
if __name__ == "__main__":
    # Example: Run pipeline for successful launches from 2020
    result = run_pipeline(filter_year=2020, filter_success=True)
    # run_pipeline returns a summary dict in both the success and failure
    # cases; print it as indented JSON.
    print(json.dumps(result, indent=2))

1 error


In [0]:
# SpaceX Data Pipeline for Databricks
# This pipeline fetches SpaceX launch data, filters it, and posts the results to httpbin

import requests
import json
import time
from datetime import datetime
import logging
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_json, struct
from pyspark.sql.types import *

# Configure logging: root logger at INFO with "<timestamp> - <level> - <message>" lines.
# NOTE(review): basicConfig() has no effect if the root logger was already
# configured with handlers — verify log output is visible where this runs.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Pipeline-wide logger; the fetch, filter, send and run functions all log through it.
logger = logging.getLogger("SpaceXPipeline")

# Initialize Spark session (getOrCreate reuses an already-active session if present).
spark = SparkSession.builder.appName("SpaceXDataPipeline").getOrCreate()

def get_spacex_data():
    """Retrieve all launch records from the SpaceX v3 launches endpoint.

    Progress, record count and duration are written to the pipeline logger.

    Returns:
        The parsed JSON payload of the API response.

    Raises:
        requests.exceptions.RequestException: Logged, then re-raised, if the
            call fails, exceeds the 30 s timeout or yields an HTTP error.
    """
    t_start = time.time()
    logger.info("Starting data collection from SpaceX API")

    try:
        response = requests.get(
            "https://api.spacexdata.com/v3/launches",
            timeout=30,
        )
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from SpaceX API: {str(e)}")
        raise
    else:
        # Success path: report volume and timing, then hand the payload back.
        logger.info(f"Successfully retrieved {len(data)} launch records from SpaceX API")
        elapsed_time = time.time() - t_start
        logger.info(f"Data collection completed in {elapsed_time:.2f} seconds")
        return data

def extract_launch_details(data):
    """Project each launch record onto a flat set of scalar fields.

    Keeps ``flight_number``, ``mission_name``, ``launch_year``,
    ``launch_date_utc``, ``launch_success`` and ``details`` from the top level
    and lifts ``rocket.rocket_name`` out of the nested ``rocket`` object.
    Missing fields become ``None``.

    Args:
        data: Iterable of launch dicts.

    Returns:
        list[dict]: One flat dict per input record, in input order.
    """
    simplified_data = []

    for launch in data:
        # "rocket" may be absent, None, or not a mapping; a default on .get()
        # only covers the absent case, so check the type explicitly.
        rocket = launch.get("rocket")
        rocket_name = rocket.get("rocket_name") if isinstance(rocket, dict) else None

        simplified_data.append({
            "flight_number": launch.get("flight_number"),
            "mission_name": launch.get("mission_name"),
            "launch_year": launch.get("launch_year"),
            "launch_date_utc": launch.get("launch_date_utc"),
            "launch_success": launch.get("launch_success"),
            "rocket_name": rocket_name,
            "details": launch.get("details"),
            # Add any other fields you need
        })

    return simplified_data

def filter_launch_data(data, year=None, success=None):
    """Filter launch records by launch year and launch success status.

    Records are first flattened with ``extract_launch_details`` so Spark only
    sees scalar columns, then loaded via pandas into a Spark DataFrame,
    filtered, and converted back to a list of dicts.

    Args:
        data: Iterable of launch dicts as returned by the SpaceX API.
        year: Launch year to keep (compared as a string); ``None`` disables
            the year filter.
        success: Launch success flag to keep; ``None`` disables the filter.

    Returns:
        list[dict]: The flattened records that match; ``[]`` for empty input.

    Raises:
        Exception: Any error raised while flattening, converting or filtering
            is logged and re-raised unchanged.
    """
    start_time = time.time()
    logger.info(f"Filtering data with parameters - year: {year}, success: {success}")

    try:
        # Extract only the fields we need to avoid schema inference issues
        simplified_data = extract_launch_details(data)

        if not simplified_data:
            # An empty list yields a pandas frame with no columns: Spark then has
            # no schema to work with and the filter columns would not exist.
            filtered_data = []
        else:
            # Convert to pandas DataFrame first
            pandas_df = pd.DataFrame(simplified_data)

            # Convert pandas DataFrame to Spark DataFrame
            df = spark.createDataFrame(pandas_df)

            # Apply filters if specified
            if year is not None:
                df = df.filter(col("launch_year") == str(year))

            if success is not None:
                df = df.filter(col("launch_success") == success)

            # Convert back to Python objects
            filtered_data = df.toPandas().to_dict('records')

        elapsed_time = time.time() - start_time
        logger.info(f"Filtering completed in {elapsed_time:.2f} seconds")
        logger.info(f"Filtered data contains {len(filtered_data)} records")

        return filtered_data

    except Exception as e:
        logger.error(f"Error filtering data: {str(e)}")
        raise

def send_data_to_endpoint(data):
    """Deliver the filtered records to https://httpbin.org/post as JSON.

    The records are wrapped together with the source label, the local
    processing time and the record count.

    Args:
        data: Filtered launch records to deliver.

    Returns:
        The parsed JSON body returned by the endpoint.

    Raises:
        requests.exceptions.RequestException: Logged, then re-raised, if the
            call fails, exceeds the 30 s timeout or yields an HTTP error.
    """
    t_start = time.time()
    logger.info("Sending filtered data to external endpoint")

    endpoint_url = "https://httpbin.org/post"
    json_headers = {"Content-Type": "application/json"}

    try:
        # Wrap the records with provenance metadata.
        payload = {
            "source": "SpaceX API",
            "processed_date": datetime.now().isoformat(),
            "record_count": len(data),
            "data": data,
        }

        response = requests.post(endpoint_url, json=payload, headers=json_headers, timeout=30)
        response.raise_for_status()

        # Timing and status are logged before the response body is decoded.
        elapsed_time = time.time() - t_start
        logger.info(f"Data successfully sent to endpoint in {elapsed_time:.2f} seconds")
        logger.info(f"Response status code: {response.status_code}")
        return response.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error sending data to endpoint: {str(e)}")
        raise

def run_pipeline(filter_year=2020, filter_success=True):
    """Execute the complete pipeline: fetch, filter, send, and summarize.

    Args:
        filter_year: Launch year to keep; forwarded to ``filter_launch_data``
            as ``year``.
        filter_success: Launch success flag to keep; forwarded as ``success``.

    Returns:
        dict: On success: ``status`` ("success"), ``execution_time`` (seconds),
        ``records_processed``, ``data_sent`` (``True`` only when the send step
        ran and completed without error) and ``filter_criteria``. On failure:
        ``status`` ("failed"), ``execution_time`` and ``error`` (the exception
        message). Exceptions are logged and reported in the result rather than
        propagated.
    """
    overall_start_time = time.time()
    logger.info("Starting SpaceX data pipeline execution")

    try:
        # Step 1: Get data
        spacex_data = get_spacex_data()

        # Step 2: Filter data
        filtered_data = filter_launch_data(spacex_data, year=filter_year, success=filter_success)

        # Step 3: Send data to endpoint
        data_sent = False
        if filtered_data:
            # The send helper raises on transport or HTTP errors, so getting
            # past this call confirms the final step succeeded.
            send_data_to_endpoint(filtered_data)
            data_sent = True
            logger.info(f"Final send step confirmed: {len(filtered_data)} records delivered to endpoint")
            logger.info("Pipeline execution completed successfully")
        else:
            logger.warning("No data to send after filtering")

        # Calculate total execution time
        total_execution_time = time.time() - overall_start_time
        logger.info(f"Total pipeline execution time: {total_execution_time:.2f} seconds")

        return {
            "status": "success",
            "execution_time": total_execution_time,
            "records_processed": len(filtered_data),
            "data_sent": data_sent,
            "filter_criteria": {
                "year": filter_year,
                "success": filter_success
            }
        }

    except Exception as e:
        # Top-level boundary: record the traceback, then return a summary
        # instead of letting the exception escape.
        logger.error(f"Pipeline execution failed: {str(e)}", exc_info=True)

        # Calculate execution time even for failed runs
        total_execution_time = time.time() - overall_start_time
        logger.info(f"Failed pipeline execution time: {total_execution_time:.2f} seconds")

        return {
            "status": "failed",
            "execution_time": total_execution_time,
            "error": str(e)
        }

# Execute the pipeline (can be parameterized as needed).
# Guarded so the run happens only when this file is the main module.
if __name__ == "__main__":
    # Example: Run pipeline for successful launches from 2020
    result = run_pipeline(filter_year=2020, filter_success=True)
    # The summary dict (success or failure) is printed as indented JSON.
    print(json.dumps(result, indent=2))