## PROMPT
Generate a pipeline for Databricks:
* Get data from the following web address:
   * Space launches: https://api.spacexdata.com/v3/launches
* Filter the list of launches based on launch year and launch success status.
* Send the chosen information to the web address: https://httpbin.org/post
* The script must provide status updates on its progress, report any errors encountered, confirm the outcome of the final data sending step, and measure/report execution times.



In [0]:
import requests
import json
import time
from datetime import datetime
from pyspark.sql.functions import col

def log_step(message):
    """Print *message* to stdout prefixed with the current local timestamp.

    The prefix has the form ``[YYYY-MM-DD HH:MM:SS]``.
    """
    print(f"[{datetime.now():%Y-%m-%d %H:%M:%S}] {message}")

def measure_time(func):
    """Decorator that logs the start, outcome and duration of *func*.

    A "Starting" line is logged before the call. On success a "Completed"
    line with the elapsed seconds is logged and the result is returned.
    If *func* raises, a "Failed" line with the elapsed seconds is logged
    and the exception is re-raised unchanged.
    """
    import functools  # local import so the decorator does not rely on other cells

    @functools.wraps(func)  # keep the wrapped function's __name__ and __doc__
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is not affected by
        # system clock adjustments.
        start_time = time.perf_counter()
        log_step(f"Starting: {func.__name__}")
        try:
            result = func(*args, **kwargs)
        except Exception:
            elapsed = time.perf_counter() - start_time
            log_step(f"Failed: {func.__name__} after {elapsed:.2f} seconds")
            raise
        elapsed = time.perf_counter() - start_time
        log_step(f"Completed: {func.__name__} in {elapsed:.2f} seconds")
        return result

    return wrapper

@measure_time
def fetch_spacex_launches(timeout=30):
    """Fetch SpaceX launches data from the v3 launches API.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        requests.exceptions.RequestException: On connection errors,
            timeouts, or 4XX/5XX responses.
        ValueError: If the response body is not valid JSON.
    """
    url = "https://api.spacexdata.com/v3/launches"
    log_step(f"Fetching data from {url}")

    try:
        # Without an explicit timeout the request can block indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises exception for 4XX/5XX errors
        data = response.json()
    except requests.exceptions.RequestException as e:
        log_step(f"Error fetching SpaceX data: {str(e)}")
        raise
    except ValueError as e:
        # Older requests versions raise a plain ValueError for invalid JSON.
        log_step(f"Error decoding SpaceX response: {str(e)}")
        raise

    log_step(f"Successfully fetched {len(data)} launches")
    return data

@measure_time
def filter_launches(launches, year=None, success_status=None):
    """Filter launches by year and success status.

    The launch records are already a local Python list, so they are
    filtered directly instead of being round-tripped through a Spark
    DataFrame. This keeps every record intact (including nested values)
    and JSON-serialisable for the sending step.

    Args:
        launches: List of launch dicts as returned by the API.
        year: Launch year to keep (compared as a string against the
            ``launch_year`` field), or None to skip the year filter.
        success_status: Required value of ``launch_success``, or None to
            skip the success filter.

    Returns:
        A list with the launch dicts that satisfy every active filter.
    """
    log_step(f"Filtering {len(launches)} launches (year: {year}, success: {success_status})")

    try:
        wanted_year = None if year is None else str(year)
        filtered_data = [
            launch
            for launch in launches
            if (wanted_year is None or launch.get("launch_year") == wanted_year)
            and (success_status is None or launch.get("launch_success") == success_status)
        ]
        log_step(f"Found {len(filtered_data)} matching launches after filtering")
        return filtered_data
    except Exception as e:
        log_step(f"Error filtering launches: {str(e)}")
        raise

@measure_time
def send_to_endpoint(data, endpoint_url, timeout=30):
    """Send filtered data to the specified endpoint as a JSON POST body.

    Args:
        data: JSON-serialisable records to send.
        endpoint_url: URL that receives the POST request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests`` response object of the successful POST.

    Raises:
        requests.exceptions.RequestException: On connection errors,
            timeouts, or 4XX/5XX responses.
    """
    log_step(f"Sending {len(data)} records to {endpoint_url}")

    try:
        headers = {'Content-Type': 'application/json'}
        # Without an explicit timeout the request can block indefinitely.
        response = requests.post(
            endpoint_url,
            data=json.dumps(data),
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        log_step(f"Error sending data to endpoint: {str(e)}")
        raise

    log_step(f"Successfully sent data. Response status: {response.status_code}")
    log_step(f"Response content: {response.text[:200]}...")  # Log first 200 chars of response
    return response

def main():
    """Run the fetch, filter and send steps and report the overall outcome.

    Any exception raised by a step is logged here and turned into a
    False return value. The total pipeline duration is logged in both
    the success and the failure case.

    Returns:
        True when every step finished without raising, False otherwise.
    """
    # Configuration parameters
    launch_year = 2020  # Set to None to disable year filter
    success_status = True  # Set to None to disable success filter
    endpoint_url = "https://httpbin.org/post"

    pipeline_start = time.perf_counter()
    try:
        log_step("Starting SpaceX Data Pipeline")

        # Step 1: Fetch data
        launches = fetch_spacex_launches()

        # Step 2: Filter data
        filtered_launches = filter_launches(launches, year=launch_year, success_status=success_status)

        # Step 3: Send data (the response is already logged by the step itself)
        send_to_endpoint(filtered_launches, endpoint_url)
    except Exception as e:
        log_step(f"Pipeline failed with error: {str(e)}")
        log_step(f"Total pipeline time: {time.perf_counter() - pipeline_start:.2f} seconds")
        return False

    log_step("Pipeline completed successfully!")
    log_step(f"Total pipeline time: {time.perf_counter() - pipeline_start:.2f} seconds")
    return True

# Execute the pipeline when this module is run as the main program.
# main() returns True on success and False on failure; the result is stored in
# pipeline_success but is not acted on any further here.
if __name__ == "__main__":
    pipeline_success = main()


1 error

In [0]:
import requests
import json
import time
from datetime import datetime
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, MapType
from pyspark.sql.functions import col

def log_step(message):
    """Write a pipeline log line to stdout.

    The line consists of the current local time in square brackets
    (``YYYY-MM-DD HH:MM:SS``) followed by *message*.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = "[{}] {}".format(stamp, message)
    print(line)

def measure_time(func):
    """Decorator that logs the start, outcome and duration of *func*.

    A "Starting" line is logged before the call. On success a "Completed"
    line with the elapsed seconds is logged and the result is returned.
    If *func* raises, a "Failed" line with the elapsed seconds is logged
    and the exception is re-raised unchanged.
    """
    import functools  # local import so the decorator does not rely on other cells

    @functools.wraps(func)  # keep the wrapped function's __name__ and __doc__
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is not affected by
        # system clock adjustments.
        start_time = time.perf_counter()
        log_step(f"Starting: {func.__name__}")
        try:
            result = func(*args, **kwargs)
        except Exception:
            elapsed = time.perf_counter() - start_time
            log_step(f"Failed: {func.__name__} after {elapsed:.2f} seconds")
            raise
        elapsed = time.perf_counter() - start_time
        log_step(f"Completed: {func.__name__} in {elapsed:.2f} seconds")
        return result

    return wrapper

@measure_time
def fetch_spacex_launches(timeout=30):
    """Fetch SpaceX launches data from the v3 launches API.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        requests.exceptions.RequestException: On connection errors,
            timeouts, or 4XX/5XX responses.
        ValueError: If the response body is not valid JSON.
    """
    url = "https://api.spacexdata.com/v3/launches"
    log_step(f"Fetching data from {url}")

    try:
        # Without an explicit timeout the request can block indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        log_step(f"Error fetching SpaceX data: {str(e)}")
        raise
    except ValueError as e:
        # Older requests versions raise a plain ValueError for invalid JSON.
        log_step(f"Error decoding SpaceX response: {str(e)}")
        raise

    log_step(f"Successfully fetched {len(data)} launches")
    return data

def create_launches_schema():
    """Build the Spark schema used for the SpaceX launch records.

    Every column is nullable. ``rocket`` and ``links`` are declared as
    string-to-string maps; the remaining columns are scalar.
    """
    columns = [
        ("flight_number", IntegerType()),
        ("mission_name", StringType()),
        ("launch_year", StringType()),
        ("launch_date_utc", StringType()),
        ("launch_success", BooleanType()),
        ("rocket", MapType(StringType(), StringType())),
        ("links", MapType(StringType(), StringType())),
        ("details", StringType()),
    ]
    return StructType([StructField(name, dtype, True) for name, dtype in columns])

@measure_time
def filter_launches(launches, year=None, success_status=None):
    """Filter launches by year and success status using a Spark DataFrame.

    Each launch is serialised to a JSON string, loaded into a single-column
    DataFrame and parsed with the explicit launches schema, so only the
    schema's columns are kept.

    Args:
        launches: List of launch dicts as returned by the API.
        year: Launch year to keep (compared as a string against the
            ``launch_year`` column), or None to skip the year filter.
        success_status: Required value of ``launch_success``, or None to
            skip the success filter.

    Returns:
        A list of dicts (one per matching row) limited to the schema columns.
    """
    from pyspark.sql.functions import from_json  # only needed by this step

    log_step(f"Filtering {len(launches)} launches (year: {year}, success: {success_status})")

    try:
        # Create DataFrame with explicit schema.
        # NOTE(review): this avoids spark.sparkContext / RDDs, which are
        # reportedly unavailable on Spark Connect based compute (e.g. Databricks
        # shared or serverless clusters) - confirm for the target cluster.
        schema = create_launches_schema()
        json_rows = [(json.dumps(launch),) for launch in launches]
        raw_df = spark.createDataFrame(json_rows, "value string")
        df = raw_df.select(from_json(col("value"), schema).alias("launch")).select("launch.*")

        # Apply filters if provided
        if year is not None:
            df = df.filter(col("launch_year") == str(year))
        if success_status is not None:
            df = df.filter(col("launch_success") == success_status)

        filtered_data = [row.asDict() for row in df.collect()]
        log_step(f"Found {len(filtered_data)} matching launches after filtering")
        return filtered_data
    except Exception as e:
        log_step(f"Error filtering launches: {str(e)}")
        raise

@measure_time
def send_to_endpoint(data, endpoint_url, timeout=30):
    """Send filtered data to the specified endpoint as a JSON POST body.

    Args:
        data: JSON-serialisable records to send.
        endpoint_url: URL that receives the POST request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests`` response object of the successful POST.

    Raises:
        requests.exceptions.RequestException: On connection errors,
            timeouts, or 4XX/5XX responses.
    """
    log_step(f"Sending {len(data)} records to {endpoint_url}")

    try:
        headers = {'Content-Type': 'application/json'}
        # Without an explicit timeout the request can block indefinitely.
        response = requests.post(
            endpoint_url,
            data=json.dumps(data),
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        log_step(f"Error sending data to endpoint: {str(e)}")
        raise

    log_step(f"Successfully sent data. Response status: {response.status_code}")
    log_step(f"Response content: {response.text[:200]}...")
    return response

def main():
    """Run the fetch, filter and send steps and report the overall outcome.

    Any exception raised by a step is logged here and turned into a
    False return value. The total pipeline duration is logged in both
    the success and the failure case.

    Returns:
        True when every step finished without raising, False otherwise.
    """
    # Configuration parameters
    launch_year = 2020  # Set to None to disable year filter
    success_status = True  # Set to None to disable success filter
    endpoint_url = "https://httpbin.org/post"

    pipeline_start = time.perf_counter()
    try:
        log_step("Starting SpaceX Data Pipeline")

        # Step 1: Fetch data
        launches = fetch_spacex_launches()

        # Step 2: Filter data
        filtered_launches = filter_launches(launches, year=launch_year, success_status=success_status)

        # Step 3: Send data (the response is already logged by the step itself)
        send_to_endpoint(filtered_launches, endpoint_url)
    except Exception as e:
        log_step(f"Pipeline failed with error: {str(e)}")
        log_step(f"Total pipeline time: {time.perf_counter() - pipeline_start:.2f} seconds")
        return False

    log_step("Pipeline completed successfully!")
    log_step(f"Total pipeline time: {time.perf_counter() - pipeline_start:.2f} seconds")
    return True

# Execute the pipeline when this module is run as the main program.
# main() returns True on success and False on failure; the result is stored in
# pipeline_success but is not acted on any further here.
if __name__ == "__main__":
    pipeline_success = main()

2 iteration


In [0]:
import requests
import json
import time
from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, MapType

def log_step(message):
    """Emit *message* on stdout behind a ``[YYYY-MM-DD HH:MM:SS]`` prefix.

    The timestamp is the current local time at the moment of the call.
    """
    now = datetime.now()
    prefix = f"[{now.strftime('%Y-%m-%d %H:%M:%S')}]"
    print(f"{prefix} {message}")

def measure_time(func):
    """Decorator that logs the start, outcome and duration of *func*.

    A "Starting" line is logged before the call. On success a "Completed"
    line with the elapsed seconds is logged and the result is returned.
    If *func* raises, a "Failed" line with the elapsed seconds is logged
    and the exception is re-raised unchanged.
    """
    import functools  # local import so the decorator does not rely on other cells

    @functools.wraps(func)  # keep the wrapped function's __name__ and __doc__
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is not affected by
        # system clock adjustments.
        start_time = time.perf_counter()
        log_step(f"Starting: {func.__name__}")
        try:
            result = func(*args, **kwargs)
        except Exception:
            elapsed = time.perf_counter() - start_time
            log_step(f"Failed: {func.__name__} after {elapsed:.2f} seconds")
            raise
        elapsed = time.perf_counter() - start_time
        log_step(f"Completed: {func.__name__} in {elapsed:.2f} seconds")
        return result

    return wrapper

@measure_time
def fetch_spacex_launches(timeout=30):
    """Fetch SpaceX launches data from the v3 launches API.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        requests.exceptions.RequestException: On connection errors,
            timeouts, or 4XX/5XX responses.
        ValueError: If the response body is not valid JSON.
    """
    url = "https://api.spacexdata.com/v3/launches"
    log_step(f"Fetching data from {url}")

    try:
        # Without an explicit timeout the request can block indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        log_step(f"Error fetching SpaceX data: {str(e)}")
        raise
    except ValueError as e:
        # Older requests versions raise a plain ValueError for invalid JSON.
        log_step(f"Error decoding SpaceX response: {str(e)}")
        raise

    log_step(f"Successfully fetched {len(data)} launches")
    return data

def create_launches_schema():
    """Return the explicit Spark schema for the SpaceX launch records.

    All eight columns are nullable; ``rocket`` and ``links`` are maps from
    string keys to string values.
    """
    schema = StructType()
    schema.add("flight_number", IntegerType(), True)
    schema.add("mission_name", StringType(), True)
    schema.add("launch_year", StringType(), True)
    schema.add("launch_date_utc", StringType(), True)
    schema.add("launch_success", BooleanType(), True)
    schema.add("rocket", MapType(StringType(), StringType()), True)
    schema.add("links", MapType(StringType(), StringType()), True)
    schema.add("details", StringType(), True)
    return schema

@measure_time
def filter_launches(launches, year=None, success_status=None):
    """Select the launches matching the requested year and success status.

    The records are loaded into a Spark DataFrame with the explicit
    launches schema, filtered there, and collected back as plain dicts.

    Args:
        launches: List of launch dicts as returned by the API.
        year: Launch year to keep (compared as a string against the
            ``launch_year`` column), or None to skip the year filter.
        success_status: Required value of ``launch_success``, or None to
            skip the success filter.

    Returns:
        A list of dicts, one per row that passed every active filter.
    """
    log_step(f"Filtering {len(launches)} launches (year: {year}, success: {success_status})")

    try:
        # Build the DataFrame directly from the local list (no sparkContext).
        launches_df = spark.createDataFrame(launches, schema=create_launches_schema())

        # Each filter is optional and only applied when a value was given.
        if year is not None:
            launches_df = launches_df.where(col("launch_year") == str(year))
        if success_status is not None:
            launches_df = launches_df.where(col("launch_success") == success_status)

        matches = []
        for record in launches_df.collect():
            matches.append(record.asDict())

        log_step(f"Found {len(matches)} matching launches after filtering")
        return matches
    except Exception as e:
        log_step(f"Error filtering launches: {str(e)}")
        raise

@measure_time
def send_to_endpoint(data, endpoint_url, timeout=30):
    """Send filtered data to the specified endpoint as a JSON POST body.

    Args:
        data: JSON-serialisable records to send.
        endpoint_url: URL that receives the POST request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests`` response object of the successful POST.

    Raises:
        requests.exceptions.RequestException: On connection errors,
            timeouts, or 4XX/5XX responses.
    """
    log_step(f"Sending {len(data)} records to {endpoint_url}")

    try:
        headers = {'Content-Type': 'application/json'}
        # Without an explicit timeout the request can block indefinitely.
        response = requests.post(
            endpoint_url,
            data=json.dumps(data),
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        log_step(f"Error sending data to endpoint: {str(e)}")
        raise

    log_step(f"Successfully sent data. Response status: {response.status_code}")
    log_step(f"Response content: {response.text[:200]}...")
    return response

def main():
    """Run the fetch, filter and send steps and report the overall outcome.

    Any exception raised by a step is logged here and turned into a
    False return value. The total pipeline duration is logged in both
    the success and the failure case.

    Returns:
        True when every step finished without raising, False otherwise.
    """
    # Configuration parameters
    launch_year = 2020  # Set to None to disable year filter
    success_status = True  # Set to None to disable success filter
    endpoint_url = "https://httpbin.org/post"

    pipeline_start = time.perf_counter()
    try:
        log_step("Starting SpaceX Data Pipeline")

        # Step 1: Fetch data
        launches = fetch_spacex_launches()

        # Step 2: Filter data
        filtered_launches = filter_launches(launches, year=launch_year, success_status=success_status)

        # Step 3: Send data (the response is already logged by the step itself)
        send_to_endpoint(filtered_launches, endpoint_url)
    except Exception as e:
        log_step(f"Pipeline failed with error: {str(e)}")
        log_step(f"Total pipeline time: {time.perf_counter() - pipeline_start:.2f} seconds")
        return False

    log_step("Pipeline completed successfully!")
    log_step(f"Total pipeline time: {time.perf_counter() - pipeline_start:.2f} seconds")
    return True

# Execute the pipeline when this module is run as the main program.
# main() returns True on success and False on failure; the result is stored in
# pipeline_success but is not acted on any further here.
if __name__ == "__main__":
    pipeline_success = main()