## PROMPT
Generate a pipeline for databricks that takes data from an API (https://api.spacexdata.com/v3/launches), performs a basic transformation task, and posts the result to another API (https://httpbin.org/post). Add prints for debugging the activities and display the execution time of the entire cycle

In [0]:
import requests
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start timing the execution
start_time = time.time()

# Step 1: Fetch data from the SpaceX API
def fetch_spacex_data():
    url = "https://api.spacexdata.com/v3/launches"
    try:
        print("Fetching data from SpaceX API...")
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad responses (4xx or 5xx)
        print("Data fetched successfully.")
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from SpaceX API: {e}")
        raise

# Step 2: Perform a basic transformation using PySpark
def transform_data(data):
    print("Transforming data...")
    # Convert the JSON data into a Spark DataFrame
    spark = SparkSession.builder.appName("SpaceXTransformation").getOrCreate()
    df = spark.createDataFrame(data)
    
    # Example transformation: Filter launches where the rocket name contains "Falcon"
    filtered_df = df.filter(col("rocket.rocket_name").like("%Falcon%"))
    
    # Select specific columns for simplicity
    transformed_df = filtered_df.select(
        col("flight_number"),
        col("mission_name"),
        col("launch_year"),
        col("rocket.rocket_name").alias("rocket_name")
    )
    
    print("Transformation completed.")
    return transformed_df.toPandas().to_dict(orient="records")  # Convert back to JSON for posting

# Step 3: Post the transformed data to the target API
def post_data_to_api(transformed_data):
    url = "https://httpbin.org/post"
    try:
        print("Posting transformed data to target API...")
        response = requests.post(url, json=transformed_data)
        response.raise_for_status()
        print("Data posted successfully.")
        print("Response from target API:", response.json())
    except requests.exceptions.RequestException as e:
        print(f"Error posting data to target API: {e}")
        raise

# Main pipeline execution
if __name__ == "__main__":
    try:
        # Fetch data from SpaceX API
        spacex_data = fetch_spacex_data()
        
        # Transform the fetched data
        transformed_data = transform_data(spacex_data)
        
        # Post the transformed data to the target API
        post_data_to_api(transformed_data)
        
    except Exception as e:
        print(f"Pipeline failed with error: {e}")
    
    # Calculate and display the total execution time
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Pipeline executed in {execution_time:.2f} seconds.")
    

after the error:
Fetching data from SpaceX API...
Data fetched successfully.
Transforming data...
Pipeline failed with error: [CANNOT_INFER_TYPE_FOR_FIELD] Unable to infer the type of the field `links`.
Pipeline executed in 0.92 seconds.

In [0]:
import requests
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start timing the execution
start_time = time.time()

# Step 1: Fetch data from the SpaceX API
def fetch_spacex_data():
    url = "https://api.spacexdata.com/v3/launches"
    try:
        print("Fetching data from SpaceX API...")
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad responses (4xx or 5xx)
        print("Data fetched successfully.")
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from SpaceX API: {e}")
        raise

# Step 2: Perform a basic transformation using PySpark
def transform_data(data):
    print("Transforming data...")
    # Convert the JSON data into a Spark DataFrame with a simplified schema
    spark = SparkSession.builder.appName("SpaceXTransformation").getOrCreate()
    
    # Flatten the nested structure by selecting specific fields
    flattened_data = [
        {
            "flight_number": item["flight_number"],
            "mission_name": item["mission_name"],
            "launch_year": item["launch_year"],
            "rocket_name": item["rocket"]["rocket_name"],
            "video_link": item["links"].get("video_link", None)  # Handle nested links field
        }
        for item in data
    ]
    
    df = spark.createDataFrame(flattened_data)
    
    # Example transformation: Filter launches where the rocket name contains "Falcon"
    filtered_df = df.filter(col("rocket_name").like("%Falcon%"))
    
    print("Transformation completed.")
    return filtered_df.toPandas().to_dict(orient="records")  # Convert back to JSON for posting

# Step 3: Post the transformed data to the target API
def post_data_to_api(transformed_data):
    url = "https://httpbin.org/post"
    try:
        print("Posting transformed data to target API...")
        response = requests.post(url, json=transformed_data)
        response.raise_for_status()
        print("Data posted successfully.")
        print("Response from target API:", response.json())
    except requests.exceptions.RequestException as e:
        print(f"Error posting data to target API: {e}")
        raise

# Main pipeline execution
if __name__ == "__main__":
    try:
        # Fetch data from SpaceX API
        spacex_data = fetch_spacex_data()
        
        # Transform the fetched data
        transformed_data = transform_data(spacex_data)
        
        # Post the transformed data to the target API
        post_data_to_api(transformed_data)
        
    except Exception as e:
        print(f"Pipeline failed with error: {e}")
    
    # Calculate and display the total execution time
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Pipeline executed in {execution_time:.2f} seconds.")