## PROMPT
Generate a pipeline for databricks that takes data from an API (https://api.spacexdata.com/v3/launches), performs a basic transformation task, and posts the result to another API (https://httpbin.org/post). Add prints for debugging the activities and display the execution time of the entire cycle

In [0]:
import requests
import json
import time
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("SpaceX Pipeline").getOrCreate()

# Start timer
start_time = time.time()

# Fetch data from SpaceX API
print("Fetching data from SpaceX API...")
response = requests.get('https://api.spacexdata.com/v3/launches')
print("Data fetched successfully.")

# Parse JSON response
data = json.loads(response.text)

# Create a Spark DataFrame from the data
df = spark.createDataFrame(data)

# Perform basic transformation task (e.g., convert launch date to timestamp)
from pyspark.sql.functions import col
df_transformed = df.withColumn("launch_date", col("launch_date_unix").cast("timestamp"))

# Print transformed data for debugging
print("Transformed Data:")
df_transformed.show()

# Post transformed data to httpbin.org
print("Posting data to httpbin.org...")
response = requests.post('https://httpbin.org/post', json=df_transformed.toPandas().to_dict(orient='records'))
print("Data posted successfully.")

# Print response from httpbin.org for debugging
print("Response from httpbin.org:")
print(response.text)

# Stop timer and print execution time
end_time = time.time()
print("Execution time: {:.2f} seconds".format(end_time - start_time))

# Stop Spark Session
spark.stop()


I send to LLAMA the output cell above

In [0]:
import requests
import json
import time
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, DoubleType, ArrayType, MapType

# Initialize Spark Session
spark = SparkSession.builder.appName("SpaceX Pipeline").getOrCreate()

# Start timer
start_time = time.time()

# Fetch data from SpaceX API
print("Fetching data from SpaceX API...")
response = requests.get('https://api.spacexdata.com/v3/launches')
print("Data fetched successfully.")

# Parse JSON response
data = json.loads(response.text)

# Define custom schema
schema = StructType([
    StructField("flight_number", IntegerType(), True),
    StructField("mission_name", StringType(), True),
    StructField("mission_id", ArrayType(StringType()), True),
    StructField("launch_year", StringType(), True),
    StructField("launch_date_unix", LongType(), True),
    StructField("launch_date_utc", StringType(), True),
    StructField("launch_window", LongType(), True),
    StructField("rocket", StructType([
        StructField("rocket_id", StringType(), True),
        StructField("rocket_name", StringType(), True),
        StructField("rocket_type", StringType(), True),
        StructField("first_stage", StructType([
            StructField("cores", ArrayType(StructType([
                StructField("core_serial", StringType(), True),
                StructField("flight", IntegerType(), True),
                StructField("block", IntegerType(), True),
                StructField("gridfins", BooleanType(), True),
                StructField("legs", BooleanType(), True),
                StructField("reused", BooleanType(), True),
                StructField("land_success", BooleanType(), True),
                StructField("landing_intent", BooleanType(), True),
                StructField("landing_type", StringType(), True),
                StructField("recovery_attempt", BooleanType(), True),
                StructField("recovery_ship", StructType([
                    StructField("name", StringType(), True),
                    StructField("home_port", StringType(), True)
                ]), True)
            ])), True),
            StructField("cores", ArrayType(StructType([
                StructField("core_serial", StringType(), True),
                StructField("flight", IntegerType(), True),
                StructField("block", IntegerType(), True),
                StructField("gridfins", BooleanType(), True),
                StructField("legs", BooleanType(), True),
                StructField("reused", BooleanType(), True),
                StructField("land_success", BooleanType(), True),
                StructField("landing_intent", BooleanType(), True),
                StructField("landing_type", StringType(), True),
                StructField("recovery_attempt", BooleanType(), True),
                StructField("recovery_ship", StructType([
                    StructField("name", StringType(), True),
                    StructField("home_port", StringType(), True)
                ]), True)
            ])), True)
        ])), True),
        StructField("second_stage", StructType([
            StructField("block", IntegerType(), True),
            StructField("payloads", ArrayType(StructType([
                StructField("payload_id", StringType(), True),
                StructField("norad_id", ArrayType(StringType()), True),
                StructField("reused", BooleanType(), True),
                StructField("customers", ArrayType(StringType()), True),
                StructField("nationality", StringType(), True),
                StructField("launch_mass_kg", DoubleType(), True),
                StructField("launch_mass_lbs", DoubleType(), True),
                StructField("orbit", StringType(), True),
                StructField("reference_system", StringType(), True),
                StructField("regime", StringType(), True),
                StructField("longitude", DoubleType(), True),
                StructField("semi_major_axis_km", DoubleType(), True),
                StructField("eccentricity", DoubleType(), True),
                StructField("periapsis_km", DoubleType(), True),
                StructField("apoapsis_km", DoubleType(), True),
                StructField("inclination_deg", DoubleType(), True),
                StructField("period_min", DoubleType(), True),
                StructField("lifespan_years", DoubleType(), True),
                StructField("epoch", StringType(), True),
                StructField("mean_motion", DoubleType(), True),
                StructField("raan", DoubleType(), True),
                StructField("arg_of_pericenter", DoubleType(), True


i try again to put the output

In [0]:
import requests
import json
import time
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("SpaceX Pipeline").getOrCreate()

# Start timer
start_time = time.time()

# Fetch data from SpaceX API
print("Fetching data from SpaceX API...")
response = requests.get('https://api.spacexdata.com/v3/launches')
print("Data fetched successfully.")

# Parse JSON response
data = json.loads(response.text)

# Create a Spark DataFrame from the data
df = spark.createDataFrame(data)

# Print schema for debugging
print("Schema:")
df.printSchema()

# Perform basic transformation task (e.g., convert launch date to timestamp)
from pyspark.sql.functions import col
df_transformed = df.withColumn("launch_date", col("launch_date_unix").cast("timestamp"))

# Print transformed data for debugging
print("Transformed Data:")
df_transformed.show()

# Post transformed data to httpbin.org
print("Posting data to httpbin.org...")
response = requests.post('https://httpbin.org/post', json=df_transformed.toPandas().to_dict(orient='records'))
print("Data posted successfully.")

# Print response from httpbin.org for debugging
print("Response from httpbin.org:")
print(response.text)

# Stop timer and print execution time
end_time = time.time()
print("Execution time: {:.2f} seconds".format(end_time - start_time))

# Stop Spark Session
spark.stop()


I try again

In [0]:
import requests
import json
import time
import pandas as pd
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("SpaceX Pipeline").getOrCreate()

# Start timer
start_time = time.time()

# Fetch data from SpaceX API
print("Fetching data from SpaceX API...")
response = requests.get('https://api.spacexdata.com/v3/launches')
print("Data fetched successfully.")

# Parse JSON response
data = json.loads(response.text)

# Create a Pandas DataFrame from the data
pdf = pd.DataFrame(data)

# Create a PySpark DataFrame from the Pandas DataFrame
df = spark.createDataFrame(pdf)

# Print schema for debugging
print("Schema:")
df.printSchema()

# Perform basic transformation task (e.g., convert launch date to timestamp)
from pyspark.sql.functions import col
df_transformed = df.withColumn("launch_date", col("launch_date_unix").cast("timestamp"))

# Print transformed data for debugging
print("Transformed Data:")
df_transformed.show()

# Post transformed data to httpbin.org
print("Posting data to httpbin.org...")
response = requests.post('https://httpbin.org/post', json=df_transformed.toPandas().to_dict(orient='records'))
print("Data posted successfully.")

# Print response from httpbin.org for debugging
print("Response from httpbin.org:")
print(response.text)

# Stop timer and print execution time
end_time = time.time()
print("Execution time: {:.2f} seconds".format(end_time - start_time))

# Stop Spark Session
spark.stop()


Here the code try to transform but can't perform the posting

In [0]:
import requests
import json
import time
import pandas as pd
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("SpaceX Pipeline").getOrCreate()

# Start timer
start_time = time.time()

# Fetch data from SpaceX API
print("Fetching data from SpaceX API...")
response = requests.get('https://api.spacexdata.com/v3/launches')
print("Data fetched successfully.")

# Parse JSON response
data = json.loads(response.text)

# Create a Pandas DataFrame from the data
pdf = pd.DataFrame(data)

# Create a PySpark DataFrame from the Pandas DataFrame
df = spark.createDataFrame(pdf)

# Print schema for debugging
print("Schema:")
df.printSchema()

# Perform basic transformation task (e.g., convert launch date to timestamp)
from pyspark.sql.functions import col
df_transformed = df.withColumn("launch_date", col("launch_date_unix").cast("timestamp"))

# Print transformed data for debugging
print("Transformed Data:")
df_transformed.show()

# Post transformed data to httpbin.org
print("Posting data to httpbin.org...")
data_to_post = df_transformed.toPandas().to_dict(orient='records')
for item in data_to_post:
    for key, value in item.items():
        if isinstance(value, (list, tuple)):
            item[key] = list(value)
        elif isinstance(value, dict):
            for k, v in value.items():
                if isinstance(v, (list, tuple)):
                    value[k] = list(v)
response = requests.post('https://httpbin.org/post', json=data_to_post)
print("Data posted successfully.")

# Print response from httpbin.org for debugging
print("Response from httpbin.org:")
print(response.text)

# Stop timer and print execution time
end_time = time.time()
print("Execution time: {:.2f} seconds".format(end_time - start_time))

# Stop Spark Session
spark.stop()


TRY AND CAN'T RESOLVE THESE ISSUES