## Test Extract Lineage

In [3]:
import requests
import json

from test_lib.help import read_spark_sql_source_code

# Define the API endpoint
url = 'http://127.0.0.1:6000/extract_data_lineage'

# Define the payload (input data)

SQL =  read_spark_sql_source_code('./data/spark_sql_1.txt')
payload = {
    "SQL": SQL
}

# Optionally, you can specify the LLM model
payload['llm_model'] = 'qwen2.5-72b-instruct' # 'qwen2.5-72b-instruct'

# Convert the payload to JSON format
headers = {'Content-Type': 'application/json'}

# Send the POST request
response = requests.post(url, data=json.dumps(payload), headers=headers)

# Check the response status code
if response.status_code == 200:
    # Parse the JSON response
    result = response.json()
    print("Extracted Tables:", result['result'])
else:
    print("Failed to extract tables. Status code:", response.status_code)
    print("Response:", response.text)

Extracted Tables: 
{
  "spark_data_lineage": [
    {
      "target_result": "result_table1",
      "source_data": ["datas/agent.log"],
      "transformation": [
        "Read data from 'datas/agent.log' and split each line into words, creating an RDD of Rows with columns (t1, t2, t3).",
        "Filter the RDD to include only rows where the value in column t2 is even.",
        "Create a DataFrame 'df1' from the filtered RDD with schema (t1: StringType, t2: IntegerType, t3: IntegerType).",
        "Register 'df1' as a temporary view 't'.",
        "Execute SQL query to select columns t1, t2, t3 from view 't' and add a constant column 'tp' with value 'r', resulting in DataFrame 'result_table1'."
      ]
    },
    {
      "target_result": "result_table2",
      "source_data": ["datas/agent.log"],
      "transformation": [
        "Read data from 'datas/agent.log' and split each line into words, creating an RDD of Rows with columns (t1, t2, t3).",
        "Filter the RDD to include only 

## Compare the Difference

In [None]:
import requests
import json

# Define the API endpoint
url = 'http://127.0.0.1:6000/compare_spark_code'

# Define the payload (input data)
code_before = read_spark_sql_source_code('./data/spark_sql_before_1.txt')
code_after = read_spark_sql_source_code('./data/spark_sql_after_1.txt')

payload = {
    "original_sql_code": code_before,
    "revised_sql_code": code_after 
}

# Optionally, you can specify the LLM model
payload['llm_model'] = 'qwen2.5-72b-instruct' # 'qwen2.5-72b-instruct'

# Convert the payload to JSON format
headers = {'Content-Type': 'application/json'}

# Send the POST request
response = requests.post(url, data=json.dumps(payload), headers=headers)

# Check the response status code
if response.status_code == 200:
    # Parse the JSON response
    result = response.json()
    print("Comparison Result:", result['result'])
else:
    print("Failed to compare SQL lineages. Status code:", response.status_code)
    print("Response:", response.text)

NameError: name 'read_spark_sql_source_code' is not defined