## Test Extract Lineage

In [33]:
import requests
import json

from test_lib.help import read_spark_sql_source_code

# API endpoint of the lineage-extraction service (loopback address, port 5000)
url = 'http://127.0.0.1:5000/extract_data_lineage'

# Build the payload: the Spark SQL source code read from the sample file
SQL = read_spark_sql_source_code('./data/spark_sql_1.txt')
payload = {
    "SQL": SQL
}

# Optionally, you can specify the LLM model
payload['llm_model'] = 'qwen2.5-7b-instruct' # 'qwen2.5-72b-instruct'

# Declare the request body as JSON
headers = {'Content-Type': 'application/json'}

# Send the POST request. `json=` lets requests serialise the payload itself.
# The (connect, read) timeout keeps the cell from hanging forever if the
# service is down or never answers; raise the read value if the chosen model
# legitimately needs longer than 300 seconds.
response = requests.post(url, json=payload, headers=headers, timeout=(5, 300))

# Check the response status code
if response.status_code == 200:
    # Parse the JSON response
    result = response.json()
    print("Extracted Tables:", result['result'])
else:
    # Show the status code and raw body so the failure can be diagnosed
    print("Failed to extract tables. Status code:", response.status_code)
    print("Response:", response.text)

Extracted Tables: 
[
  {
    "target_result": "result_table1",
    "source_data": ["datas/agent.log"],
    "transformation": [
      "From 'datas/agent.log', filter rows where column 't2' is even to create tableRDD1.",
      "Create DataFrame df1 from tableRDD1 with schema (t1 StringType, t2 IntegerType, t3 IntegerType).",
      "Create temporary view 't' from df1.",
      "Select columns t1, t2, t3, and add a constant column 'tp' with value 'r' to generate result_table1."
    ]
  },
  {
    "target_result": "result_table2",
    "source_data": ["datas/agent.log"],
    "transformation": [
      "From 'datas/agent.log', filter rows where column 't3' is even to create tableRDD2.",
      "Create DataFrame df2 from tableRDD2 with schema (r1 StringType, r2 IntegerType, r3 IntegerType).",
      "Create temporary view 'r' from df2.",
      "Select columns r1, r2, r3, and add a constant column 'tp' with value 'r' to generate result_table2."
    ]
  }
]



## Compare the Difference

In [34]:
import requests
import json

# API endpoint of the code-comparison service (loopback address, port 5000)
url = 'http://127.0.0.1:5000/compare_spark_code'

# Build the payload: the original and revised Spark SQL source code
code_before = read_spark_sql_source_code('./data/spark_sql_before_1.txt')
code_after = read_spark_sql_source_code('./data/spark_sql_after_1.txt')

payload = {
    "original_sql_code": code_before,
    "revised_sql_code": code_after 
}

# Optionally, you can specify the LLM model
payload['llm_model'] = 'qwen2.5-7b-instruct' # 'qwen2.5-72b-instruct'

# Declare the request body as JSON
headers = {'Content-Type': 'application/json'}

# Send the POST request. `json=` lets requests serialise the payload itself.
# The (connect, read) timeout keeps the cell from hanging forever if the
# service is down or never answers; raise the read value if the chosen model
# legitimately needs longer than 300 seconds.
response = requests.post(url, json=payload, headers=headers, timeout=(5, 300))

# Check the response status code
if response.status_code == 200:
    # Parse the JSON response
    result = response.json()
    print("Comparison Result:", result['result'])
else:
    # Show the status code and raw body so the failure can be diagnosed
    print("Failed to compare SQL lineages. Status code:", response.status_code)
    print("Response:", response.text)

Comparison Result: 
{
  "spark_data_lineage": {
    "original_target_result": "t",
    "original_source_data": ["datas/agent.log"],
    "revised_target_result": "r",
    "revised_source_data": ["datas/agent.log"],
    "transformation_change": [
      "Change 1: In the original code, `tableRDD1` was created using `rowRDD.filter(row => row.getInt(1) % 2 == 0)`, but in the revised code, `tableRDD2` is created using `rowRDD.filter(row => row.getInt(2) % 2 == 0)`. This change alters the filtering condition based on the second column instead of the first.",
      "Change 2: The original code attempted to create a temporary view `t` with `df1`, but it was commented out. In the revised code, a new temporary view `r` is created with `df2` using the filtered `tableRDD2`.",
      "Change 3: The SQL query in the original code was `select t1 as t1, t2 as t2, t3 as t3, 'r' as tp from t`, targeting the temporary view `t`. In the revised code, the SQL query is `select r1 as t1, r2 as t2, r3 as t3, 'r'