## Test Extract Lineage

In [2]:
import requests
import json

from test_lib.help import read_spark_sql_source_code

# Call the local service's lineage-extraction endpoint with the Spark SQL
# source returned by read_spark_sql_source_code, then print the result.
url = 'http://127.0.0.1:5000/extract_data_lineage'

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# forever if the service stalls. The read timeout is generous because the
# endpoint is presumably backed by an LLM call -- tune as needed.
REQUEST_TIMEOUT = (5, 600)

# Request body: the Spark SQL source plus the (optional) LLM model to use.
SQL = read_spark_sql_source_code('./data/spark_sql_1.txt')
payload = {
    "SQL": SQL,
    "llm_model": 'qwen2.5-72b-instruct',
}

# The body is sent as a pre-serialized JSON string, so the content type has
# to be declared explicitly.
headers = {'Content-Type': 'application/json'}

# Send the POST request. Raises requests.exceptions.Timeout instead of
# hanging when the service does not answer within REQUEST_TIMEOUT.
response = requests.post(
    url,
    data=json.dumps(payload),
    headers=headers,
    timeout=REQUEST_TIMEOUT,
)

# Only a 200 response is parsed; anything else is reported verbatim.
if response.status_code == 200:
    result = response.json()
    print("Extracted Tables:", result['result'])
else:
    print("Failed to extract tables. Status code:", response.status_code)
    print("Response:", response.text)

Extracted Tables: 
{
  "spark_data_lineage": [
    {
      "target_result": "result_table1",
      "source_data": ["datas/agent.log"],
      "transformation": [
        "Read the log file 'datas/agent.log' and split each line into three columns (t1, t2, t3) to create an RDD of Rows.",
        "Filter the RDD to keep only rows where the value in column t2 is even, creating tableRDD1.",
        "Convert tableRDD1 into a DataFrame df1 with schema (t1: StringType, t2: IntegerType, t3: IntegerType).",
        "Register df1 as a temporary view 't'.",
        "Execute SQL query to select all columns from view 't' and add a new column 'tp' with a constant value 'r', creating result_table1."
      ]
    },
    {
      "target_result": "result_table2",
      "source_data": ["datas/agent.log"],
      "transformation": [
        "Read the log file 'datas/agent.log' and split each line into three columns (t1, t2, t3) to create an RDD of Rows.",
        "Filter the RDD to keep only rows where the va

## Compare the Difference

In [3]:
import requests
import json

# Call the local service's code-comparison endpoint with the "before" and
# "after" Spark SQL sources, then print the reported differences.
url = 'http://127.0.0.1:5000/compare_spark_code'

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# forever if the service stalls. The read timeout is generous because the
# endpoint is presumably backed by an LLM call -- tune as needed.
REQUEST_TIMEOUT = (5, 600)

# Load both versions of the code to compare.
code_before = read_spark_sql_source_code('./data/spark_sql_before_1.txt')
code_after = read_spark_sql_source_code('./data/spark_sql_after_1.txt')

# Request body: both code versions plus the (optional) LLM model to use.
payload = {
    "original_sql_code": code_before,
    "revised_sql_code": code_after,
    "llm_model": 'qwen2.5-72b-instruct',
}

# The body is sent as a pre-serialized JSON string, so the content type has
# to be declared explicitly.
headers = {'Content-Type': 'application/json'}

# Send the POST request. Raises requests.exceptions.Timeout instead of
# hanging when the service does not answer within REQUEST_TIMEOUT.
response = requests.post(
    url,
    data=json.dumps(payload),
    headers=headers,
    timeout=REQUEST_TIMEOUT,
)

# Only a 200 response is parsed; anything else is reported verbatim.
if response.status_code == 200:
    result = response.json()
    print("Comparison Result:", result['result'])
else:
    print("Failed to compare SQL lineages. Status code:", response.status_code)
    print("Response:", response.text)

Comparison Result: 
{
  "spark_data_lineage": [
    {
      "original_target_result": "result_table1",
      "original_source_data": ["datas/agent.log"],
      "revised_target_result": "result_table2",
      "revised_source_data": ["datas/agent.log"],
      "transformation_change": [
        "change 1: In the original code, the `tableRDD1` is created by filtering rows where `row.getInt(1) % 2 == 0`. In the revised code, this filter is commented out and `tableRDD2` is created by filtering rows where `row.getInt(2) % 2 == 0`.",
        "change 2: In the original code, `df1` is created from `tableRDD1` and registered as a temporary view `t`. In the revised code, `df1` is commented out and `df2` is created from `tableRDD2` and registered as a temporary view `r`.",
        "change 3: In the original code, the SQL query selects data from the temporary view `t`. In the revised code, the SQL query selects data from the temporary view `r`."
      ]
    }
  ]
}



In [9]:
import requests
import json

# Call the local service's table-listing endpoint with an inline Spark SQL
# snippet, then print the tables it reports.
url = 'http://127.0.0.1:5000/list_table_from_spark_code'

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# forever if the service stalls. The read timeout is generous because the
# endpoint is presumably backed by an LLM call -- tune as needed.
REQUEST_TIMEOUT = (5, 600)

# Sample input: creates one table from a filtered read of another and drops a third.
code = """
CREATE TABLE student_bucket
    USING parquet
    CLUSTERED BY (id) INTO 4 buckets (
    WITH tmpTable AS (
        SELECT * FROM student WHERE id > 100
    )
    SELECT * FROM tmpTable
);

DROP TABLE employeetable;
"""

# Request body: the SQL snippet plus the (optional) LLM model to use.
payload = {
    "spark_sql_code": code,
    "llm_model": 'qwen2.5-72b-instruct',
}

# The body is sent as a pre-serialized JSON string, so the content type has
# to be declared explicitly.
headers = {'Content-Type': 'application/json'}

# Send the POST request. Raises requests.exceptions.Timeout instead of
# hanging when the service does not answer within REQUEST_TIMEOUT.
response = requests.post(
    url,
    data=json.dumps(payload),
    headers=headers,
    timeout=REQUEST_TIMEOUT,
)

# Only a 200 response is parsed; anything else is reported verbatim.
# The messages name this cell's actual operation (listing tables); they were
# previously copied from the comparison cell.
if response.status_code == 200:
    result = response.json()
    print("Listed Tables:", result['result'])
else:
    print("Failed to list tables. Status code:", response.status_code)
    print("Response:", response.text)

Comparison Result: 
{
  "tables": {
    "row_unchanged_table": [],
    "row_changed_table": ["student"],
    "created_table": ["student_bucket"],
    "deleted_table": ["employeetable"]
  }
}

